Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 65
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 14
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedTSV110.td | 6
-rw-r--r--  llvm/lib/Target/AArch64/AArch64StackTagging.cpp | 11
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 23
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 425
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h | 21
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 25
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 28
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/SIModeRegister.cpp | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 20
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 54
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP1Instructions.td | 79
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 54
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPCInstructions.td | 500
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 17
-rw-r--r--  llvm/lib/Target/ARM/ARMLegalizerInfo.cpp | 56
-rw-r--r--  llvm/lib/Target/DirectX/DXIL.td | 80
-rw-r--r--  llvm/lib/Target/Hexagon/CMakeLists.txt | 3
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp | 3
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp | 274
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 56
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonInstrInfo.h | 2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp | 689
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp | 32
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp | 324
-rw-r--r--  llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h | 12
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 30
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 257
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 8
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 166
-rw-r--r--  llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp | 31
-rw-r--r--  llvm/lib/Target/RISCV/RISCVScheduleV.td | 21
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.h | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 7
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 17
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 5
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVISelLowering.h | 3
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 9
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 36
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 16
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 16
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 11
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 10
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td | 4
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 19
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.h | 3
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 7
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp | 5
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp | 26
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp | 9
-rw-r--r--  llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp | 9
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.h | 7
69 files changed, 2683 insertions(+), 1064 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index c62582a..a99856d 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -712,7 +712,7 @@ bool AArch64Arm64ECCallLowering::processFunction(
// name (emitting the definition) can grab it from the metadata.
//
// FIXME: Handle functions with weak linkage?
- if (F.hasExternalLinkage() || F.hasWeakLinkage() || F.hasLinkOnceLinkage()) {
+ if (!F.hasLocalLinkage() || F.hasAddressTaken()) {
if (std::optional<std::string> MangledName =
getArm64ECMangledFunctionName(F.getName().str())) {
F.setMetadata("arm64ec_unmangled_name",
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 5b5ffd7..4fa719a 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1121,7 +1121,8 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() {
TS->emitDirectiveVariantPCS(CurrentFnSym);
}
- if (TM.getTargetTriple().isWindowsArm64EC()) {
+ if (TM.getTargetTriple().isWindowsArm64EC() &&
+ !MF->getFunction().hasLocalLinkage()) {
// For ARM64EC targets, a function definition's name is mangled differently
// from the normal symbol. We emit the alias from the unmangled symbol to
// mangled symbol name here.
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 3485edb..5cc612e 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -239,11 +239,6 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
cl::desc("enable use of redzone on AArch64"),
cl::init(false), cl::Hidden);
-static cl::opt<bool>
- ReverseCSRRestoreSeq("reverse-csr-restore-seq",
- cl::desc("reverse the CSR restore sequence"),
- cl::init(false), cl::Hidden);
-
static cl::opt<bool> StackTaggingMergeSetTag(
"stack-tagging-merge-settag",
cl::desc("merge settag instruction in function epilog"), cl::init(true),
@@ -307,8 +302,6 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
return false;
if (!EnableHomogeneousPrologEpilog)
return false;
- if (ReverseCSRRestoreSeq)
- return false;
if (EnableRedZone)
return false;
@@ -3117,7 +3110,27 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
- auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
+ if (homogeneousPrologEpilog(MF, &MBB)) {
+ auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
+ .setMIFlag(MachineInstr::FrameDestroy);
+ for (auto &RPI : RegPairs) {
+ MIB.addReg(RPI.Reg1, RegState::Define);
+ MIB.addReg(RPI.Reg2, RegState::Define);
+ }
+ return true;
+ }
+
+ // For performance reasons restore SVE register in increasing order
+ auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
+ auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
+ auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR);
+ std::reverse(PPRBegin, PPREnd);
+ auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
+ auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
+ auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR);
+ std::reverse(ZPRBegin, ZPREnd);
+
+ for (const RegPairInfo &RPI : RegPairs) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3191,42 +3204,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineMemOperand::MOLoad, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
-
- return MIB->getIterator();
- };
-
- // SVE objects are always restored in reverse order.
- for (const RegPairInfo &RPI : reverse(RegPairs))
- if (RPI.isScalable())
- EmitMI(RPI);
-
- if (homogeneousPrologEpilog(MF, &MBB)) {
- auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
- .setMIFlag(MachineInstr::FrameDestroy);
- for (auto &RPI : RegPairs) {
- MIB.addReg(RPI.Reg1, RegState::Define);
- MIB.addReg(RPI.Reg2, RegState::Define);
- }
- return true;
- }
-
- if (ReverseCSRRestoreSeq) {
- MachineBasicBlock::iterator First = MBB.end();
- for (const RegPairInfo &RPI : reverse(RegPairs)) {
- if (RPI.isScalable())
- continue;
- MachineBasicBlock::iterator It = EmitMI(RPI);
- if (First == MBB.end())
- First = It;
- }
- if (First != MBB.end())
- MBB.splice(MBBI, &MBB, First);
- } else {
- for (const RegPairInfo &RPI : RegPairs) {
- if (RPI.isScalable())
- continue;
- (void)EmitMI(RPI);
- }
}
return true;
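The restore loop above no longer goes through an EmitMI lambda or the removed reverse-csr-restore-seq option; instead it reverses just the PPR and ZPR sub-ranges of RegPairs so scalable registers are restored in increasing order, relying on same-type entries being contiguous. A minimal standalone sketch of that partition-reverse idiom (hypothetical Pair type, not LLVM code):

#include <algorithm>
#include <cstdio>
#include <vector>

struct Pair { int Type; int Reg; };              // stand-in for RegPairInfo

int main() {
  // Entries of the same type are contiguous, so reversing one sub-range
  // reorders only that type and leaves everything else untouched.
  std::vector<Pair> RegPairs = {{0, 19}, {1, 14}, {1, 13}, {1, 12}, {2, 8}};
  auto IsType1 = [](const Pair &P) { return P.Type == 1; };
  auto Begin = std::find_if(RegPairs.begin(), RegPairs.end(), IsType1);
  auto End = std::find_if_not(Begin, RegPairs.end(), IsType1);
  std::reverse(Begin, End);                      // type-1 now ascends: 12 13 14
  for (const Pair &P : RegPairs)
    std::printf("%d ", P.Reg);                   // prints: 19 12 13 14 8
  return 0;
}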
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 184ebc1..3b92e95 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -541,10 +541,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
- setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ if (Subtarget->hasFPARMv8())
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+ if (Subtarget->hasFPARMv8())
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
@@ -947,9 +949,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
- setOperationAction(ISD::BITCAST, MVT::i16, Custom);
- setOperationAction(ISD::BITCAST, MVT::f16, Custom);
- setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
+ if (Subtarget->hasFPARMv8()) {
+ setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
+ }
// Indexed loads and stores are supported.
for (unsigned im = (unsigned)ISD::PRE_INC;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 436b21f..bec1348 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1308,6 +1308,8 @@ private:
bool preferScalarizeSplat(SDNode *N) const override;
unsigned getMinimumJumpTableEntries() const override;
+
+ bool softPromoteHalfType() const override { return true; }
};
namespace AArch64 {
diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
index 0ae9a69..1c577a2 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
@@ -419,10 +419,10 @@ def : InstRW<[TSV110Wr_12cyc_1MDU], (instregex "^(S|U)DIVWr$")>;
def : InstRW<[TSV110Wr_20cyc_1MDU], (instregex "^(S|U)DIVXr$")>;
def TSV110ReadMAW : SchedReadAdvance<2, [TSV110Wr_3cyc_1MDU]>;
-def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>;
def TSV110ReadMAQ : SchedReadAdvance<3, [TSV110Wr_4cyc_1MDU]>;
-def : InstRW<[TSV110Wr_4cyc_1MDU, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>;
-def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+def : InstRW<[TSV110Wr_4cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>;
+def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>;
def : InstRW<[TSV110Wr_4cyc_1MDU], (instregex "^(S|U)MULHrr$")>;
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 961dded..ef7c517 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -21,7 +21,6 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/StackSafetyAnalysis.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -520,7 +519,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
for (auto &I : SInfo.AllocasToInstrument) {
memtag::AllocaInfo &Info = I.second;
assert(Info.AI && SIB.isInterestingAlloca(*Info.AI));
- TrackingVH<Instruction> OldAI = Info.AI;
memtag::alignAndPadAlloca(Info, kTagGranuleSize);
AllocaInst *AI = Info.AI;
int Tag = NextTag;
@@ -534,7 +532,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
ConstantInt::get(IRB.getInt64Ty(), Tag)});
if (Info.AI->hasName())
TagPCall->setName(Info.AI->getName() + ".tag");
- Info.AI->replaceAllUsesWith(TagPCall);
+ // Does not replace metadata, so we don't have to handle DPValues.
+ Info.AI->replaceNonMetadataUsesWith(TagPCall);
TagPCall->setOperand(0, Info.AI);
// Calls to functions that may return twice (e.g. setjmp) confuse the
@@ -574,12 +573,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
for (auto *II : Info.LifetimeEnd)
II->eraseFromParent();
}
-
- // Fixup debug intrinsics to point to the new alloca.
- for (auto *DVI : Info.DbgVariableIntrinsics)
- DVI->replaceVariableLocationOp(OldAI, Info.AI);
- for (auto *DPV : Info.DbgVariableRecords)
- DPV->replaceVariableLocationOp(OldAI, Info.AI);
}
// If we have instrumented at least one alloca, all unrecognized lifetime
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 6655931..010e569 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2972,6 +2972,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
Op2Info);
+ case ISD::FREM:
+ // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
+ // those functions are not declared in the module.
+ if (!Ty->isVectorTy())
+ return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info);
}
}
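The new FREM case treats a scalar frem as a libcall, because (per the comment) the backend emits fmod/fmodf even when the module never declares them. A rough, assumption-level illustration of what a scalar frem turns into, not the actual lowering code:

#include <cmath>

// A scalar 'frem' ends up as a libm call, so costing it as a call via
// getCallInstrCost is closer to reality than a cheap ALU-op cost.
double frem_like(double a, double b) { return std::fmod(a, b); }   // fmod
float frem_like(float a, float b) { return std::fmod(a, b); }      // fmodf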
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b9411e2..9218760 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -33,6 +33,12 @@ def rcp_sqrt_to_rsq : GICombineRule<
[{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
+def fdiv_by_sqrt_to_rsq_f16 : GICombineRule<
+ (defs root:$root),
+ (match (G_FSQRT f16:$sqrt, $x, (MIFlags FmContract)),
+ (G_FDIV f16:$dst, $y, $sqrt, (MIFlags FmContract)):$root,
+ [{ return matchFDivSqrtToRsqF16(*${root}); }]),
+ (apply [{ applyFDivSqrtToRsqF16(*${root}, ${x}.getReg()); }])>;
def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
@@ -156,7 +162,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
"AMDGPUPostLegalizerCombinerImpl",
[all_combines, gfx6gfx7_combines, gfx8_combines,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
- rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
+ rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 0d3b158..13d7510 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4824,9 +4824,8 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
return true;
}
-static const unsigned SPDenormModeBitField =
- AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
- (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+static constexpr unsigned SPDenormModeBitField =
+ AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index a1c34e9..82e17dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -83,6 +83,9 @@ public:
matchRcpSqrtToRsq(MachineInstr &MI,
std::function<void(MachineIRBuilder &)> &MatchInfo) const;
+ bool matchFDivSqrtToRsqF16(MachineInstr &MI) const;
+ void applyFDivSqrtToRsqF16(MachineInstr &MI, const Register &X) const;
+
// FIXME: Should be able to have 2 separate matchdatas rather than custom
// struct boilerplate.
struct CvtF32UByteMatchInfo {
@@ -334,6 +337,26 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
return false;
}
+bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
+ MachineInstr &MI) const {
+ Register Sqrt = MI.getOperand(2).getReg();
+ return MRI.hasOneNonDBGUse(Sqrt);
+}
+
+void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
+ MachineInstr &MI, const Register &X) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Y = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ uint32_t Flags = MI.getFlags();
+ Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
+ .addUse(X)
+ .setMIFlags(Flags)
+ .getReg(0);
+ B.buildFMul(Dst, RSQ, Y, Flags);
+ MI.eraseFromParent();
+}
+
bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
Register SrcReg = MI.getOperand(1).getReg();
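The combine pair above implements the algebraic rewrite y / sqrt(x) -> y * rsq(x) for f16 when the sqrt has a single non-debug use and both instructions carry the contract flag. A scalar C++ analogue of the algebra, purely illustrative; rsq here is a hypothetical stand-in for the hardware reciprocal square root:

#include <cmath>

static float rsq(float x) { return 1.0f / std::sqrt(x); }

// Before the combine: one divide plus one sqrt.
float slow(float y, float x) { return y / std::sqrt(x); }

// After the combine: one rsq plus one multiply, valid under the contraction
// assumption checked via MIFlags FmContract in the rule above.
float fast(float y, float x) { return y * rsq(x); }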
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 5b32b34..b7b471d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -7272,11 +7272,11 @@ ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
if (trySkipId("hwreg", AsmToken::LParen)) {
OperandInfoTy HwReg(OPR_ID_UNKNOWN);
- OperandInfoTy Offset(OFFSET_DEFAULT_);
- OperandInfoTy Width(WIDTH_DEFAULT_);
+ OperandInfoTy Offset(HwregOffset::Default);
+ OperandInfoTy Width(HwregSize::Default);
if (parseHwregBody(HwReg, Offset, Width) &&
validateHwreg(HwReg, Offset, Width)) {
- ImmVal = encodeHwreg(HwReg.Id, Offset.Id, Width.Id);
+ ImmVal = HwregEncoding::encode(HwReg.Id, Offset.Id, Width.Id);
} else {
return ParseStatus::Failure;
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 894607d..e1cca17 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -119,6 +119,12 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
return addOperand(Inst, DAsm->decodeSplitBarrier(Val));
}
+static DecodeStatus decodeDpp8FI(MCInst &Inst, unsigned Val, uint64_t Addr,
+ const MCDisassembler *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(Inst, DAsm->decodeDpp8FI(Val));
+}
+
#define DECODE_OPERAND(StaticDecoderName, DecoderName) \
static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm, \
uint64_t /*Addr*/, \
@@ -440,19 +446,6 @@ static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
return DecoderUInt128(Lo, Hi);
}
-// The disassembler is greedy, so we need to check FI operand value to
-// not parse a dpp if the correct literal is not set. For dpp16 the
-// autogenerated decoder checks the dpp literal
-static bool isValidDPP8(const MCInst &MI) {
- using namespace llvm::AMDGPU::DPP;
- int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi);
- assert(FiIdx != -1);
- if ((unsigned)FiIdx >= MI.getNumOperands())
- return false;
- unsigned Fi = MI.getOperand(FiIdx).getImm();
- return Fi == DPP8_FI_0 || Fi == DPP8_FI_1;
-}
-
DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes_,
uint64_t Address,
@@ -460,7 +453,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size());
Bytes = Bytes_.slice(0, MaxInstBytesNum);
- DecodeStatus Res = MCDisassembler::Fail;
+ // In case the opcode is not recognized we'll assume a Size of 4 bytes (unless
+ // there are fewer bytes left). This will be overridden on success.
+ Size = std::min((size_t)4, Bytes_.size());
+
do {
// ToDo: better to switch encoding length using some bit predicate
// but it is unknown yet, so try all we can
@@ -469,222 +465,147 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// encodings
if (isGFX11Plus() && Bytes.size() >= 12 ) {
DecoderUInt128 DecW = eat12Bytes(Bytes);
- Res =
- tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696,
- MI, DecW, Address, CS);
- if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
- break;
- MI = MCInst(); // clear
- Res =
- tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696,
- MI, DecW, Address, CS);
- if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
- break;
- MI = MCInst(); // clear
-
- const auto convertVOPDPP = [&]() {
- if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) {
- convertVOP3PDPPInst(MI);
- } else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) {
- convertVOPCDPPInst(MI); // Special VOP3 case
- } else {
- assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3);
- convertVOP3DPPInst(MI); // Regular VOP3 case
- }
- };
- Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696,
- MI, DecW, Address, CS);
- if (Res) {
- convertVOPDPP();
- break;
- }
- Res = tryDecodeInst(DecoderTableDPPGFX1296, DecoderTableDPPGFX12_FAKE1696,
- MI, DecW, Address, CS);
- if (Res) {
- convertVOPDPP();
- break;
- }
- Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS);
- if (Res)
+
+ if (tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
+ DecW, Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI,
+ DecW, Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS))
break;
}
+
// Reinitialize Bytes
Bytes = Bytes_.slice(0, MaxInstBytesNum);
if (Bytes.size() >= 8) {
const uint64_t QW = eatBytes<uint64_t>(Bytes);
- if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
- Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS);
- if (Res) {
- if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8)
- == -1)
- break;
- if (convertDPP8Inst(MI) == MCDisassembler::Success)
- break;
- MI = MCInst(); // clear
- }
- }
-
- Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS);
- if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
- break;
- MI = MCInst(); // clear
-
- Res = tryDecodeInst(DecoderTableDPP8GFX1164,
- DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS);
- if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
- break;
- MI = MCInst(); // clear
-
- Res = tryDecodeInst(DecoderTableDPP8GFX1264,
- DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS);
- if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
+ if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) &&
+ tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS))
break;
- MI = MCInst(); // clear
- Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS);
- if (Res) break;
-
- Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664,
- MI, QW, Address, CS);
- if (Res) {
- if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
- convertVOPCDPPInst(MI);
- break;
- }
-
- Res = tryDecodeInst(DecoderTableDPPGFX1264, DecoderTableDPPGFX12_FAKE1664,
- MI, QW, Address, CS);
- if (Res) {
- if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
- convertVOPCDPPInst(MI);
+ if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem) &&
+ tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS))
break;
- }
-
- if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) {
- Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS);
- if (Res)
- break;
- }
// Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
// v_mad_mixhi_f16 for FMA variants. Try to decode using this special
// table first so we print the correct name.
- if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts)) {
- Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS);
- if (Res)
- break;
- }
+ if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts) &&
+ tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS))
+ break;
- if (STI.hasFeature(AMDGPU::FeatureGFX940Insts)) {
- Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS);
- if (Res)
- break;
- }
+ if (STI.hasFeature(AMDGPU::FeatureGFX940Insts) &&
+ tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS))
+ break;
- if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
- Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS);
- if (Res)
- break;
- }
+ if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
+ tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS))
+ break;
- Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI,
- QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW,
+ Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI,
- QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
+ Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS))
break;
}
- // Reinitialize Bytes as DPP64 could have eaten too much
+ // Reinitialize Bytes
Bytes = Bytes_.slice(0, MaxInstBytesNum);
// Try decode 32-bit instruction
- if (Bytes.size() < 4) break;
- const uint32_t DW = eatBytes<uint32_t>(Bytes);
- Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS);
- if (Res) break;
+ if (Bytes.size() >= 4) {
+ const uint32_t DW = eatBytes<uint32_t>(Bytes);
- Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS);
- if (Res) break;
+ if (tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS))
+ break;
- Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS);
- if (Res) break;
+ if (tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS))
+ break;
- if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
- Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS))
break;
- }
- if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
- Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS);
- if (Res) break;
- }
+ if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
+ tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS))
+ break;
+
+ if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) &&
+ tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS))
+ break;
- Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS);
- if (Res) break;
+ if (tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS))
+ break;
- Res = tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
- Address, CS);
- if (Res) break;
+ if (tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
+ Address, CS))
+ break;
- Res = tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
- Address, CS);
+ if (tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
+ Address, CS))
+ break;
+ }
+
+ return MCDisassembler::Fail;
} while (false);
- if (Res && AMDGPU::isMAC(MI.getOpcode())) {
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DPP) {
+ if (isMacDPP(MI))
+ convertMacDPPInst(MI);
+
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
+ convertVOP3PDPPInst(MI);
+ else if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) ||
+ AMDGPU::isVOPC64DPP(MI.getOpcode()))
+ convertVOPCDPPInst(MI); // Special VOP3 case
+ else if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) !=
+ -1)
+ convertDPP8Inst(MI);
+ else if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3)
+ convertVOP3DPPInst(MI); // Regular VOP3 case
+ }
+
+ if (AMDGPU::isMAC(MI.getOpcode())) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
}
- if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
- MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) {
+ if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
+ MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
}
- if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
+ if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
!AMDGPU::hasGDS(STI)) {
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
}
- if (Res && (MCII->get(MI.getOpcode()).TSFlags &
- (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) {
+ if (MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD)) {
int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::cpol);
if (CPolPos != -1) {
@@ -700,9 +621,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- if (Res && (MCII->get(MI.getOpcode()).TSFlags &
- (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
- (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
+ if ((MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
+ (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
// GFX90A lost TFE, its place is occupied by ACC.
int TFEOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
@@ -713,8 +634,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- if (Res && (MCII->get(MI.getOpcode()).TSFlags &
- (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) {
+ if (MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) {
int SWZOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
if (SWZOpIdx != -1) {
@@ -724,7 +645,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG) {
int VAddr0Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
int RsrcIdx =
@@ -732,36 +653,32 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
if (VAddr0Idx >= 0 && NSAArgs > 0) {
unsigned NSAWords = (NSAArgs + 3) / 4;
- if (Bytes.size() < 4 * NSAWords) {
- Res = MCDisassembler::Fail;
- } else {
- for (unsigned i = 0; i < NSAArgs; ++i) {
- const unsigned VAddrIdx = VAddr0Idx + 1 + i;
- auto VAddrRCID =
- MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
- MI.insert(MI.begin() + VAddrIdx,
- createRegOperand(VAddrRCID, Bytes[i]));
- }
- Bytes = Bytes.slice(4 * NSAWords);
+ if (Bytes.size() < 4 * NSAWords)
+ return MCDisassembler::Fail;
+ for (unsigned i = 0; i < NSAArgs; ++i) {
+ const unsigned VAddrIdx = VAddr0Idx + 1 + i;
+ auto VAddrRCID =
+ MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
+ MI.insert(MI.begin() + VAddrIdx, createRegOperand(VAddrRCID, Bytes[i]));
}
+ Bytes = Bytes.slice(4 * NSAWords);
}
- if (Res)
- Res = convertMIMGInst(MI);
+ convertMIMGInst(MI);
}
- if (Res && (MCII->get(MI.getOpcode()).TSFlags &
- (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE)))
- Res = convertMIMGInst(MI);
+ if (MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE))
+ convertMIMGInst(MI);
- if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP))
- Res = convertEXPInst(MI);
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP)
+ convertEXPInst(MI);
- if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP))
- Res = convertVINTERPInst(MI);
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP)
+ convertVINTERPInst(MI);
- if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA))
- Res = convertSDWAInst(MI);
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA)
+ convertSDWAInst(MI);
int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdst_in);
@@ -782,27 +699,23 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
int ImmLitIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm);
bool IsSOPK = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SOPK;
- if (Res && ImmLitIdx != -1 && !IsSOPK)
- Res = convertFMAanyK(MI, ImmLitIdx);
+ if (ImmLitIdx != -1 && !IsSOPK)
+ convertFMAanyK(MI, ImmLitIdx);
- // if the opcode was not recognized we'll assume a Size of 4 bytes
- // (unless there are fewer bytes left)
- Size = Res ? (MaxInstBytesNum - Bytes.size())
- : std::min((size_t)4, Bytes_.size());
- return Res;
+ Size = MaxInstBytesNum - Bytes.size();
+ return MCDisassembler::Success;
}
-DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
if (STI.hasFeature(AMDGPU::FeatureGFX11Insts)) {
// The MCInst still has these fields even though they are no longer encoded
// in the GFX11 instruction.
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm);
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr);
}
- return MCDisassembler::Success;
}
-DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 ||
MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 ||
MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 ||
@@ -815,10 +728,9 @@ DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
// instruction.
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
}
- return MCDisassembler::Success;
}
-DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
STI.hasFeature(AMDGPU::FeatureGFX10)) {
if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst))
@@ -835,7 +747,6 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod);
}
}
- return MCDisassembler::Success;
}
struct VOPModifiers {
@@ -939,56 +850,40 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
AMDGPU::OpName::src2_modifiers);
}
-// We must check FI == literal to reject not genuine dpp8 insts, and we must
-// first add optional MI operands to check FI
-DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
+void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
- if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
- convertVOP3PDPPInst(MI);
- } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
- AMDGPU::isVOPC64DPP(Opc)) {
- convertVOPCDPPInst(MI);
- } else {
- if (isMacDPP(MI))
- convertMacDPPInst(MI);
+ int VDstInIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
+ if (VDstInIdx != -1)
+ insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
- int VDstInIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
- if (VDstInIdx != -1)
- insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
+ if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
+ MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
+ insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
- if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
- MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
- insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
+ unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
+ convertTrue16OpSel(MI);
+ auto Mods = collectVOPModifiers(MI);
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
+ AMDGPU::OpName::op_sel);
+ } else {
+ // Insert dummy unused src modifiers.
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src0_modifiers);
- unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
- AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
- convertTrue16OpSel(MI);
- auto Mods = collectVOPModifiers(MI);
- insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
- AMDGPU::OpName::op_sel);
- } else {
- // Insert dummy unused src modifiers.
- if (MI.getNumOperands() < DescNumOps &&
- AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
- insertNamedMCOperand(MI, MCOperand::createImm(0),
- AMDGPU::OpName::src0_modifiers);
-
- if (MI.getNumOperands() < DescNumOps &&
- AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
- insertNamedMCOperand(MI, MCOperand::createImm(0),
- AMDGPU::OpName::src1_modifiers);
- }
+ AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src1_modifiers);
}
- return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
}
-DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
- if (isMacDPP(MI))
- convertMacDPPInst(MI);
-
+void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
convertTrue16OpSel(MI);
int VDstInIdx =
@@ -1008,13 +903,12 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
AMDGPU::OpName::op_sel);
}
- return MCDisassembler::Success;
}
// Note that before gfx10, the MIMG encoding provided no information about
// VADDR size. Consequently, decoded instructions always show address as if it
// has 1 dword, which could be not really so.
-DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
auto TSFlags = MCII->get(MI.getOpcode()).TSFlags;
int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
@@ -1043,7 +937,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
if (BaseOpcode->BVH) {
// Add A16 operand for intersect_ray instructions
addOperand(MI, MCOperand::createImm(BaseOpcode->A16));
- return MCDisassembler::Success;
+ return;
}
bool IsAtomic = (VDstIdx != -1);
@@ -1078,7 +972,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
if (!STI.hasFeature(AMDGPU::FeaturePartialNSAEncoding)) {
// The NSA encoding does not contain enough operands for the
// combination of base opcode / dimension. Should this be an error?
- return MCDisassembler::Success;
+ return;
}
IsPartialNSA = true;
}
@@ -1097,12 +991,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
DstSize += 1;
if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
- return MCDisassembler::Success;
+ return;
int NewOpcode =
AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize);
if (NewOpcode == -1)
- return MCDisassembler::Success;
+ return;
// Widen the register to the correct number of enabled channels.
unsigned NewVdata = AMDGPU::NoRegister;
@@ -1119,7 +1013,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
if (NewVdata == AMDGPU::NoRegister) {
// It's possible to encode this such that the low register + enabled
// components exceeds the register count.
- return MCDisassembler::Success;
+ return;
}
}
@@ -1137,7 +1031,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0,
&MRI.getRegClass(AddrRCID));
if (!NewVAddrSA)
- return MCDisassembler::Success;
+ return;
}
MI.setOpcode(NewOpcode);
@@ -1158,14 +1052,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
MI.erase(MI.begin() + VAddr0Idx + AddrSize,
MI.begin() + VAddr0Idx + Info->VAddrDwords);
}
-
- return MCDisassembler::Success;
}
// Opsel and neg bits are used in src_modifiers and standalone operands. Autogen
// decoder only adds to src_modifiers, so manually add the bits to the other
// operands.
-DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
auto Mods = collectVOPModifiers(MI, true);
@@ -1190,12 +1082,10 @@ DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_hi))
insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi),
AMDGPU::OpName::neg_hi);
-
- return MCDisassembler::Success;
}
// Create dummy old operand and insert optional operands
-DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
@@ -1212,11 +1102,9 @@ DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src1_modifiers);
- return MCDisassembler::Success;
}
-DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
- int ImmLitIdx) const {
+void AMDGPUDisassembler::convertFMAanyK(MCInst &MI, int ImmLitIdx) const {
assert(HasLiteral && "Should have decoded a literal");
const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
unsigned DescNumOps = Desc.getNumOperands();
@@ -1232,7 +1120,6 @@ DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
IsDeferredOp)
Op.setImm(Literal);
}
- return MCDisassembler::Success;
}
const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
@@ -1831,6 +1718,12 @@ MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const {
return decodeSrcOp(OPW32, Val);
}
+MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const {
+ if (Val != AMDGPU::DPP::DPP8_FI_0 && Val != AMDGPU::DPP::DPP8_FI_1)
+ return MCOperand();
+ return MCOperand::createImm(Val);
+}
+
bool AMDGPUDisassembler::isVI() const {
return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
}
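Two aspects of the rewrite above: getInstruction now presets Size to 4 bytes and breaks out of the try-each-decoder-table chain on the first successful decode, and malformed DPP8 candidates are rejected at operand-decode time (decodeDpp8FI returns an invalid MCOperand) instead of through the deleted post-decode isValidDPP8 check. A simplified sketch of that operand-level veto; the addOperand body here is assumed, not copied from the file:

#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
using namespace llvm;

// Returning an invalid MCOperand from an operand decoder makes the generated
// table walker discard the whole candidate encoding and try the next table.
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst,
                                               const MCOperand &Op) {
  if (!Op.isValid())
    return MCDisassembler::Fail;   // candidate rejected
  Inst.addOperand(Op);
  return MCDisassembler::Success;
}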
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 3142b8a..2e1b6fb 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -194,15 +194,15 @@ public:
DecodeStatus decodeCOMPUTE_PGM_RSRC3(uint32_t FourByteBuffer,
raw_string_ostream &KdStream) const;
- DecodeStatus convertEXPInst(MCInst &MI) const;
- DecodeStatus convertVINTERPInst(MCInst &MI) const;
- DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
- DecodeStatus convertSDWAInst(MCInst &MI) const;
- DecodeStatus convertDPP8Inst(MCInst &MI) const;
- DecodeStatus convertMIMGInst(MCInst &MI) const;
- DecodeStatus convertVOP3DPPInst(MCInst &MI) const;
- DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
- DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
+ void convertEXPInst(MCInst &MI) const;
+ void convertVINTERPInst(MCInst &MI) const;
+ void convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
+ void convertSDWAInst(MCInst &MI) const;
+ void convertDPP8Inst(MCInst &MI) const;
+ void convertMIMGInst(MCInst &MI) const;
+ void convertVOP3DPPInst(MCInst &MI) const;
+ void convertVOP3PDPPInst(MCInst &MI) const;
+ void convertVOPCDPPInst(MCInst &MI) const;
void convertMacDPPInst(MCInst &MI) const;
void convertTrue16OpSel(MCInst &MI) const;
@@ -261,6 +261,7 @@ public:
MCOperand decodeBoolReg(unsigned Val) const;
MCOperand decodeSplitBarrier(unsigned Val) const;
+ MCOperand decodeDpp8FI(unsigned Val) const;
int getTTmpIdx(unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a727134..00fa93c 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -177,7 +177,7 @@ static bool isLdsDma(const MachineInstr &MI) {
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);
- return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
+ return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}
ScheduleHazardRecognizer::HazardType
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index a45fea6..a32be1e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1778,13 +1778,9 @@ void AMDGPUInstPrinter::printSDelayALU(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- unsigned Id;
- unsigned Offset;
- unsigned Width;
-
using namespace llvm::AMDGPU::Hwreg;
unsigned Val = MI->getOperand(OpNo).getImm();
- decodeHwreg(Val, Id, Offset, Width);
+ auto [Id, Offset, Width] = HwregEncoding::decode(Val);
StringRef HwRegName = getHwreg(Id, STI);
O << "hwreg(";
@@ -1793,9 +1789,8 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
} else {
O << Id;
}
- if (Width != WIDTH_DEFAULT_ || Offset != OFFSET_DEFAULT_) {
+ if (Width != HwregSize::Default || Offset != HwregOffset::Default)
O << ", " << Offset << ", " << Width;
- }
O << ')';
}
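Several hunks in this patch (assembler parser, printer, hazard recognizer, frame lowering, SIISelLowering, SIModeRegister) switch from hand-rolled shift/mask constants to AMDGPU::Hwreg::HwregEncoding::encode/decode, which per the diffstat lives in Utils/AMDGPUBaseInfo.h. The helper itself is outside this section, but the bit layout follows from the constants deleted from SIDefines.h below: id in bits [5:0], offset in [10:6], and width stored as width-1 in [15:11]. A self-contained sketch that mirrors that layout, offered as an assumption about the helper's behaviour rather than its implementation:

#include <tuple>

// id: bits [5:0], offset: bits [10:6], width-1: bits [15:11].
constexpr unsigned encodeHwregSketch(unsigned Id, unsigned Offset,
                                     unsigned Width) {
  return (Id & 0x3f) | ((Offset & 0x1f) << 6) | (((Width - 1) & 0x1f) << 11);
}

constexpr std::tuple<unsigned, unsigned, unsigned>
decodeHwregSketch(unsigned Imm) {
  return {Imm & 0x3f, (Imm >> 6) & 0x1f, ((Imm >> 11) & 0x1f) + 1};
}

// Round-trip check, e.g. the (ID_MODE, 4, 2) field used by SPDenormModeBitField.
static_assert(std::get<1>(decodeHwregSketch(encodeHwregSketch(1, 4, 2))) == 4, "");
static_assert(std::get<2>(decodeHwregSketch(encodeHwregSketch(1, 4, 2))) == 2, "");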
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 98310c3..0b516bf 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -549,33 +549,12 @@ enum Id { // HwRegCode, (6) [5:0]
ID_SQ_PERF_SNAPSHOT_DATA1 = 22,
ID_SQ_PERF_SNAPSHOT_PC_LO = 23,
ID_SQ_PERF_SNAPSHOT_PC_HI = 24,
-
- ID_SHIFT_ = 0,
- ID_WIDTH_ = 6,
- ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
};
enum Offset : unsigned { // Offset, (5) [10:6]
- OFFSET_DEFAULT_ = 0,
- OFFSET_SHIFT_ = 6,
- OFFSET_WIDTH_ = 5,
- OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
-
OFFSET_MEM_VIOL = 8,
};
-enum WidthMinusOne : unsigned { // WidthMinusOne, (5) [15:11]
- WIDTH_M1_DEFAULT_ = 31,
- WIDTH_M1_SHIFT_ = 11,
- WIDTH_M1_WIDTH_ = 5,
- WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_),
-};
-
-// Some values from WidthMinusOne mapped into Width domain.
-enum Width : unsigned {
- WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1,
-};
-
enum ModeRegisterMasks : uint32_t {
FP_ROUND_MASK = 0xf << 0, // Bits 0..3
FP_DENORM_MASK = 0xf << 4, // Bits 4..7
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index d02aee7..4f106bf 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -478,14 +478,13 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addImm(0);
Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
- addReg(FlatScrInitLo).
- addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
- (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
- addReg(FlatScrInitHi).
- addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
- (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
+ using namespace AMDGPU::Hwreg;
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
+ .addReg(FlatScrInitLo)
+ .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
+ .addReg(FlatScrInitHi)
+ .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 257dff6..d8f528d8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3960,7 +3960,7 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
assert(Op.getValueType() == MVT::i32);
uint32_t BothRoundHwReg =
- AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4);
+ AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
SDValue IntrinID =
@@ -4195,8 +4195,8 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
MachineBasicBlock::iterator I = LoopBB->end();
- const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
- AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
+ const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
+ AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
// Clear TRAP_STS.MEM_VIOL
BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
@@ -4999,18 +4999,16 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
// Otherwise there was overflow and the result is hi2:0. In both cases the
// result should represent the actual time at some point during the sequence
// of three getregs.
+ using namespace AMDGPU::Hwreg;
Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
- .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
- 0, 32));
+ .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
- .addImm(
- AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES, 0, 32));
+ .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
- .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
- 0, 32));
+ .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
.addReg(RegHi1)
.addReg(RegHi2);
@@ -5207,8 +5205,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
// FIXME: This could be predicates on the immediate, but tablegen doesn't
// allow you to have a no side effect instruction in the output of a
// sideeffecting pattern.
- unsigned ID, Offset, Width;
- AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
+ auto [ID, Offset, Width] =
+ AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
if (ID != AMDGPU::Hwreg::ID_MODE)
return BB;
@@ -10495,9 +10493,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
DenominatorScaled, Flags);
- const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
- (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
- (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+ using namespace AMDGPU::Hwreg;
+ const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
const MachineFunction &MF = DAG.getMachineFunction();
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6ecb1c8..a6184c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -480,6 +480,10 @@ public:
// WaitEventType to corresponding counter values in InstCounterType.
virtual const unsigned *getWaitEventMask() const = 0;
+ // Returns a new waitcnt with all counters except VScnt set to 0. If
+ // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
+ virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
+
virtual ~WaitcntGenerator() = default;
};
@@ -516,6 +520,8 @@ public:
return WaitEventMaskForInstPreGFX12;
}
+
+ virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
@@ -549,6 +555,8 @@ public:
return WaitEventMaskForInstGFX12Plus;
}
+
+ virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
class SIInsertWaitcnts : public MachineFunctionPass {
@@ -1304,6 +1312,16 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
return Modified;
}
+AMDGPU::Waitcnt
+WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+ return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
+}
+
+AMDGPU::Waitcnt
+WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+ return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
+}
+
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
/// were added by previous passes. Currently this pass conservatively
@@ -1613,8 +1631,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
- Wait = Wait.combined(
- AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
+ Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
}
// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
// stores. In this case it can be useful to send a message to explicitly
@@ -1834,8 +1851,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
- Wait = Wait.combined(
- AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt()));
+ Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
@@ -1851,7 +1867,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.simplifyWaitcnt(Wait);
if (ForceEmitZeroWaitcnts)
- Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts());
+ Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
if (ForceEmitWaitcnt[LOAD_CNT])
Wait.LoadCnt = 0;
@@ -2089,7 +2105,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
if (callWaitsOnFunctionReturn(Inst)) {
// Act as a wait on everything
ScoreBrackets->applyWaitcnt(
- AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
+ WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
ScoreBrackets->setStateOnFunctionEntryOrReturn();
} else {
// May need to way wait for anything.
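The getAllZeroWaitcnt hook added above folds the old allZero/allZeroExceptVsCnt helpers into a per-generation virtual, relying on the convention that a counter value of ~0u means "no wait required" while 0 means "wait until the counter drains". A tiny sketch of that convention with hypothetical field names (the real AMDGPU::Waitcnt carries more counters, especially on GFX12+):

struct WaitcntSketch {
  // ~0u == no wait needed for this counter; 0 == wait for it to reach zero.
  unsigned LoadCnt = ~0u;
  unsigned DsCnt = ~0u;
  unsigned ExpCnt = ~0u;
  unsigned StoreCnt = ~0u;
};

// Loosely mirrors the pre-GFX12 override: zero everything, but only force the
// store counter when asked to and when the target actually has one.
WaitcntSketch allZero(bool IncludeStoreCnt, bool HasStoreCnt) {
  return {0, 0, 0, (IncludeStoreCnt && HasStoreCnt) ? 0u : ~0u};
}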
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 97c7237..34cdb09 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -987,8 +987,8 @@ def SDWAVopcDst : BoolRC {
}
class NamedIntOperand<ValueType Type, string Prefix, bit Optional = 1,
- string ConvertMethod = "nullptr">
- : CustomOperand<Type, Optional, NAME> {
+ string name = NAME, string ConvertMethod = "nullptr">
+ : CustomOperand<Type, Optional, name> {
let ParserMethod =
"[this](OperandVector &Operands) -> ParseStatus { "#
"return parseIntWithPrefix(\""#Prefix#"\", Operands, "#
@@ -1090,9 +1090,12 @@ let DefaultValue = "0xf" in {
def DppRowMask : NamedIntOperand<i32, "row_mask">;
def DppBankMask : NamedIntOperand<i32, "bank_mask">;
}
-def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl", 1,
+def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl", 1, "DppBoundCtrl",
"[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }">;
-def DppFI : NamedIntOperand<i32, "fi">;
+
+let DecoderMethod = "decodeDpp8FI" in
+def Dpp8FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
+def Dpp16FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
def blgp : CustomOperand<i32, 1, "BLGP">;
def CBSZ : NamedIntOperand<i32, "cbsz">;
@@ -1823,7 +1826,7 @@ class getInsDPP16 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperan
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
- (ins DppFI:$fi));
+ (ins Dpp16FI:$fi));
}
class getInsDPP8 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC,
@@ -1831,7 +1834,7 @@ class getInsDPP8 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
- (ins dpp8:$dpp8, DppFI:$fi));
+ (ins dpp8:$dpp8, Dpp8FI:$fi));
}
class getInsVOP3DPPBase<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld> {
@@ -1851,12 +1854,12 @@ class getInsVOP3DPP<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit Has
class getInsVOP3DPP16<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
dag ret = !con(getInsVOP3DPP<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
- (ins DppFI:$fi));
+ (ins Dpp16FI:$fi));
}
class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
- (ins dpp8:$dpp8, DppFI:$fi));
+ (ins dpp8:$dpp8, Dpp8FI:$fi));
}
// Ins for SDWA
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index e62ad02..c01b126 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -225,11 +225,10 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
unsigned Offset = llvm::countr_zero<unsigned>(InstrMode.Mask);
unsigned Width = llvm::countr_one<unsigned>(InstrMode.Mask >> Offset);
unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
+ using namespace AMDGPU::Hwreg;
BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32))
.addImm(Value)
- .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
- (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
- (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
+ .addImm(HwregEncoding::encode(ID_MODE, Offset, Width));
++NumSetregInserted;
Changed = true;
InstrMode.Mask &= ~(((1 << Width) - 1) << Offset);
@@ -276,15 +275,11 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
// as we assume it has been inserted by a higher authority (this is
// likely to be a very rare occurrence).
unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
- if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) !=
- AMDGPU::Hwreg::ID_MODE)
+ using namespace AMDGPU::Hwreg;
+ auto [Id, Offset, Width] = HwregEncoding::decode(Dst);
+ if (Id != ID_MODE)
continue;
- unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >>
- AMDGPU::Hwreg::WIDTH_M1_SHIFT_) +
- 1;
- unsigned Offset =
- (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_;
unsigned Mask = maskTrailingOnes<unsigned>(Width) << Offset;
// If an InsertionPoint is set we will insert a setreg there.
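
For reference, a small standalone check of the arithmetic behind the HwregEncoding::encode()/decode() calls above, following the field layout introduced in the AMDGPUBaseInfo.h hunk later in this patch (id in bits [5:0], offset in bits [10:6], width-1 in bits [15:11]). The concrete ID_MODE value and the offset/width pair below are illustrative assumptions, not values taken from the patch.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t ID_MODE = 1; // assumed numeric value, for illustration only
  const uint64_t Offset = 4;  // hypothetical 2-bit field starting at bit 4
  const uint64_t Width = 2;

  // What HwregEncoding::encode(ID_MODE, Offset, Width) packs into the
  // S_SETREG_IMM32_B32 simm16 operand.
  uint64_t Encoded = ID_MODE | (Offset << 6) | ((Width - 1) << 11);
  assert(Encoded == 0x901);

  // What HwregEncoding::decode(Encoded) hands back via the structured
  // binding used in processBlockPhase1() above.
  uint64_t Id = Encoded & 0x3f;
  uint64_t Off = (Encoded >> 6) & 0x1f;
  uint64_t W = ((Encoded >> 11) & 0x1f) + 1;
  assert(Id == ID_MODE && Off == Offset && W == Width);
  return 0;
}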
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index dacdf7b..ce91e05 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1698,22 +1698,14 @@ int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) {
return (Idx < 0) ? Idx : Opr[Idx].Encoding;
}
-bool isValidHwreg(int64_t Id) {
- return 0 <= Id && isUInt<ID_WIDTH_>(Id);
-}
+bool isValidHwreg(int64_t Id) { return 0 <= Id && isUInt<HwregId::Width>(Id); }
bool isValidHwregOffset(int64_t Offset) {
- return 0 <= Offset && isUInt<OFFSET_WIDTH_>(Offset);
+ return 0 <= Offset && isUInt<HwregOffset::Width>(Offset);
}
bool isValidHwregWidth(int64_t Width) {
- return 0 <= (Width - 1) && isUInt<WIDTH_M1_WIDTH_>(Width - 1);
-}
-
-uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
- return (Id << ID_SHIFT_) |
- (Offset << OFFSET_SHIFT_) |
- ((Width - 1) << WIDTH_M1_SHIFT_);
+ return 0 <= (Width - 1) && isUInt<HwregSize::Width>(Width - 1);
}
StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
@@ -1721,12 +1713,6 @@ StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
return (Idx < 0) ? "" : Opr[Idx].Name;
}
-void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) {
- Id = (Val & ID_MASK_) >> ID_SHIFT_;
- Offset = (Val & OFFSET_MASK_) >> OFFSET_SHIFT_;
- Width = ((Val & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
-}
-
} // namespace Hwreg
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f35e7744..6826cd2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -322,6 +322,35 @@ getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs,
} // end namespace IsaInfo
+// Represents a field in an encoded value.
+template <unsigned HighBit, unsigned LowBit, unsigned D = 0>
+struct EncodingField {
+ static_assert(HighBit >= LowBit, "Invalid bit range!");
+ static constexpr unsigned Offset = LowBit;
+ static constexpr unsigned Width = HighBit - LowBit + 1;
+
+ using ValueType = unsigned;
+ static constexpr ValueType Default = D;
+
+ ValueType Value;
+ constexpr EncodingField(ValueType Value) : Value(Value) {}
+
+ constexpr uint64_t encode() const { return Value; }
+ static ValueType decode(uint64_t Encoded) { return Encoded; }
+};
+
+// A helper for encoding and decoding multiple fields.
+template <typename... Fields> struct EncodingFields {
+ static constexpr uint64_t encode(Fields... Values) {
+ return ((Values.encode() << Values.Offset) | ...);
+ }
+
+ static std::tuple<typename Fields::ValueType...> decode(uint64_t Encoded) {
+ return {Fields::decode((Encoded >> Fields::Offset) &
+ maxUIntN(Fields::Width))...};
+ }
+};
+
LLVM_READONLY
int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
@@ -870,15 +899,6 @@ struct Waitcnt {
: LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {}
- static Waitcnt allZero(bool Extended, bool HasStorecnt) {
- return Extended ? Waitcnt(0, 0, 0, 0, 0, 0, 0)
- : Waitcnt(0, 0, 0, HasStorecnt ? 0 : ~0u);
- }
-
- static Waitcnt allZeroExceptVsCnt(bool Extended) {
- return Extended ? Waitcnt(0, 0, 0, ~0u, 0, 0, 0) : Waitcnt(0, 0, 0, ~0u);
- }
-
bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
bool hasWaitExceptStoreCnt() const {
@@ -1030,6 +1050,17 @@ unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);
namespace Hwreg {
+using HwregId = EncodingField<5, 0>;
+using HwregOffset = EncodingField<10, 6>;
+
+struct HwregSize : EncodingField<15, 11, 32> {
+ using EncodingField::EncodingField;
+ constexpr uint64_t encode() const { return Value - 1; }
+ static ValueType decode(uint64_t Encoded) { return Encoded + 1; }
+};
+
+using HwregEncoding = EncodingFields<HwregId, HwregOffset, HwregSize>;
+
LLVM_READONLY
int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI);
@@ -1043,13 +1074,8 @@ LLVM_READNONE
bool isValidHwregWidth(int64_t Width);
LLVM_READNONE
-uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width);
-
-LLVM_READNONE
StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI);
-void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width);
-
} // namespace Hwreg
namespace DepCtr {
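
Because the EncodingField/EncodingFields helpers carry most of the weight of this header change, here is a self-contained mirror of them together with the Hwreg field definitions from the same hunk, compiling on its own to show the encode/decode round trip. The local maxUIntN and the main() driver are example-only additions standing in for the LLVM originals.

#include <cassert>
#include <cstdint>
#include <tuple>

// Example-only stand-in for llvm::maxUIntN (valid for N < 64).
constexpr uint64_t maxUIntN(unsigned N) { return (uint64_t(1) << N) - 1; }

// Mirror of the EncodingField/EncodingFields helpers added above.
template <unsigned HighBit, unsigned LowBit, unsigned D = 0>
struct EncodingField {
  static_assert(HighBit >= LowBit, "Invalid bit range!");
  static constexpr unsigned Offset = LowBit;
  static constexpr unsigned Width = HighBit - LowBit + 1;

  using ValueType = unsigned;
  static constexpr ValueType Default = D;

  ValueType Value;
  constexpr EncodingField(ValueType Value) : Value(Value) {}

  constexpr uint64_t encode() const { return Value; }
  static ValueType decode(uint64_t Encoded) { return Encoded; }
};

template <typename... Fields> struct EncodingFields {
  static constexpr uint64_t encode(Fields... Values) {
    return ((Values.encode() << Values.Offset) | ...);
  }
  static std::tuple<typename Fields::ValueType...> decode(uint64_t Encoded) {
    return {Fields::decode((Encoded >> Fields::Offset) &
                           maxUIntN(Fields::Width))...};
  }
};

// The Hwreg layout from the same hunk; HwregSize stores width-1 in the
// encoding while callers keep working with the real width.
using HwregId = EncodingField<5, 0>;
using HwregOffset = EncodingField<10, 6>;
struct HwregSize : EncodingField<15, 11, 32> {
  using EncodingField::EncodingField;
  constexpr uint64_t encode() const { return Value - 1; }
  static ValueType decode(uint64_t Encoded) { return Encoded + 1; }
};
using HwregEncoding = EncodingFields<HwregId, HwregOffset, HwregSize>;

int main() {
  uint64_t Packed = HwregEncoding::encode(1, 4, 3); // id=1, offset=4, width=3
  auto [Id, Offset, Width] = HwregEncoding::decode(Packed);
  assert(Packed == 0x1101 && Id == 1 && Offset == 4 && Width == 3);
}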
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 99f8e8e..f5424cf 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -380,9 +380,9 @@ class VOP_MOVREL<RegisterOperand Src1RC> : VOPProfile<[untyped, i32, untyped, un
let OutsDPP = (outs Src0RC32:$vdst);
let InsDPP16 = (ins Src0RC32:$old, Src0RC32:$src0,
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
- DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi);
+ DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
let AsmDPP16 = getAsmDPP16<1, 1, 0>.ret;
- let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, DppFI:$fi);
+ let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, Dpp8FI:$fi);
let AsmDPP8 = getAsmDPP8<1, 1, 0>.ret;
let OutsVOP3DPP = (outs Src0RC64:$vdst);
@@ -749,7 +749,7 @@ class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, int subtarget, VOPProfile p = p
class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> :
VOP1_DPP16 <op, ps, Gen.Subtarget, p> {
let AssemblerPredicate = Gen.AssemblerPredicate;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
}
class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
@@ -770,7 +770,7 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> :
VOP1_DPP8<op, ps, p> {
let AssemblerPredicate = Gen.AssemblerPredicate;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
}
//===----------------------------------------------------------------------===//
@@ -816,7 +816,7 @@ multiclass VOP1_Real_dpp_with_name<GFXGen Gen, bits<9> op, string opName,
string asmName> {
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP16,
- DecoderNamespace = "DPP" # Gen.DecoderNamespace #
+ DecoderNamespace = Gen.DecoderNamespace #
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
defm NAME : VOP1_Real_dpp<Gen, op, opName>;
}
@@ -831,7 +831,7 @@ multiclass VOP1_Real_dpp8_with_name<GFXGen Gen, bits<9> op, string opName,
string asmName> {
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP8,
- DecoderNamespace = "DPP8" # Gen.DecoderNamespace #
+ DecoderNamespace = Gen.DecoderNamespace #
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
defm NAME : VOP1_Real_dpp8<Gen, op, opName>;
}
@@ -994,9 +994,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
multiclass VOP1_Real_dpp8_gfx10<bits<9> op> {
if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then
- def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
- let DecoderNamespace = "DPP8";
- }
+ def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
}
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
@@ -1192,16 +1190,14 @@ class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
let Inst{31-25} = 0x3f; //encoding
}
-multiclass VOP1Only_Real_vi <bits<10> op> {
- let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in {
+let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in {
+ multiclass VOP1Only_Real_vi <bits<10> op> {
def _vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
}
-}
-multiclass VOP1_Real_e32e64_vi <bits<10> op> {
- let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in {
+ multiclass VOP1_Real_e32e64_vi <bits<10> op> {
def _e32_vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
@@ -1389,44 +1385,41 @@ def : GCNPat <
// GFX9
//===----------------------------------------------------------------------===//
-multiclass VOP1_Real_gfx9 <bits<10> op> {
- let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
+let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
+ multiclass VOP1_Real_gfx9 <bits<10> op> {
defm NAME : VOP1_Real_e32e64_vi <op>;
- }
-
- if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
- def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
- VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
-
- if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
- def _dpp_gfx9 :
- VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
- VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
-
-}
-multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> {
- let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
- defm NAME : VOP1_Real_e32e64_vi <op>;
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
+ def _dpp_gfx9 :
+ VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+ VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
}
- if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
- def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
- VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
- let Inst{42-40} = 6;
- }
+ multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> {
+ defm NAME : VOP1_Real_e32e64_vi <op>;
- if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
- def _dpp_gfx9 :
- VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
- VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+ let Inst{42-40} = 6;
+ }
+
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
+ def _dpp_gfx9 :
+ VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+ VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+ }
}
defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
-let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in
+let AssemblerPredicate = isGFX940Plus in
defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>;
let OtherPredicates = [HasFP8ConversionInsts] in {
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 4437d5f..13fe79b 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -430,7 +430,7 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
- let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+ let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
let InsVOP3Base = getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP, RegisterOperand<VGPR_32>, 3,
0, HasModifiers, HasModifiers, HasOMod,
Src0ModVOP3DPP, Src1ModVOP3DPP, Src2Mod, HasOpSel>.ret;
@@ -447,7 +447,7 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
- dpp8:$dpp8, DppFI:$fi);
+ dpp8:$dpp8, Dpp8FI:$fi);
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
@@ -500,7 +500,7 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> {
let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument
- dpp8:$dpp8, DppFI:$fi);
+ dpp8:$dpp8, Dpp8FI:$fi);
let Src2Mod = FP32InputMods; // dummy unused modifiers
let Src2RC64 = VGPRSrc_32; // stub argument
}
@@ -552,11 +552,11 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], /*EnableClamp=*/
Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
- let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+ let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
let InsDPP8 = (ins DstRCDPP:$old,
Src0DPP:$src0,
Src1DPP:$src1,
- dpp8:$dpp8, DppFI:$fi);
+ dpp8:$dpp8, Dpp8FI:$fi);
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
let OutsVOP3DPP = Outs64;
@@ -594,11 +594,11 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableClamp=*/1>
Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
- let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+ let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
let InsDPP8 = (ins DstRCDPP:$old,
Src0DPP:$src0,
Src1DPP:$src1,
- dpp8:$dpp8, DppFI:$fi);
+ dpp8:$dpp8, Dpp8FI:$fi);
let HasExt = 1;
let HasExtDPP = 1;
@@ -645,11 +645,11 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
FPVRegInputMods:$src1_modifiers, Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
- let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+ let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
let InsDPP8 = (ins DstRCDPP:$old,
FPVRegInputMods:$src0_modifiers, Src0DPP:$src0,
FPVRegInputMods:$src1_modifiers, Src1DPP:$src1,
- dpp8:$dpp8, DppFI:$fi);
+ dpp8:$dpp8, Dpp8FI:$fi);
let Src0ModVOP3DPP = FPVRegInputMods;
let Src1ModVOP3DPP = FPVRegInputMods;
@@ -1273,7 +1273,7 @@ class VOP2_DPP16_Gen<bits<6> op, VOP2_DPP_Pseudo ps, GFXGen Gen,
VOP2_DPP16<op, ps, Gen.Subtarget, opName, p> {
let AssemblerPredicate = Gen.AssemblerPredicate;
let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []);
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace#
+ let DecoderNamespace = Gen.DecoderNamespace#
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
}
@@ -1302,7 +1302,7 @@ class VOP2_DPP8_Gen<bits<6> op, VOP2_Pseudo ps, GFXGen Gen,
VOP2_DPP8<op, ps, p> {
let AssemblerPredicate = Gen.AssemblerPredicate;
let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []);
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace#
+ let DecoderNamespace = Gen.DecoderNamespace#
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
}
@@ -1748,9 +1748,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
multiclass VOP2_Real_dpp8_gfx10<bits<6> op> {
if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then
- def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
- let DecoderNamespace = "DPP8";
- }
+ def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
}
//===------------------------- VOP2 (with name) -------------------------===//
@@ -1797,7 +1795,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP8;
- let DecoderNamespace = "DPP8";
}
}
@@ -1876,7 +1873,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
- let DecoderNamespace = "DPP8";
}
if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then
def _dpp8_w32_gfx10 :
@@ -2231,7 +2227,7 @@ multiclass VOP2_SDWA9_Real <bits<6> op> {
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
-let AssemblerPredicate = isGFX8Only in {
+let AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" in {
multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName> {
def _e32_vi :
@@ -2239,14 +2235,12 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "GFX8";
}
def _e64_vi :
VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.VI>,
VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "GFX8";
}
if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA then
def _sdwa_vi :
@@ -2263,9 +2257,10 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
let AsmString = AsmName # ps.AsmOperands;
}
}
-}
-let AssemblerPredicate = isGFX9Only in {
+} // End AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8"
+
+let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
def _e32_gfx9 :
@@ -2273,14 +2268,12 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "GFX9";
}
def _e64_gfx9 :
VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.GFX9>,
VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "GFX9";
}
if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx9 :
@@ -2295,21 +2288,16 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "GFX9";
}
}
multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
def _e32_gfx9 :
VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX9>,
- VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>{
- let DecoderNamespace = "GFX9";
- }
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
def _e64_gfx9 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>,
- VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
- let DecoderNamespace = "GFX9";
- }
+ VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
@@ -2318,12 +2306,10 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
- VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
- let DecoderNamespace = "GFX9";
- }
+ VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
-} // AssemblerPredicate = isGFX9Only
+} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 396ae9c..7198a40 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -532,11 +532,11 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
VGPR_32:$vdst_in, op_sel0:$op_sel,
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
- DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi);
+ DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
let InsVOP3DPP8 = (ins VGPR_32:$old,
FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
- VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi);
+ VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi);
let HasClamp = 0;
let HasExtVOP3DPP = 1;
@@ -553,12 +553,12 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
FP32InputMods:$src2_modifiers, VGPR_32:$src2,
op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
- DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi);
+ DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
let InsVOP3DPP8 = (ins VGPR_32:$old,
FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
FP32InputMods:$src2_modifiers, VGPR_32:$src2,
- op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi);
+ op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi);
let HasClamp = 0;
let HasSrc2 = 0;
let HasSrc2Mods = 1;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 74f451b..ac3c8f9 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -461,13 +461,13 @@ def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>,
let InsVOP3DPP8 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
- neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, DppFI:$fi);
+ neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, Dpp8FI:$fi);
let InsVOP3DPP16 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp_ctrl:$dpp_ctrl,
DppRowMask:$row_mask, DppBankMask:$bank_mask,
- DppBoundCtrl:$bound_ctrl, DppFI:$fi);
+ DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
}
multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> {
@@ -1353,6 +1353,7 @@ class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget,
let AssemblerPredicate = HasDPP16;
let SubtargetPredicate = HasDPP16;
let OtherPredicates = ps.OtherPredicates;
+ let IsPacked = ps.IsPacked;
}
class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName>
@@ -1362,6 +1363,7 @@ class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName>
let SchedRW = ps.SchedRW;
let Uses = ps.Uses;
let OtherPredicates = ps.OtherPredicates;
+ let IsPacked = ps.IsPacked;
}
//===----------------------------------------------------------------------===//
@@ -1486,7 +1488,7 @@ multiclass VOP3P_Real_dpp<GFXGen Gen, bits<7> op, string backing_ps_name = NAME,
: VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"),
Gen.Subtarget> {
let AsmString = asmName #ps.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
@@ -1496,7 +1498,7 @@ multiclass VOP3P_Real_dpp8<GFXGen Gen, bits<7> op, string backing_ps_name = NAME
defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
def _dpp8#Gen.Suffix : VOP3P_DPP8_Base<op, ps> {
let AsmString = asmName #ps.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
@@ -1613,7 +1615,7 @@ multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string
multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"),
VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> {
- let SubtargetPredicate = isGFX940Plus,
+ let AssemblerPredicate = isGFX940Plus,
DecoderNamespace = "GFX940",
AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in {
def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>,
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index fe52a0e..e5e8244 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -222,6 +222,8 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst,
let AsmVariantName = AMDGPUAsmVariants.Default;
let SubtargetPredicate = AssemblerPredicate;
+
+ string DecoderNamespace; // dummy
}
multiclass VOPCInstAliases <string old_name, string Arch, string real_name = old_name, string mnemonic_from = real_name> {
@@ -766,7 +768,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType
let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
let AsmDPP16 = AsmDPP#"$fi";
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
- let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+ let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
// DPP8 forbids modifiers and can inherit from VOPC_Profile
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
@@ -1331,196 +1333,176 @@ class VOPC64_DPP8_NoDst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
//===----------------------------------------------------------------------===//
multiclass VOPC_Real_Base<GFXGen Gen, bits<9> op> {
- let AssemblerPredicate = Gen.AssemblerPredicate in {
+ let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in {
defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_e32");
defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_e64");
- let DecoderNamespace = Gen.DecoderNamespace in {
- def _e32#Gen.Suffix : VOPC_Real<ps32, Gen.Subtarget>,
- VOPCe<op{7-0}>;
- def _e64#Gen.Suffix : VOP3_Real<ps64, Gen.Subtarget>,
- VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
- // Encoding used for VOPC instructions encoded as VOP3 differs from
- // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
- bits<8> sdst;
- let Inst{7-0} = sdst;
- }
- } // End DecoderNamespace = Gen.DecoderNamespace
+ def _e32#Gen.Suffix : VOPC_Real<ps32, Gen.Subtarget>,
+ VOPCe<op{7-0}>;
+ def _e64#Gen.Suffix : VOP3_Real<ps64, Gen.Subtarget>,
+ VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
defm : VOPCInstAliases<NAME, !substr(Gen.Suffix,1)>;
if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e32" #"_dpp");
defvar AsmDPP = ps32.Pfl.AsmDPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget>;
- def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
- let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
- let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget>;
+ def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
+ let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
+ let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32>;
- def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
- let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
- let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32>;
+ def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
+ let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
+ let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
}
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>,
- SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
- def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
- let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
- let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>,
+ SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
+ def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
+ let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
+ let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>;
- def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
- let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
- let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>;
+ def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
+ let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
+ let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
}
- } // AssemblerPredicate = Gen.AssemblerPredicate
+ } // AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
}
multiclass VOPC_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
string asm_name, string pseudo_mnemonic = ""> {
- let AssemblerPredicate = Gen.AssemblerPredicate in {
+ let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in {
defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_e32");
defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_e64");
- let DecoderNamespace = Gen.DecoderNamespace in {
- def _e32#Gen.Suffix :
- // 32 and 64 bit forms of the instruction have _e32 and _e64
- // respectively appended to their assembly mnemonic.
- // _e64 is printed as part of the VOPDstS64orS32 operand, whereas
- // the destination-less 32bit forms add it to the asmString here.
- VOPC_Real<ps32, Gen.Subtarget, asm_name#"_e32">,
- VOPCe<op{7-0}>,
- MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic,
- pseudo_mnemonic),
- asm_name, ps32.AsmVariantName>,
- Requires<[Gen.AssemblerPredicate]>;
- def _e64#Gen.Suffix :
- VOP3_Real<ps64, Gen.Subtarget, asm_name>,
- VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>,
- MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic,
- pseudo_mnemonic),
- asm_name, ps64.AsmVariantName>,
- Requires<[Gen.AssemblerPredicate]> {
- // Encoding used for VOPC instructions encoded as VOP3 differs from
- // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
- bits<8> sdst;
- let Inst{7-0} = sdst;
- }
- } // End DecoderNamespace = Gen.DecoderNamespace
+ def _e32#Gen.Suffix :
+ // 32 and 64 bit forms of the instruction have _e32 and _e64
+ // respectively appended to their assembly mnemonic.
+ // _e64 is printed as part of the VOPDstS64orS32 operand, whereas
+ // the destination-less 32bit forms add it to the asmString here.
+ VOPC_Real<ps32, Gen.Subtarget, asm_name#"_e32">,
+ VOPCe<op{7-0}>,
+ MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic,
+ pseudo_mnemonic),
+ asm_name, ps32.AsmVariantName>,
+ Requires<[Gen.AssemblerPredicate]>;
+ def _e64#Gen.Suffix :
+ VOP3_Real<ps64, Gen.Subtarget, asm_name>,
+ VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>,
+ MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic,
+ pseudo_mnemonic),
+ asm_name, ps64.AsmVariantName>,
+ Requires<[Gen.AssemblerPredicate]> {
+ // Encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
defm : VOPCInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>;
if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e32" #"_dpp");
defvar AsmDPP = ps32.Pfl.AsmDPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
- Gen.Subtarget, asm_name>;
- def _e32_dpp_w32#Gen.Suffix
- : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
- let AsmString = asm_name # " vcc_lo, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e32_dpp_w64#Gen.Suffix
- : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
- let AsmString = asm_name # " vcc, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
+ Gen.Subtarget, asm_name>;
+ def _e32_dpp_w32#Gen.Suffix
+ : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
+ let AsmString = asm_name # " vcc_lo, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e32_dpp_w64#Gen.Suffix
+ : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
+ let AsmString = asm_name # " vcc, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
- def _e32_dpp8_w32#Gen.Suffix
- : VOPC_DPP8<op{7-0}, ps32, asm_name> {
- let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e32_dpp8_w64#Gen.Suffix
- : VOPC_DPP8<op{7-0}, ps32, asm_name> {
- let AsmString = asm_name # " vcc, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
+ def _e32_dpp8_w32#Gen.Suffix
+ : VOPC_DPP8<op{7-0}, ps32, asm_name> {
+ let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e32_dpp8_w64#Gen.Suffix
+ : VOPC_DPP8<op{7-0}, ps32, asm_name> {
+ let AsmString = asm_name # " vcc, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
}
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>,
- SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
- def _e64_dpp_w32#Gen.Suffix
- : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
- let AsmString = asm_name # " vcc_lo, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp_w64#Gen.Suffix
- : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
- let AsmString = asm_name # " vcc, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>,
+ SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
+ def _e64_dpp_w32#Gen.Suffix
+ : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
+ let AsmString = asm_name # " vcc_lo, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e64_dpp_w64#Gen.Suffix
+ : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
+ let AsmString = asm_name # " vcc, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>;
- def _e64_dpp8_w32#Gen.Suffix
- : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
- let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp8_w64#Gen.Suffix
- : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
- let AsmString = asm_name # " vcc, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>;
+ def _e64_dpp8_w32#Gen.Suffix
+ : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
+ let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e64_dpp8_w64#Gen.Suffix
+ : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
+ let AsmString = asm_name # " vcc, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
}
- } // AssemblerPredicate = Gen.AssemblerPredicate
+ } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
}
multiclass VOPC_Real_t16<GFXGen Gen, bits<9> op, string asm_name,
@@ -1528,123 +1510,103 @@ multiclass VOPC_Real_t16<GFXGen Gen, bits<9> op, string asm_name,
VOPC_Real_with_name<Gen, op, OpName, asm_name, pseudo_mnemonic>;
multiclass VOPCX_Real<GFXGen Gen, bits<9> op> {
- let AssemblerPredicate = Gen.AssemblerPredicate in {
+ let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in {
defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_nosdst_e32");
defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_nosdst_e64");
- let DecoderNamespace = Gen.DecoderNamespace in {
- def _e32#Gen.Suffix :
- VOPC_Real<ps32, Gen.Subtarget>,
- VOPCe<op{7-0}> {
- let AsmString = !subst("_nosdst", "", ps32.PseudoInstr)
- # " " # ps32.AsmOperands;
- }
- def _e64#Gen.Suffix :
- VOP3_Real<ps64, Gen.Subtarget>,
- VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
- let Inst{7-0} = ?; // sdst
- let AsmString = !subst("_nosdst", "", ps64.Mnemonic)
- # "{_e64} " # ps64.AsmOperands;
- }
- } // End DecoderNamespace = Gen.DecoderNamespace
+ def _e32#Gen.Suffix :
+ VOPC_Real<ps32, Gen.Subtarget>,
+ VOPCe<op{7-0}> {
+ let AsmString = !subst("_nosdst", "", ps32.PseudoInstr)
+ # " " # ps32.AsmOperands;
+ }
+ def _e64#Gen.Suffix :
+ VOP3_Real<ps64, Gen.Subtarget>,
+ VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
+ let Inst{7-0} = ?; // sdst
+ let AsmString = !subst("_nosdst", "", ps64.Mnemonic)
+ # "{_e64} " # ps64.AsmOperands;
+ }
defm : VOPCXInstAliases<NAME, !substr(Gen.Suffix, 1)>;
if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e32" #"_dpp");
defvar AsmDPP = ps32.Pfl.AsmDPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e32_dpp#Gen.Suffix
- : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget> {
- let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP;
- }
+ def _e32_dpp#Gen.Suffix
+ : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget> {
+ let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP;
}
defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
- let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8;
- }
+ def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
+ let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8;
}
}
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e64" #"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e64_dpp#Gen.Suffix
- : VOPC64_DPP16_NoDst<{0, op}, psDPP>,
- SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
- let AsmString = !subst("_nosdst", "", psDPP.OpName)
- # "{_e64_dpp} " # AsmDPP;
- }
+ def _e64_dpp#Gen.Suffix
+ : VOPC64_DPP16_NoDst<{0, op}, psDPP>,
+ SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
+ let AsmString = !subst("_nosdst", "", psDPP.OpName)
+ # "{_e64_dpp} " # AsmDPP;
}
defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> {
- let AsmString = !subst("_nosdst", "", ps64.OpName)
- # "{_e64_dpp} " # AsmDPP8;
- }
+ def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> {
+ let AsmString = !subst("_nosdst", "", ps64.OpName)
+ # "{_e64_dpp} " # AsmDPP8;
}
}
- } // AssemblerPredicate = Gen.AssemblerPredicate
+ } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
}
multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
string asm_name, string pseudo_mnemonic = ""> {
- let AssemblerPredicate = Gen.AssemblerPredicate in {
+ let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in {
defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_nosdst_e32");
defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_nosdst_e64");
- let DecoderNamespace = Gen.DecoderNamespace in {
- def _e32#Gen.Suffix
- : VOPC_Real<ps32, Gen.Subtarget, asm_name>,
- MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic),
- pseudo_mnemonic),
- asm_name, ps32.AsmVariantName>,
- Requires<[Gen.AssemblerPredicate]>,
- VOPCe<op{7-0}> {
- let AsmString = asm_name # "{_e32} " # ps32.AsmOperands;
- }
- def _e64#Gen.Suffix
- : VOP3_Real<ps64, Gen.Subtarget, asm_name>,
- MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic),
- pseudo_mnemonic),
- asm_name, ps64.AsmVariantName>,
- Requires<[Gen.AssemblerPredicate]>,
- VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
- let Inst{7-0} = ? ; // sdst
- let AsmString = asm_name # "{_e64} " # ps64.AsmOperands;
- }
- } // End DecoderNamespace = Gen.DecoderNamespace
+ def _e32#Gen.Suffix
+ : VOPC_Real<ps32, Gen.Subtarget, asm_name>,
+ MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic),
+ pseudo_mnemonic),
+ asm_name, ps32.AsmVariantName>,
+ Requires<[Gen.AssemblerPredicate]>,
+ VOPCe<op{7-0}> {
+ let AsmString = asm_name # "{_e32} " # ps32.AsmOperands;
+ }
+ def _e64#Gen.Suffix
+ : VOP3_Real<ps64, Gen.Subtarget, asm_name>,
+ MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic),
+ pseudo_mnemonic),
+ asm_name, ps64.AsmVariantName>,
+ Requires<[Gen.AssemblerPredicate]>,
+ VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
+ let Inst{7-0} = ? ; // sdst
+ let AsmString = asm_name # "{_e64} " # ps64.AsmOperands;
+ }
defm : VOPCXInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>;
if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e32"#"_dpp");
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
- Gen.Subtarget, asm_name>;
- }
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
- }
+ def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
+ Gen.Subtarget, asm_name>;
+ def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
}
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e64"#"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e64_dpp#Gen.Suffix
- : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>,
- SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
- let AsmString = asm_name # "{_e64_dpp} " # AsmDPP;
- }
+ def _e64_dpp#Gen.Suffix
+ : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>,
+ SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
+ let AsmString = asm_name # "{_e64_dpp} " # AsmDPP;
}
defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> {
- let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8;
- }
+ def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> {
+ let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8;
}
}
- } // AssemblerPredicate = Gen.AssemblerPredicate
+ } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
}
multiclass VOPCX_Real_t16<GFXGen Gen, bits<9> op, string asm_name,
@@ -1873,21 +1835,19 @@ defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11_gfx12<0x0ff>;
// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Only in {
+let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
multiclass VOPC_Real_gfx10<bits<9> op> {
- let DecoderNamespace = "GFX10" in {
- def _e32_gfx10 :
- VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>,
- VOPCe<op{7-0}>;
- def _e64_gfx10 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
- VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
- // Encoding used for VOPC instructions encoded as VOP3 differs from
- // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
- bits<8> sdst;
- let Inst{7-0} = sdst;
- }
- } // End DecoderNamespace = "GFX10"
+ def _e32_gfx10 :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>,
+ VOPCe<op{7-0}>;
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx10 :
@@ -1898,22 +1858,20 @@ let AssemblerPredicate = isGFX10Only in {
}
multiclass VOPCX_Real_gfx10<bits<9> op> {
- let DecoderNamespace = "GFX10" in {
- def _e32_gfx10 :
- VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>,
- VOPCe<op{7-0}> {
- let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr)
- # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands;
- }
-
- def _e64_gfx10 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>,
- VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> {
- let Inst{7-0} = ?; // sdst
- let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
- # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
- }
- } // End DecoderNamespace = "GFX10"
+ def _e32_gfx10 :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>,
+ VOPCe<op{7-0}> {
+ let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr)
+ # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands;
+ }
+
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>,
+ VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> {
+ let Inst{7-0} = ?; // sdst
+ let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
+ # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
+ }
if !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx10 :
@@ -1925,7 +1883,7 @@ let AssemblerPredicate = isGFX10Only in {
defm : VOPCXInstAliases<NAME, "gfx10">;
}
-} // End AssemblerPredicate = isGFX10Only
+} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>;
defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>;
@@ -1990,25 +1948,23 @@ defm V_CMPX_TRU_F16 : VOPCX_Real_gfx10<0x0ff>;
// GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX6GFX7 in {
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
multiclass VOPC_Real_gfx6_gfx7<bits<9> op> {
- let DecoderNamespace = "GFX6GFX7" in {
- def _e32_gfx6_gfx7 :
- VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
- VOPCe<op{7-0}>;
- def _e64_gfx6_gfx7 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
- // Encoding used for VOPC instructions encoded as VOP3 differs from
- // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
- bits<8> sdst;
- let Inst{7-0} = sdst;
- }
- } // End DecoderNamespace = "GFX6GFX7"
+ def _e32_gfx6_gfx7 :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOPCe<op{7-0}>;
+ def _e64_gfx6_gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
defm : VOPCInstAliases<NAME, "gfx6_gfx7">;
}
-} // End AssemblerPredicate = isGFX6GFX7
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
multiclass VOPC_Real_gfx6_gfx7_gfx10<bits<9> op> :
VOPC_Real_gfx6_gfx7<op>, VOPC_Real_gfx10<op>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 801afab..80d7d96 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -818,6 +818,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
let VALU = 1;
let DPP = 1;
let Size = 8;
+ let IsPacked = P.IsPacked;
let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP);
@@ -835,7 +836,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
- let DecoderNamespace = "DPP";
+ let DecoderNamespace = "GFX8";
VOPProfile Pfl = P;
}
@@ -906,7 +907,7 @@ class VOP_DPP_Base <string OpName, VOPProfile P,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
- let DecoderNamespace = "DPP";
+ let DecoderNamespace = "GFX8";
}
class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
@@ -1350,7 +1351,7 @@ class VOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen,
VOP3_DPP16 <op, ps, Gen.Subtarget, opName> {
let AssemblerPredicate = Gen.AssemblerPredicate;
let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate);
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace#
+ let DecoderNamespace = Gen.DecoderNamespace#
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
}
@@ -1463,7 +1464,7 @@ multiclass VOP3_Real_dpp_with_name<GFXGen Gen, bits<10> op, string opName,
multiclass VOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> {
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
@@ -1473,7 +1474,7 @@ multiclass VOP3Dot_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME>
def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> {
let Inst{11} = ?;
let Inst{12} = ?;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
@@ -1482,7 +1483,7 @@ multiclass VOP3_Real_dpp8_with_name<GFXGen Gen, bits<10> op, string opName,
string asmName> {
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
let AsmString = asmName # ps.Pfl.AsmVOP3DPP8,
- DecoderNamespace = "DPP8"#Gen.DecoderNamespace#
+ DecoderNamespace = Gen.DecoderNamespace#
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16"),
True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts,
NoTrue16Predicate) in {
@@ -1505,7 +1506,7 @@ multiclass VOP3be_Real_dpp<GFXGen Gen, bits<10> op, string opName,
defvar dpp_ps = !cast<VOP_DPP_Pseudo>(opName #"_e64" #"_dpp");
def _e64_dpp#Gen.Suffix : Base_VOP3b_DPP16<op, dpp_ps, asmName>,
SIMCInstr<dpp_ps.PseudoInstr, Gen.Subtarget> {
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
@@ -1514,7 +1515,7 @@ multiclass VOP3be_Real_dpp8<GFXGen Gen, bits<10> op, string opName,
string asmName> {
defvar ps = !cast<VOP3_Pseudo>(opName #"_e64");
def _e64_dpp8#Gen.Suffix : VOP3b_DPP8_Base<op, ps, asmName> {
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index c5199aab..00a29f8 100644
--- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -25,42 +25,6 @@
using namespace llvm;
using namespace LegalizeActions;
-/// FIXME: The following static functions are SizeChangeStrategy functions
-/// that are meant to temporarily mimic the behaviour of the old legalization
-/// based on doubling/halving non-legal types as closely as possible. This is
-/// not entirly possible as only legalizing the types that are exactly a power
-/// of 2 times the size of the legal types would require specifying all those
-/// sizes explicitly.
-/// In practice, not specifying those isn't a problem, and the below functions
-/// should disappear quickly as we add support for legalizing non-power-of-2
-/// sized types further.
-static void addAndInterleaveWithUnsupported(
- LegacyLegalizerInfo::SizeAndActionsVec &result,
- const LegacyLegalizerInfo::SizeAndActionsVec &v) {
- for (unsigned i = 0; i < v.size(); ++i) {
- result.push_back(v[i]);
- if (i + 1 < v[i].first && i + 1 < v.size() &&
- v[i + 1].first != v[i].first + 1)
- result.push_back({v[i].first + 1, LegacyLegalizeActions::Unsupported});
- }
-}
-
-static LegacyLegalizerInfo::SizeAndActionsVec
-widen_8_16(const LegacyLegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 17);
- LegacyLegalizerInfo::SizeAndActionsVec result = {
- {1, LegacyLegalizeActions::Unsupported},
- {8, LegacyLegalizeActions::WidenScalar},
- {9, LegacyLegalizeActions::Unsupported},
- {16, LegacyLegalizeActions::WidenScalar},
- {17, LegacyLegalizeActions::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegacyLegalizeActions::Unsupported});
- return result;
-}
-
static bool AEABI(const ARMSubtarget &ST) {
return ST.isTargetAEABI() || ST.isTargetGNUAEABI() || ST.isTargetMuslAEABI();
}
@@ -118,15 +82,14 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
.libcallFor({s32})
.clampScalar(0, s32, s32);
- for (unsigned Op : {G_SREM, G_UREM}) {
- LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16);
- if (HasHWDivide)
- LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Lower);
- else if (AEABI(ST))
- LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Custom);
- else
- LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Libcall);
- }
+ auto &REMBuilder =
+ getActionDefinitionsBuilder({G_SREM, G_UREM}).minScalar(0, s32);
+ if (HasHWDivide)
+ REMBuilder.lowerFor({s32});
+ else if (AEABI(ST))
+ REMBuilder.customFor({s32});
+ else
+ REMBuilder.libcallFor({s32});
getActionDefinitionsBuilder(G_INTTOPTR)
.legalFor({{p0, s32}})
@@ -202,8 +165,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
LoadStoreBuilder.maxScalar(0, s32);
- for (auto Ty : {s32, s64})
- LegacyInfo.setAction({G_FNEG, Ty}, LegacyLegalizeActions::Lower);
+ getActionDefinitionsBuilder(G_FNEG).lowerFor({s32, s64});
getActionDefinitionsBuilder(G_FCONSTANT).customFor({s32, s64});
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 5215813..8a3454c 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -35,30 +35,18 @@ def BinaryUintCategory : DXILOpCategory<"Binary uint">;
def UnaryFloatCategory : DXILOpCategory<"Unary float">;
def ComputeIDCategory : DXILOpCategory<"Compute/Mesh/Amplification shader">;
-// Following are the scalar types supported by DXIL operations and are synonymous
-// to llvm_*_ty defined for readability and ease of use in the context of this file.
-
-def voidTy : LLVMType<isVoid>;
-
-// Floating point types
-def f16Ty : LLVMType<f16>;
-def f32Ty : LLVMType<f32>;
-def f64Ty : LLVMType<f64>;
-
-// Integer types
-def i1Ty : LLVMType<i1>;
-def i8Ty : LLVMType<i8>;
-def i16Ty : LLVMType<i16>;
-def i32Ty : LLVMType<i32>;
-def i64Ty : LLVMType<i64>;
+// Represented as any pointer type, with an option to change to a qualified
+// pointer type with a specified address space.
+def dxil_handle_ty : LLVMAnyPointerType;
+def dxil_cbuffer_ty : LLVMAnyPointerType;
+def dxil_resource_ty : LLVMAnyPointerType;
// The parameter description for a DXIL operation
-class DXILOpParameter<int pos, string type, string name, string doc,
+class DXILOpParameter<int pos, LLVMType type, string name, string doc,
bit isConstant = 0, string enumName = "",
int maxValue = 0> {
int Pos = pos; // Position in parameter list
- string Type = type; // LLVM type name, $o for overload, $r for resource
- // type, $cb for legacy cbuffer, $u4 for u4 struct
+ LLVMType ParamType = type; // Parameter type
string Name = name; // Short, unique parameter name
string Doc = doc; // Description of this parameter
bit IsConstant = isConstant; // Whether this parameter requires a constant value in the IR
@@ -108,55 +96,55 @@ class DXILOperation<string name, int opCode, DXILOpClass opClass, DXILOpCategory
class LLVMIntrinsic<Intrinsic llvm_intrinsic_> { Intrinsic llvm_intrinsic = llvm_intrinsic_; }
def Sin : DXILOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.",
- [f16Ty,f32Ty], ReadNone,
+ [llvm_half_ty, llvm_float_ty], ReadNone,
[
- DXILOpParameter<0, "$o", "", "operation result">,
- DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DXILOpParameter<2, "$o", "value", "input value">
+ DXILOpParameter<0, llvm_anyfloat_ty, "", "operation result">,
+ DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+ DXILOpParameter<2, llvm_anyfloat_ty, "value", "input value">
],
["floats"]>,
LLVMIntrinsic<int_sin>;
-def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b",
- [i16Ty,i32Ty,i64Ty], ReadNone,
+def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b",
+ [llvm_i16_ty, llvm_i32_ty, llvm_i64_ty], ReadNone,
[
- DXILOpParameter<0, "$o", "", "operation result">,
- DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DXILOpParameter<2, "$o", "a", "input value">,
- DXILOpParameter<3, "$o", "b", "input value">
+ DXILOpParameter<0, llvm_anyint_ty, "", "operation result">,
+ DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+ DXILOpParameter<2, llvm_anyint_ty, "a", "input value">,
+ DXILOpParameter<3, llvm_anyint_ty, "b", "input value">
],
["uints"]>,
LLVMIntrinsic<int_umax>;
-def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [i32Ty], ReadNone,
+def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [llvm_i32_ty], ReadNone,
[
- DXILOpParameter<0, "i32", "", "thread ID component">,
- DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DXILOpParameter<2, "i32", "component", "component to read (x,y,z)">
+ DXILOpParameter<0, llvm_i32_ty, "", "thread ID component">,
+ DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+ DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)">
]>,
LLVMIntrinsic<int_dx_thread_id>;
-def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [i32Ty], ReadNone,
+def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [llvm_i32_ty], ReadNone,
[
- DXILOpParameter<0, "i32", "", "group ID component">,
- DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DXILOpParameter<2, "i32", "component", "component to read">
+ DXILOpParameter<0, llvm_i32_ty, "", "group ID component">,
+ DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+ DXILOpParameter<2, llvm_i32_ty, "component", "component to read">
]>,
LLVMIntrinsic<int_dx_group_id>;
-def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory,
- "reads the thread ID within the group (SV_GroupThreadID)", [i32Ty], ReadNone,
+def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory,
+ "reads the thread ID within the group (SV_GroupThreadID)", [llvm_i32_ty], ReadNone,
[
- DXILOpParameter<0, "i32", "", "thread ID in group component">,
- DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DXILOpParameter<2, "i32", "component", "component to read (x,y,z)">
+ DXILOpParameter<0, llvm_i32_ty, "", "thread ID in group component">,
+ DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+ DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)">
]>,
LLVMIntrinsic<int_dx_thread_id_in_group>;
-def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory,
- "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [i32Ty], ReadNone,
+def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory,
+ "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [llvm_i32_ty], ReadNone,
[
- DXILOpParameter<0, "i32", "", "result">,
- DXILOpParameter<1, "i32", "opcode", "DXIL opcode">
+ DXILOpParameter<0, llvm_i32_ty, "", "result">,
+ DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">
]>,
LLVMIntrinsic<int_dx_flattened_thread_id_in_group>;
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index 76f99b4..2870f0b 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -33,6 +33,7 @@ add_llvm_target(HexagonCodeGen
HexagonFrameLowering.cpp
HexagonGenExtract.cpp
HexagonGenInsert.cpp
+ HexagonGenMemAbsolute.cpp
HexagonGenMux.cpp
HexagonGenPredicate.cpp
HexagonHardwareLoops.cpp
@@ -50,6 +51,7 @@ add_llvm_target(HexagonCodeGen
HexagonOptAddrMode.cpp
HexagonOptimizeSZextends.cpp
HexagonPeephole.cpp
+ HexagonPostIncOpt.cpp
HexagonRDFOpt.cpp
HexagonRegisterInfo.cpp
HexagonSelectionDAGInfo.cpp
@@ -60,6 +62,7 @@ add_llvm_target(HexagonCodeGen
HexagonTargetMachine.cpp
HexagonTargetObjectFile.cpp
HexagonTargetTransformInfo.cpp
+ HexagonTfrCleanup.cpp
HexagonVectorCombine.cpp
HexagonVectorLoopCarriedReuse.cpp
HexagonVectorPrint.cpp
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 6024d9f..3b8234c 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -1957,7 +1957,8 @@ bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) {
return false;
const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg);
RegHalf H;
- if (!matchHalf(0, RC, 0, H))
+ unsigned B = (RS.Sub == Hexagon::isub_hi) ? 32 : 0;
+ if (!matchHalf(0, RC, B, H))
return false;
if (H.Low)
return false;
diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
new file mode 100644
index 0000000..afd4963
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
@@ -0,0 +1,274 @@
+//===--- HexagonGenMemAbsolute.cpp - Generate Load/Store Set Absolute ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This pass traverses all the basic blocks in a function and converts an
+// indexed load/store with offset "0" to an absolute-set load/store
+// instruction, as long as the use of the register in the new instruction
+// dominates the rest of the uses and there are more than two uses, as
+// illustrated below.
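+//
+// Illustrative sketch (hypothetical registers, not taken from a real test):
+//
+//   r1 = ##some_address               r2 = memw(r1=##some_address)
+//   r2 = memw(r1+#0)          =>      r3 = memw(r1+#4)
+//   r3 = memw(r1+#4)                  r4 = memw(r1+#8)
+//   r4 = memw(r1+#8)
+//
+// The absolute-set form defines both the loaded value and the address
+// register, so the original transfer-immediate and load can be erased once
+// every remaining use of r1 is dominated by the new instruction.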
+
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "hexagon-abs"
+
+using namespace llvm;
+
+STATISTIC(HexagonNumLoadAbsConversions,
+ "Number of Load instructions converted to absolute-set form");
+STATISTIC(HexagonNumStoreAbsConversions,
+ "Number of Store instructions converted to absolute-set form");
+
+namespace llvm {
+FunctionPass *createHexagonGenMemAbsolute();
+void initializeHexagonGenMemAbsolutePass(PassRegistry &Registry);
+} // namespace llvm
+
+namespace {
+
+class HexagonGenMemAbsolute : public MachineFunctionPass {
+ const HexagonInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+ const TargetRegisterInfo *TRI;
+
+public:
+ static char ID;
+ HexagonGenMemAbsolute() : MachineFunctionPass(ID), TII(0), MRI(0), TRI(0) {
+ initializeHexagonGenMemAbsolutePass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Hexagon Generate Load/Store Set Absolute Address Instruction";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+private:
+ static bool isValidIndexedLoad(int &Opcode, int &NewOpcode);
+ static bool isValidIndexedStore(int &Opcode, int &NewOpcode);
+};
+} // namespace
+
+char HexagonGenMemAbsolute::ID = 0;
+
+INITIALIZE_PASS(HexagonGenMemAbsolute, "hexagon-gen-load-absolute",
+ "Hexagon Generate Load/Store Set Absolute Address Instruction",
+ false, false)
+
+bool HexagonGenMemAbsolute::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(Fn.getFunction()))
+ return false;
+
+ TII = Fn.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ MRI = &Fn.getRegInfo();
+ TRI = Fn.getRegInfo().getTargetRegisterInfo();
+
+ MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+
+ // Loop over all of the basic blocks
+ for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+ MBBb != MBBe; ++MBBb) {
+ MachineBasicBlock *MBB = &*MBBb;
+ // Traverse the basic block
+ for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
+ ++MII) {
+ MachineInstr *MI = &*MII;
+ int Opc = MI->getOpcode();
+ if (Opc != Hexagon::CONST32 && Opc != Hexagon::A2_tfrsi)
+ continue;
+
+ const MachineOperand &MO = MI->getOperand(0);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+
+ unsigned DstReg = MO.getReg();
+ if (MRI->use_nodbg_empty(DstReg))
+ continue;
+
+ typedef MachineRegisterInfo::use_nodbg_iterator use_iterator;
+ use_iterator NextUseMI = MRI->use_nodbg_begin(DstReg);
+
+ MachineInstr *NextMI = NextUseMI->getParent();
+ int NextOpc = NextMI->getOpcode();
+ int NewOpc;
+ bool IsLoad = isValidIndexedLoad(NextOpc, NewOpc);
+
+ if (!IsLoad && !isValidIndexedStore(NextOpc, NewOpc))
+ continue;
+
+ // Base and Offset positions for load and store instructions
+ // Load R(dest), R(base), Imm -> R(dest) = mem(R(base) + Imm)
+ // Store R(base), Imm, R(src) -> mem(R(base) + Imm) = R(src)
+ unsigned BaseRegPos, ImmPos, RegPos;
+ if (!TII->getBaseAndOffsetPosition(*NextMI, BaseRegPos, ImmPos))
+ continue;
+ RegPos = IsLoad ? 0 : 2;
+
+ bool IsGlobal = MI->getOperand(1).isGlobal();
+ if (!MI->getOperand(1).isImm() && !IsGlobal)
+ continue;
+
+ const MachineOperand *BaseOp = nullptr;
+ int64_t Offset;
+ bool Scalable;
+ TII->getMemOperandWithOffset(*NextMI, BaseOp, Offset, Scalable, TRI);
+
+ // Ensure BaseOp is non-null and register type.
+ if (!BaseOp || !BaseOp->isReg())
+ continue;
+
+ if (Scalable)
+ continue;
+
+ unsigned BaseReg = BaseOp->getReg();
+ if ((DstReg != BaseReg) || (Offset != 0))
+ continue;
+
+ const MachineOperand &MO0 = NextMI->getOperand(RegPos);
+
+ if (!MO0.isReg())
+ continue;
+
+ unsigned LoadStoreReg = MO0.getReg();
+
+ // Store: Bail out if the src and base are the same (def and use on the
+ // same register).
+ if (LoadStoreReg == BaseReg)
+ continue;
+
+ // Insert the absolute-set instruction "I" only if the use of the
+ // BaseReg in "I" dominates the rest of the uses of BaseReg and if
+ // there are more than 2 uses of this BaseReg.
+ bool Dominates = true;
+ unsigned Counter = 0;
+ for (use_iterator I = NextUseMI, E = MRI->use_nodbg_end(); I != E; ++I) {
+ Counter++;
+ if (!MDT.dominates(NextMI, I->getParent()))
+ Dominates = false;
+ }
+
+ if ((!Dominates) || (Counter < 3))
+ continue;
+
+ // If we reach here, we have met all the conditions required to replace
+ // this sequence with an absolute-set instruction.
+ LLVM_DEBUG({
+ dbgs() << "Found a pair of instructions for absolute-set "
+ << (IsLoad ? "load" : "store") << "\n";
+ dbgs() << *MI;
+ dbgs() << *NextMI;
+ });
+ MachineBasicBlock *ParentBlock = NextMI->getParent();
+ MachineInstrBuilder MIB;
+ if (IsLoad) { // Insert absolute-set load instruction
+ ++HexagonNumLoadAbsConversions;
+ MIB = BuildMI(*ParentBlock, NextMI, NextMI->getDebugLoc(),
+ TII->get(NewOpc), LoadStoreReg)
+ .addReg(DstReg, RegState::Define);
+ } else { // Insert absolute-set store instruction
+ ++HexagonNumStoreAbsConversions;
+ MIB = BuildMI(*ParentBlock, NextMI, NextMI->getDebugLoc(),
+ TII->get(NewOpc), DstReg);
+ }
+
+ MachineOperand ImmOperand = MI->getOperand(1);
+ if (IsGlobal)
+ MIB.addGlobalAddress(ImmOperand.getGlobal(), ImmOperand.getOffset(),
+ ImmOperand.getTargetFlags());
+ else
+ MIB.addImm(ImmOperand.getImm());
+
+ if (IsLoad)
+ MIB->getOperand(0).setSubReg(MO0.getSubReg());
+ else
+ MIB.addReg(LoadStoreReg, 0, MO0.getSubReg());
+
+ LLVM_DEBUG(dbgs() << "Replaced with " << *MIB << "\n");
+ // Erase the instructions that got replaced.
+ MII = MBB->erase(MI);
+ --MII;
+ NextMI->getParent()->erase(NextMI);
+ }
+ }
+
+ return true;
+}
+
+bool HexagonGenMemAbsolute::isValidIndexedLoad(int &Opc, int &NewOpc) {
+
+ bool Result = true;
+ switch (Opc) {
+ case Hexagon::L2_loadrb_io:
+ NewOpc = Hexagon::L4_loadrb_ap;
+ break;
+ case Hexagon::L2_loadrh_io:
+ NewOpc = Hexagon::L4_loadrh_ap;
+ break;
+ case Hexagon::L2_loadri_io:
+ NewOpc = Hexagon::L4_loadri_ap;
+ break;
+ case Hexagon::L2_loadrd_io:
+ NewOpc = Hexagon::L4_loadrd_ap;
+ break;
+ case Hexagon::L2_loadruh_io:
+ NewOpc = Hexagon::L4_loadruh_ap;
+ break;
+ case Hexagon::L2_loadrub_io:
+ NewOpc = Hexagon::L4_loadrub_ap;
+ break;
+ default:
+ Result = false;
+ }
+
+ return Result;
+}
+
+bool HexagonGenMemAbsolute::isValidIndexedStore(int &Opc, int &NewOpc) {
+
+ bool Result = true;
+ switch (Opc) {
+ case Hexagon::S2_storerd_io:
+ NewOpc = Hexagon::S4_storerd_ap;
+ break;
+ case Hexagon::S2_storeri_io:
+ NewOpc = Hexagon::S4_storeri_ap;
+ break;
+ case Hexagon::S2_storerh_io:
+ NewOpc = Hexagon::S4_storerh_ap;
+ break;
+ case Hexagon::S2_storerb_io:
+ NewOpc = Hexagon::S4_storerb_ap;
+ break;
+ default:
+ Result = false;
+ }
+
+ return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonGenMemAbsolute() {
+ return new HexagonGenMemAbsolute();
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 619c7dc..91cc930 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1655,6 +1655,13 @@ bool HexagonInstrInfo::isPostIncrement(const MachineInstr &MI) const {
return getAddrMode(MI) == HexagonII::PostInc;
}
+bool HexagonInstrInfo::isPostIncWithImmOffset(const MachineInstr &MI) const {
+ unsigned BasePos, OffsetPos;
+ if (!getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
+ return false;
+ return isPostIncrement(MI) && MI.getOperand(OffsetPos).isImm();
+}
+
// Returns true if an instruction is predicated irrespective of the predicate
// sense. For example, all of the following will return true.
// if (p0) R1 = add(R2, R3)
@@ -2436,6 +2443,55 @@ bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const {
Opcode == Hexagon::J2_loop1rext;
}
+bool HexagonInstrInfo::isCircBufferInstr(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case Hexagon::L2_loadalignb_pci:
+ case Hexagon::L2_loadalignb_pcr:
+ case Hexagon::L2_loadalignh_pci:
+ case Hexagon::L2_loadalignh_pcr:
+ case Hexagon::L2_loadbsw2_pci:
+ case Hexagon::L2_loadbsw2_pcr:
+ case Hexagon::L2_loadbsw4_pci:
+ case Hexagon::L2_loadbsw4_pcr:
+ case Hexagon::L2_loadbzw2_pci:
+ case Hexagon::L2_loadbzw2_pcr:
+ case Hexagon::L2_loadbzw4_pci:
+ case Hexagon::L2_loadbzw4_pcr:
+ case Hexagon::L2_loadrb_pci:
+ case Hexagon::L2_loadrb_pcr:
+ case Hexagon::L2_loadrd_pci:
+ case Hexagon::L2_loadrd_pcr:
+ case Hexagon::L2_loadrh_pci:
+ case Hexagon::L2_loadrh_pcr:
+ case Hexagon::L2_loadri_pci:
+ case Hexagon::L2_loadri_pcr:
+ case Hexagon::L2_loadrub_pci:
+ case Hexagon::L2_loadrub_pcr:
+ case Hexagon::L2_loadruh_pci:
+ case Hexagon::L2_loadruh_pcr:
+ case Hexagon::S2_storerbnew_pci:
+ case Hexagon::S2_storerbnew_pcr:
+ case Hexagon::S2_storerb_pci:
+ case Hexagon::S2_storerb_pcr:
+ case Hexagon::S2_storerd_pci:
+ case Hexagon::S2_storerd_pcr:
+ case Hexagon::S2_storerf_pci:
+ case Hexagon::S2_storerf_pcr:
+ case Hexagon::S2_storerhnew_pci:
+ case Hexagon::S2_storerhnew_pcr:
+ case Hexagon::S2_storerh_pci:
+ case Hexagon::S2_storerh_pcr:
+ case Hexagon::S2_storerinew_pci:
+ case Hexagon::S2_storerinew_pcr:
+ case Hexagon::S2_storeri_pci:
+ case Hexagon::S2_storeri_pcr:
+ return true;
+ }
+ return false;
+}
+
bool HexagonInstrInfo::isMemOp(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default: return false;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index e496995..65783c5 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -434,6 +434,8 @@ public:
bool predCanBeUsedAsDotNew(const MachineInstr &MI, Register PredReg) const;
bool PredOpcodeHasJMP_c(unsigned Opcode) const;
bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const;
+ bool isPostIncWithImmOffset(const MachineInstr &MI) const;
+ bool isCircBufferInstr(const MachineInstr &MI) const;
unsigned getAddrMode(const MachineInstr &MI) const;
MachineOperand *getBaseAndOffset(const MachineInstr &MI, int64_t &Offset,
diff --git a/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp
new file mode 100644
index 0000000..4c845f2
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp
@@ -0,0 +1,689 @@
+//===-- HexagonPostIncOpt.cpp - Hexagon Post Increment Optimization Pass --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Convert post-inc addressing mode into base-offset addressing mode.
+// Ex:
+// original loop:
+// v1 = phi(v0, v3)
+// v2,v3 = post_load v1, 4
+
+// Often, the unroller creates the following form of post-increments:
+// v1 = phi(v0, v3')
+// v2,v3 = post_load v1, 4
+// v2',v3'= post_load v3, 4
+
+// This can be optimized in two ways:
+
+// 1.
+// v1 = phi(v0, v3')
+// v2,v3' = post_load v1, 8
+// v2' = load v3', -4
+//
+// 2.
+// v1 = phi(v0, v3')
+// v2,v3' = post_load v1, 8
+// v2' = load v1, 4
+//
+// Option 2 is favored, as we can packetize two memory operations in a single
+// packet. However, this is not always favorable due to memory dependences
+// and in cases where we form a bigger chain of post-increment ops, which
+// will create more spills since we cannot execute post-increment ops
+// without executing the base-offset instructions.
+//===----------------------------------------------------------------------===//
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-postincopt"
+
+static cl::opt<unsigned> PostIncChainThreshold(
+ "post-inc-chain-threshold", cl::Hidden, cl::init(4),
+ cl::desc("Limit the number of post-inc instructions in a chain."));
+
+static cl::opt<bool> PreferPostIncStore(
+ "prefer-post-inc-store", cl::Hidden, cl::init(true),
+ cl::desc("Prefer post-inc store in a list of loads and stores."));
+
+namespace llvm {
+void initializeHexagonPostIncOptPass(PassRegistry &);
+FunctionPass *createHexagonPostIncOpt();
+} // namespace llvm
+
+namespace {
+
+class HexagonPostIncOpt : public MachineFunctionPass {
+ MachineLoopInfo *MLI = nullptr;
+ const HexagonInstrInfo *HII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+ const HexagonSubtarget *HST = nullptr;
+
+public:
+ static char ID;
+
+ HexagonPostIncOpt() : MachineFunctionPass(ID) {
+ initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return "Hexagon Post-Inc-Opt Pass"; }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+private:
+ bool translatePostIncsInLoop(MachineBasicBlock &MBB);
+ void replacePostIncWithBaseOffset(MachineBasicBlock &MBB) const;
+ void replacePostIncWithBaseOffset(MachineInstr &MI) const;
+ bool isPostIncInsn(MachineInstr &MI) const;
+ void foldAdds(MachineBasicBlock &MBB) const;
+ void updateBaseAndOffset(MachineInstr &MI, MachineInstr &AddMI) const;
+ void removeDeadInstructions(MachineBasicBlock &MBB) const;
+
+ void generatePostInc(MachineBasicBlock &MBB);
+ bool canReplaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const;
+ void replaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const;
+
+ bool isValidOffset(const MachineInstr &MI, int64_t Offset) const;
+ bool isValidPostIncValue(const MachineInstr &MI, int IncVal) const;
+};
+
+class HexagonPostIncOptSchedDAG : public ScheduleDAGInstrs {
+ HexagonPostIncOpt &Pass;
+
+public:
+ HexagonPostIncOptSchedDAG(HexagonPostIncOpt &P, MachineFunction &MF,
+ MachineLoopInfo *MLI)
+ : ScheduleDAGInstrs(MF, MLI, false), Pass(P){};
+ void schedule() override;
+ ScheduleDAGTopologicalSort &getTopo() { return Topo; };
+};
+
+} // End anonymous namespace.
+
+char HexagonPostIncOpt::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HexagonPostIncOpt, DEBUG_TYPE,
+ "Hexagon Post-Inc-Opt Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(HexagonPostIncOpt, DEBUG_TYPE, "Hexagon Post-Inc-Opt Pass",
+ false, false)
+
+/// Return true if MIA dominates MIB.
+static bool dominates(MachineInstr *MIA, MachineInstr *MIB) {
+ if (MIA->getParent() != MIB->getParent())
+ return false; // Don't know since machine dominator tree is out of date.
+
+ MachineBasicBlock *MBB = MIA->getParent();
+ MachineBasicBlock::iterator I = MBB->instr_begin();
+ // Iterate over the basic block until MIA or MIB is found.
+ for (; &*I != MIA && &*I != MIB; ++I)
+ ;
+
+ // MIA dominates MIB if MIA is found first.
+ return &*I == MIA;
+}
+
+// Return the Phi register value that comes from the loop block.
+static unsigned getLoopPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) {
+ for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2)
+ if (Phi->getOperand(i + 1).getMBB() == LoopBB)
+ return Phi->getOperand(i).getReg();
+ return UINT_MAX;
+}
+
+static bool isAddWithImmValue(const MachineInstr &MI) {
+ // FIXME: For now, only deal with adds that have strict immediate values.
+ // Some A2_addi instructions can be of the form:
+ // %338:intregs = A2_addi %7:intregs, @_ZL7phs_tbl + 16
+ return MI.getOpcode() == Hexagon::A2_addi && MI.getOperand(2).isImm();
+}
+
+// Compute the number of 'real' instructions in the basic block by
+// ignoring terminators.
+static unsigned getBasicBlockSize(MachineBasicBlock &MBB) {
+ unsigned size = 0;
+ for (auto &I : make_range(MBB.begin(), MBB.getFirstTerminator()))
+ if (!I.isDebugInstr())
+ size++;
+ return size;
+}
+
+// Set up the post-increment schedule DAG.
+static void initPISchedDAG(HexagonPostIncOptSchedDAG &PIDAG,
+ MachineBasicBlock &MBB) {
+ PIDAG.startBlock(&MBB);
+ PIDAG.enterRegion(&MBB, MBB.begin(), MBB.getFirstTerminator(),
+ getBasicBlockSize(MBB));
+ // Build the graph.
+ PIDAG.schedule();
+ // exitRegion() is an empty function in the base class, so it is safe to call here.
+ PIDAG.exitRegion();
+}
+
+// Check if the post-increment candidate has a memory dependence on any
+// instruction in the chain.
+static bool hasMemoryDependency(SUnit *PostIncSU,
+ SmallVector<MachineInstr *, 4> &UseList) {
+
+ // FIXME: Fine-tune the order dependence. We can probably consider only
+ // memory-related OrderKinds.
+ for (auto &Dep : PostIncSU->Succs)
+ if (Dep.getKind() == SDep::Order)
+ if (std::find(UseList.begin(), UseList.end(),
+ Dep.getSUnit()->getInstr()) != UseList.end())
+ return true;
+
+ return false;
+}
+
+// Fold an add with an immediate into either an add, a load, or a store.
+void HexagonPostIncOpt::foldAdds(MachineBasicBlock &MBB) const {
+ LLVM_DEBUG(dbgs() << "#Fold add instructions in this block.\n");
+ for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) {
+ if (!isAddWithImmValue(MI))
+ continue;
+ unsigned DefReg = MI.getOperand(0).getReg();
+ unsigned AddReg = MI.getOperand(1).getReg();
+ int64_t AddImm = MI.getOperand(2).getImm();
+
+ SmallVector<MachineInstr *, 4> UseList;
+ // Gather the uses of add instruction's def reg.
+ for (auto &MO : make_range(MRI->use_begin(DefReg), MRI->use_end())) {
+ MachineInstr *UseMI = MO.getParent();
+ // Only deal with instructions that belong to this block. If a use
+ // crosses this block, the post-increment generation logic will not be
+ // able to transform it to post-inc form due to dominance.
+ if (UseMI->getParent() == &MBB)
+ UseList.push_back(UseMI);
+ }
+
+ if (UseList.empty())
+ continue;
+
+ LLVM_DEBUG({
+ dbgs() << "Current instruction considered for folding \n";
+ MI.dump();
+ });
+
+ for (auto UseMI : UseList) {
+ if (isAddWithImmValue(*UseMI)) {
+ int64_t NewImm = AddImm + UseMI->getOperand(2).getImm();
+ // Fold if the new immediate is within range.
+ if (HII->isValidOffset(UseMI->getOpcode(), NewImm, TRI, false)) {
+ LLVM_DEBUG({
+ UseMI->dump();
+ dbgs() << "\t is folded in to \n";
+ });
+ UseMI->getOperand(1).setReg(AddReg);
+ UseMI->getOperand(2).setImm(NewImm);
+ LLVM_DEBUG(UseMI->dump());
+ }
+ } else if (HII->isBaseImmOffset(*UseMI)) {
+ LLVM_DEBUG({
+ UseMI->dump();
+ dbgs() << "\t is folded in to \n";
+ });
+ updateBaseAndOffset(*UseMI, MI);
+ LLVM_DEBUG(UseMI->dump());
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ }
+ removeDeadInstructions(MBB);
+ LLVM_DEBUG(dbgs() << "#End of the fold instructions logic.\n");
+}
+
+void HexagonPostIncOpt::updateBaseAndOffset(MachineInstr &MI,
+ MachineInstr &AddMI) const {
+ assert(HII->isBaseImmOffset(MI));
+ unsigned BasePos, OffsetPos;
+ if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
+ return;
+
+ MachineOperand &OffsetOp = MI.getOperand(OffsetPos);
+ MachineOperand &BaseOp = MI.getOperand(BasePos);
+
+ if (BaseOp.getReg() != AddMI.getOperand(0).getReg())
+ return;
+
+ unsigned IncBase = AddMI.getOperand(1).getReg();
+ int64_t IncValue = AddMI.getOperand(2).getImm();
+
+ int64_t NewOffset = OffsetOp.getImm() + IncValue;
+ if (!isValidOffset(MI, NewOffset))
+ return;
+
+ OffsetOp.setImm(NewOffset);
+ BaseOp.setReg(IncBase);
+}
+
+void HexagonPostIncOpt::removeDeadInstructions(MachineBasicBlock &MBB) const {
+ // For MBB, check that the value defined by each instruction is used.
+ // If not, delete it.
+ for (MachineBasicBlock::reverse_instr_iterator MI = MBB.instr_rbegin(),
+ ME = MBB.instr_rend();
+ MI != ME;) {
+ // From DeadMachineInstructionElim. Don't delete inline assembly.
+ if (MI->isInlineAsm()) {
+ ++MI;
+ continue;
+ }
+ bool SawStore = false;
+ // Check if it's safe to remove the instruction due to side effects.
+ if (!MI->isSafeToMove(nullptr, SawStore)) {
+ ++MI;
+ continue;
+ }
+ unsigned Uses = 0;
+ for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
+ MOE = MI->operands_end();
+ MOI != MOE; ++MOI) {
+ if (!MOI->isReg() || !MOI->isDef())
+ continue;
+ unsigned reg = MOI->getReg();
+ // Assume physical registers are used.
+ if (Register::isPhysicalRegister(reg)) {
+ Uses++;
+ continue;
+ }
+ if (MRI->use_begin(reg) != MRI->use_end())
+ Uses++;
+ }
+ if (!Uses) {
+ MI++->eraseFromParent();
+ continue;
+ }
+ ++MI;
+ }
+}
+
+bool HexagonPostIncOpt::isPostIncInsn(MachineInstr &MI) const {
+ // Predicated post-increments are not yet handled. (ISel is not generating
+ // them yet). Circular buffer instructions should not be handled.
+ return (HII->isPostIncWithImmOffset(MI) && !HII->isPredicated(MI) &&
+ !HII->isCircBufferInstr(MI));
+}
+
+/// For instructions with a base and offset, return true if the new Offset
+/// is a valid value with the correct alignment.
+bool HexagonPostIncOpt::isValidOffset(const MachineInstr &MI,
+ int64_t Offset) const {
+ if (!HII->isValidOffset(MI.getOpcode(), Offset, TRI, false))
+ return false;
+ unsigned AlignMask = HII->getMemAccessSize(MI) - 1;
+ return (Offset & AlignMask) == 0;
+}
+
+bool HexagonPostIncOpt::isValidPostIncValue(const MachineInstr &MI,
+ int IncVal) const {
+ unsigned AlignMask = HII->getMemAccessSize(MI) - 1;
+ if ((IncVal & AlignMask) != 0)
+ return false;
+
+ // Total number of bits in the instruction used to encode the Inc value.
+ unsigned IncBits = 4;
+ // For HVX instructions, only 3 bits are available.
+ if (HexagonII::isCVI(MI.getDesc()))
+ IncBits = 3;
+
+ IncBits += Log2_32(HII->getMemAccessSize(MI));
+ if (HII->getMemAccessSize(MI) > 8)
+ IncBits = 16;
+
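+ // Worked example (assuming a 4-byte scalar access on the non-HVX path):
+ // IncBits is 4 + Log2(4) = 6, so the encodable range computed below is
+ // [-32, 31], further restricted by the alignment mask above to multiples
+ // of 4.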
+ int MinValidVal = -1U << (IncBits - 1);
+ int MaxValidVal = ~(-1U << (IncBits - 1));
+ return (IncVal >= MinValidVal && IncVal <= MaxValidVal);
+}
+
+void HexagonPostIncOptSchedDAG::schedule() {
+ AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
+ buildSchedGraph(AA);
+}
+
+// Replace post-increment operations with their base+offset counterparts.
+void HexagonPostIncOpt::replacePostIncWithBaseOffset(
+ MachineBasicBlock &MBB) const {
+ LLVM_DEBUG(dbgs() << "#Replacing post-increment instructions with "
+ "base+offset counterparts.\n");
+
+ SmallVector<MachineInstr *, 4> MIList;
+ for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) {
+ // Check for eligible post-inc candidates.
+ if (!isPostIncInsn(MI))
+ continue;
+ MIList.push_back(&MI);
+ }
+
+ for (auto MI : MIList)
+ replacePostIncWithBaseOffset(*MI);
+
+ LLVM_DEBUG(dbgs() << "#Done with replacing post-increment instructions.\n");
+}
+
+void HexagonPostIncOpt::replacePostIncWithBaseOffset(MachineInstr &MI) const {
+ short NewOpcode = HII->changeAddrMode_pi_io(MI.getOpcode());
+ if (NewOpcode < 0)
+ return;
+
+ unsigned BasePos = 0, OffsetPos = 0;
+ if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
+ return;
+ const MachineOperand &PostIncOffset = MI.getOperand(OffsetPos);
+ const MachineOperand &PostIncBase = MI.getOperand(BasePos);
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand *PostIncDest;
+ MachineInstrBuilder MIB;
+ if (MI.mayLoad()) {
+ PostIncDest = &MI.getOperand(1);
+ const MachineOperand &LDValue = MI.getOperand(0);
+ MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode));
+ MIB.add(LDValue).add(PostIncBase).addImm(0);
+ } else {
+ PostIncDest = &MI.getOperand(0);
+ const MachineOperand &STValue = MI.getOperand(3);
+ MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode));
+ MIB.add(PostIncBase).addImm(0).add(STValue);
+ }
+
+ // Transfer memoperands.
+ MIB->cloneMemRefs(*MBB.getParent(), MI);
+
+ // Create an add instruction for the post-inc addition of offset.
+ MachineInstrBuilder MIBA = BuildMI(MBB, MI, DL, HII->get(Hexagon::A2_addi));
+ MIBA.add(*PostIncDest).add(PostIncBase).add(PostIncOffset);
+
+ LLVM_DEBUG({
+ dbgs() << "\n";
+ MI.dump();
+ dbgs() << "\tis tranformed to \n";
+ MIB->dump();
+ MIBA->dump();
+ dbgs() << "\n\n";
+ });
+
+ MI.eraseFromParent();
+}
+
+void HexagonPostIncOpt::generatePostInc(MachineBasicBlock &MBB) {
+ LLVM_DEBUG(dbgs() << "# Generate Post-inc and update uses if needed.\n");
+ MachineBasicBlock::iterator MII = MBB.getFirstNonPHI();
+ MachineBasicBlock::iterator MIE = MBB.instr_begin();
+ bool isOK = true;
+ while (MII != MIE) {
+ MachineInstr *Phi = &*std::prev(MII);
+ MII = std::prev(MII);
+ unsigned LoopVal = getLoopPhiReg(Phi, &MBB);
+ if (LoopVal == UINT_MAX)
+ continue;
+ MachineInstr *LoopInst = MRI->getVRegDef(LoopVal);
+ if (!isAddWithImmValue(*LoopInst))
+ continue;
+
+ if (LoopInst->getOpcode() != Hexagon::A2_addi)
+ continue;
+
+ unsigned AddReg = LoopInst->getOperand(1).getReg();
+ int64_t AddImm = LoopInst->getOperand(2).getImm();
+ SmallVector<MachineInstr *, 4> UseList;
+ MachineInstr *PostIncCandidate = nullptr;
+
+ // Find the probable candidates for the post-increment instruction.
+ SmallVector<MachineInstr *, 4> CandList;
+ for (auto &MO : make_range(MRI->use_begin(AddReg), MRI->use_end())) {
+ MachineInstr *UseMI = MO.getParent();
+
+ if (UseMI == LoopInst)
+ continue;
+
+ if (!dominates(UseMI, LoopInst)) {
+ isOK = false;
+ break;
+ }
+ const MachineOperand *BaseOp = nullptr;
+ int64_t Offset;
+ bool OffsetIsScalable;
+ if (!HII->isBaseImmOffset(*UseMI) ||
+ !HII->getMemOperandWithOffset(*UseMI, BaseOp, Offset,
+ OffsetIsScalable, TRI)) {
+ isOK = false;
+ break;
+ }
+ int64_t NewOffset = Offset - AddImm;
+ if (!isValidOffset(*UseMI, NewOffset) || !BaseOp->isReg() ||
+ BaseOp->getReg() != AddReg) {
+ isOK = false;
+ break;
+ }
+ if (OffsetIsScalable) {
+ isOK = false;
+ break;
+ }
+ if (Offset == 0) {
+ // If there are stores in the chain, make sure they are at the beginning
+ // of the list. E.g., LD, LD, ST, ST will end up as LD, LD, PostInc_ST,
+ // ST.
+ if (UseMI->mayStore() && PreferPostIncStore)
+ CandList.insert(CandList.begin(), UseMI);
+ else
+ CandList.push_back(UseMI);
+ continue;
+ }
+ UseList.push_back(UseMI);
+ }
+
+ if (!isOK)
+ continue;
+
+ for (auto MI : CandList) {
+ if (!PostIncCandidate)
+ PostIncCandidate = MI;
+ // Push the remaining candidates onto the use list to be updated.
+ else
+ UseList.push_back(MI);
+ }
+
+ // If a candidate is found, replace it with the post-inc instruction.
+ // Also, adjust offset for other uses as needed.
+ if (!PostIncCandidate || !canReplaceWithPostInc(PostIncCandidate, LoopInst))
+ continue;
+
+ // Logic to determine what the base register should be.
+ // There are two choices:
+ // 1. The new address register after we update the post-increment candidate.
+ // v2,v3 = post_load v1, 4
+ // v3 is the choice here.
+ // 2. The base register we used in the post-increment candidate.
+ // v2,v3 = post_load v1, 4
+ // v1 is the choice here.
+ // Use v3 if there is a memory dependence between the post-inc instruction
+ // and any other instruction in the chain.
+ // FIXME: We can do some complex DAG analysis based on height and depth and
+ // selectively update other instructions in the chain. Use v3 if there are
+ // more instructions in the chain; otherwise we will end up increasing the
+ // height of the DAG, resulting in more spills. By default we have a
+ // threshold controlled by the option "post-inc-chain-threshold", which is
+ // set to 4. v1 is preferred, as we can packetize two memory operations in
+ // a single packet on the scalar core, but it heavily depends on the
+ // structure of the DAG.
+ bool UpdateBaseToNew = false;
+
+ // Do not bother to build a DAG and analyze if the Use list is empty.
+ if (!UseList.empty()) {
+ MachineFunction *MF = MBB.getParent();
+ // Setup the Post-inc schedule DAG.
+ HexagonPostIncOptSchedDAG PIDAG(*this, *MF, MLI);
+ initPISchedDAG(PIDAG, MBB);
+ SUnit *SU = PIDAG.getSUnit(PostIncCandidate);
+ if (hasMemoryDependency(SU, UseList) ||
+ UseList.size() >= PostIncChainThreshold)
+ UpdateBaseToNew = true;
+ }
+
+ if (UpdateBaseToNew) {
+ LLVM_DEBUG(dbgs() << "The heuristic determines to update the uses of the "
+ "base register of post-increment\n");
+ for (auto UseMI : UseList) {
+ if (!dominates(PostIncCandidate, UseMI))
+ continue;
+ unsigned BasePos, OffsetPos;
+ if (HII->getBaseAndOffsetPosition(*UseMI, BasePos, OffsetPos)) {
+ // New offset has already been validated; no need to do it again.
+ LLVM_DEBUG({
+ UseMI->dump();
+ dbgs() << "\t is transformed to \n";
+ });
+ int64_t NewOffset = UseMI->getOperand(OffsetPos).getImm() - AddImm;
+ UseMI->getOperand(OffsetPos).setImm(NewOffset);
+ UseMI->getOperand(BasePos).setReg(LoopVal);
+ LLVM_DEBUG(UseMI->dump());
+ }
+ }
+ }
+ replaceWithPostInc(PostIncCandidate, LoopInst);
+ }
+ LLVM_DEBUG(dbgs() << "# End of generation of Post-inc.\n");
+}
+
+bool HexagonPostIncOpt::canReplaceWithPostInc(MachineInstr *MI,
+ MachineInstr *AddMI) const {
+ if (HII->changeAddrMode_io_pi(MI->getOpcode()) < 0)
+ return false;
+ assert(AddMI->getOpcode() == Hexagon::A2_addi);
+ return isValidPostIncValue(*MI, AddMI->getOperand(2).getImm());
+}
+
+void HexagonPostIncOpt::replaceWithPostInc(MachineInstr *MI,
+ MachineInstr *AddMI) const {
+ short NewOpcode = HII->changeAddrMode_io_pi(MI->getOpcode());
+ assert(NewOpcode >= 0 &&
+ "Couldn't change base offset to post-increment form");
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ const MachineOperand &IncDest = AddMI->getOperand(0);
+ const MachineOperand &IncBase = AddMI->getOperand(1);
+ const MachineOperand &IncValue = AddMI->getOperand(2);
+ MachineInstrBuilder MIB;
+ LLVM_DEBUG({
+ dbgs() << "\n\n";
+ MI->dump();
+ dbgs() << "\t is tranformed to post-inc form of \n";
+ });
+
+ if (MI->mayLoad()) {
+ const MachineOperand &LDValue = MI->getOperand(0);
+ MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode));
+ MIB.add(LDValue).add(IncDest).add(IncBase).add(IncValue);
+ } else {
+ const MachineOperand &STValue = MI->getOperand(2);
+ MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode));
+ MIB.add(IncDest).add(IncBase).add(IncValue).add(STValue);
+ }
+
+ // Transfer memoperands.
+ MIB->cloneMemRefs(*MBB.getParent(), *MI);
+
+ LLVM_DEBUG({
+ MIB->dump();
+ dbgs() << "As a result this add instruction is erased.\n";
+ AddMI->dump();
+ });
+
+ MI->eraseFromParent();
+ AddMI->eraseFromParent();
+}
+
+bool HexagonPostIncOpt::translatePostIncsInLoop(MachineBasicBlock &MBB) {
+ // Algorithm:
+ // 1. Replace all the post-inc instructions in this block with a base+offset
+ // instruction and an add instruction.
+ // 2. Fold all the adds into their respective uses.
+ // 3. Generate post-increment instructions and update the uses of the base
+ // register if needed based on constraints (illustrated below).
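+ //
+ // Illustrative sketch on a single post-increment (hypothetical virtual
+ // registers):
+ // v2,v3 = post_load v1, #4 -(1)-> v2 = load v1, #0
+ // v3 = add v1, #4
+ // (2) then folds the add into users of v3 whose new offsets remain legal,
+ // and (3) turns one eligible base+offset access back into post-increment
+ // form, rewriting the remaining users against the chosen base register.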
+
+ replacePostIncWithBaseOffset(MBB);
+ foldAdds(MBB);
+ generatePostInc(MBB);
+ return true;
+}
+
+bool HexagonPostIncOpt::runOnMachineFunction(MachineFunction &MF) {
+
+ // Skip pass if requested.
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ // Get Target Information.
+ MLI = &getAnalysis<MachineLoopInfo>();
+ HST = &MF.getSubtarget<HexagonSubtarget>();
+ TRI = HST->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ HII = HST->getInstrInfo();
+
+ // Skip this pass for TinyCore.
+ // Tiny core allows partial post-increment operations; this constraint can
+ // be imposed inside the pass. In a chain of post-increments, the first can
+ // remain a post-increment and the rest can be adjusted to base+offset
+ // (these are inexpensive in most cases).
+ if (HST->isTinyCore())
+ return false;
+
+ LLVM_DEBUG({
+ dbgs() << "Begin: Hexagon Post-Inc-Opt Pass.\n";
+ dbgs() << "Function: " << MF.getName() << "\n";
+ });
+ bool Change = false;
+ std::vector<MachineBasicBlock *> MLBB;
+ for (auto &BB : MF) {
+ // Check if this Basic Block belongs to any loop.
+ auto *LI = MLI->getLoopFor(&BB);
+ // We only deal with innermost loops that have a single block.
+ if (LI && LI->getBlocks().size() == 1) {
+ MachineBasicBlock *MBB = LI->getHeader();
+ // Do not traverse blocks that are already visited.
+ if (std::find(MLBB.begin(), MLBB.end(), MBB) != MLBB.end())
+ continue;
+
+ MLBB.push_back(MBB);
+
+ LLVM_DEBUG(dbgs() << "\n\t Basic Block: " << MBB->getName() << "\n");
+ Change |= translatePostIncsInLoop(*MBB);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "End: Hexagon Post-Inc-Opt Pass\n");
+ return Change;
+}
+
+FunctionPass *llvm::createHexagonPostIncOpt() {
+ return new HexagonPostIncOpt();
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 7d4b420..a5ebd64 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -65,6 +65,10 @@ static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets",
cl::init(true), cl::Hidden,
cl::desc("Early expansion of MUX"));
+static cl::opt<bool> EnableTfrCleanup("hexagon-tfr-cleanup", cl::init(true),
+ cl::Hidden,
+ cl::desc("Cleanup of TFRs/COPYs"));
+
static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden,
cl::desc("Enable early if-conversion"));
@@ -92,6 +96,10 @@ static cl::opt<bool>
static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden,
cl::desc("Disable splitting double registers"));
+static cl::opt<bool>
+ EnableGenMemAbs("hexagon-mem-abs", cl::init(true), cl::Hidden,
+ cl::desc("Generate absolute set instructions"));
+
static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true),
cl::Hidden, cl::desc("Bit simplification"));
@@ -121,6 +129,10 @@ static cl::opt<bool> EnableInstSimplify("hexagon-instsimplify", cl::Hidden,
cl::init(true),
cl::desc("Enable instsimplify"));
+static cl::opt<bool> DisableHexagonPostIncOpt(
+ "hexagon-postinc-opt", cl::Hidden,
+ cl::desc("Disable Hexagon post-increment optimization"));
+
/// HexagonTargetMachineModule - Note that this is used on hosts that
/// cannot link in a library unless there are references into the
/// library. In particular, it seems that it is not possible to get
@@ -145,20 +157,24 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
namespace llvm {
extern char &HexagonExpandCondsetsID;
+ extern char &HexagonTfrCleanupID;
void initializeHexagonBitSimplifyPass(PassRegistry&);
void initializeHexagonConstExtendersPass(PassRegistry&);
void initializeHexagonConstPropagationPass(PassRegistry&);
void initializeHexagonCopyToCombinePass(PassRegistry&);
void initializeHexagonEarlyIfConversionPass(PassRegistry&);
void initializeHexagonExpandCondsetsPass(PassRegistry&);
+ void initializeHexagonGenMemAbsolutePass(PassRegistry &);
void initializeHexagonGenMuxPass(PassRegistry&);
void initializeHexagonHardwareLoopsPass(PassRegistry&);
void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
void initializeHexagonNewValueJumpPass(PassRegistry&);
void initializeHexagonOptAddrModePass(PassRegistry&);
void initializeHexagonPacketizerPass(PassRegistry&);
+ void initializeHexagonPostIncOptPass(PassRegistry &);
void initializeHexagonRDFOptPass(PassRegistry&);
void initializeHexagonSplitDoubleRegsPass(PassRegistry&);
+ void initializeHexagonTfrCleanupPass(PassRegistry &);
void initializeHexagonVExtractPass(PassRegistry &);
void initializeHexagonVectorCombineLegacyPass(PassRegistry&);
void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &);
@@ -177,6 +193,7 @@ namespace llvm {
FunctionPass *createHexagonFixupHwLoops();
FunctionPass *createHexagonGenExtract();
FunctionPass *createHexagonGenInsert();
+ FunctionPass *createHexagonGenMemAbsolute();
FunctionPass *createHexagonGenMux();
FunctionPass *createHexagonGenPredicate();
FunctionPass *createHexagonHardwareLoops();
@@ -188,10 +205,12 @@ namespace llvm {
FunctionPass *createHexagonOptimizeSZextends();
FunctionPass *createHexagonPacketizer(bool Minimal);
FunctionPass *createHexagonPeephole();
+ FunctionPass *createHexagonPostIncOpt();
FunctionPass *createHexagonRDFOpt();
FunctionPass *createHexagonSplitConst32AndConst64();
FunctionPass *createHexagonSplitDoubleRegs();
FunctionPass *createHexagonStoreWidening();
+ FunctionPass *createHexagonTfrCleanup();
FunctionPass *createHexagonVectorCombineLegacyPass();
FunctionPass *createHexagonVectorPrint();
FunctionPass *createHexagonVExtract();
@@ -211,12 +230,14 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() {
initializeHexagonConstPropagationPass(PR);
initializeHexagonCopyToCombinePass(PR);
initializeHexagonEarlyIfConversionPass(PR);
+ initializeHexagonGenMemAbsolutePass(PR);
initializeHexagonGenMuxPass(PR);
initializeHexagonHardwareLoopsPass(PR);
initializeHexagonLoopIdiomRecognizeLegacyPassPass(PR);
initializeHexagonNewValueJumpPass(PR);
initializeHexagonOptAddrModePass(PR);
initializeHexagonPacketizerPass(PR);
+ initializeHexagonPostIncOptPass(PR);
initializeHexagonRDFOptPass(PR);
initializeHexagonSplitDoubleRegsPass(PR);
initializeHexagonVectorCombineLegacyPass(PR);
@@ -244,6 +265,8 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
(HexagonNoOpt ? CodeGenOptLevel::None : OL)),
TLOF(std::make_unique<HexagonTargetObjectFile>()) {
initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
+ initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry());
+ initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry());
initAsmInfo();
}
@@ -411,11 +434,20 @@ void HexagonPassConfig::addPreRegAlloc() {
addPass(createHexagonConstExtenders());
if (EnableExpandCondsets)
insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID);
+ if (EnableTfrCleanup)
+ insertPass(&VirtRegRewriterID, &HexagonTfrCleanupID);
if (!DisableStoreWidening)
addPass(createHexagonStoreWidening());
+ if (EnableGenMemAbs)
+ addPass(createHexagonGenMemAbsolute());
if (!DisableHardwareLoops)
addPass(createHexagonHardwareLoops());
}
+
+ if (TM->getOptLevel() >= CodeGenOptLevel::Aggressive)
+ if (!DisableHexagonPostIncOpt)
+ addPass(createHexagonPostIncOpt());
+
if (TM->getOptLevel() >= CodeGenOptLevel::Default)
addPass(&MachinePipelinerID);
}
diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
new file mode 100644
index 0000000..a4b359a
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
@@ -0,0 +1,324 @@
+//===------- HexagonTfrCleanup.cpp - Hexagon Transfer Cleanup Pass -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This pass addresses a situation that appears every now and then after
+// register allocation, namely a register copy from a source that was defined
+// as an immediate value in the same block (usually just before the copy).
+//
+// Here is an example of actual code emitted that shows this problem:
+//
+// .LBB0_5:
+// {
+// r5 = zxtb(r8)
+// r6 = or(r6, ##12345)
+// }
+// {
+// r3 = xor(r1, r2)
+// r1 = #0 <-- r1 set to #0
+// }
+// {
+// r7 = r1 <-- r7 set to r1
+// r0 = zxtb(r3)
+// }
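+//
+// After this pass the copy is rewritten into a transfer-immediate
+// (illustrative continuation of the example above):
+//
+// {
+// r7 = #0 <-- copy replaced with an immediate transfer
+// r0 = zxtb(r3)
+// }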
+
+#define DEBUG_TYPE "tfr-cleanup"
+#include "HexagonTargetMachine.h"
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace llvm {
+FunctionPass *createHexagonTfrCleanup();
+void initializeHexagonTfrCleanupPass(PassRegistry &);
+} // namespace llvm
+
+namespace {
+class HexagonTfrCleanup : public MachineFunctionPass {
+public:
+ static char ID;
+ HexagonTfrCleanup() : MachineFunctionPass(ID), HII(0), TRI(0) {
+ PassRegistry &R = *PassRegistry::getPassRegistry();
+ initializeHexagonTfrCleanupPass(R);
+ }
+ StringRef getPassName() const override { return "Hexagon TFR Cleanup"; }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ const HexagonInstrInfo *HII;
+ const TargetRegisterInfo *TRI;
+
+ typedef DenseMap<unsigned, uint64_t> ImmediateMap;
+
+ bool isIntReg(unsigned Reg, bool &Is32);
+ void setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap);
+ bool getReg(unsigned Reg, uint64_t &Val, ImmediateMap &IMap);
+ bool updateImmMap(MachineInstr *MI, ImmediateMap &IMap);
+ bool rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap, SlotIndexes *Indexes);
+ bool eraseIfRedundant(MachineInstr *MI, SlotIndexes *Indexes);
+};
+} // namespace
+
+char HexagonTfrCleanup::ID = 0;
+
+namespace llvm {
+char &HexagonTfrCleanupID = HexagonTfrCleanup::ID;
+}
+
+bool HexagonTfrCleanup::isIntReg(unsigned Reg, bool &Is32) {
+ Is32 = Hexagon::IntRegsRegClass.contains(Reg);
+ return Is32 || Hexagon::DoubleRegsRegClass.contains(Reg);
+}
+
+// Assign the given value V32 to the specified register R32 in the map. Only
+// 32-bit registers are valid arguments.
+void HexagonTfrCleanup::setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap) {
+ ImmediateMap::iterator F = IMap.find(R32);
+ if (F == IMap.end())
+ IMap.insert(std::make_pair(R32, V32));
+ else
+ F->second = V32;
+}
+
+// Retrieve the value of the provided register Reg and store it into Val.
+// Return "true" if a value was found, "false" otherwise.
+bool HexagonTfrCleanup::getReg(unsigned Reg, uint64_t &Val,
+ ImmediateMap &IMap) {
+ bool Is32;
+ if (!isIntReg(Reg, Is32))
+ return false;
+
+ if (Is32) {
+ ImmediateMap::iterator F = IMap.find(Reg);
+ if (F == IMap.end())
+ return false;
+ Val = F->second;
+ return true;
+ }
+
+ // For 64-bit registers, compose the value from the values of its
+ // subregisters.
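+ // For example (hypothetical values), with isub_lo == 0x00000001 and
+ // isub_hi == 0x00000002 the composed value is 0x0000000200000001.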
+ unsigned SubL = TRI->getSubReg(Reg, Hexagon::isub_lo);
+ unsigned SubH = TRI->getSubReg(Reg, Hexagon::isub_hi);
+ ImmediateMap::iterator FL = IMap.find(SubL), FH = IMap.find(SubH);
+ if (FL == IMap.end() || FH == IMap.end())
+ return false;
+ Val = (FH->second << 32) | FL->second;
+ return true;
+}
+
+// Process an instruction and record the relevant information in the
+// immediate map.
+bool HexagonTfrCleanup::updateImmMap(MachineInstr *MI, ImmediateMap &IMap) {
+ using namespace Hexagon;
+
+ if (MI->isCall()) {
+ IMap.clear();
+ return true;
+ }
+
+ // If this is an instruction that loads a constant into a register,
+ // record this information in IMap.
+ unsigned Opc = MI->getOpcode();
+ if (Opc == A2_tfrsi || Opc == A2_tfrpi) {
+ unsigned DefR = MI->getOperand(0).getReg();
+ bool Is32;
+ if (!isIntReg(DefR, Is32))
+ return false;
+ if (!MI->getOperand(1).isImm()) {
+ if (!Is32) {
+ IMap.erase(TRI->getSubReg(DefR, isub_lo));
+ IMap.erase(TRI->getSubReg(DefR, isub_hi));
+ } else {
+ IMap.erase(DefR);
+ }
+ return false;
+ }
+ uint64_t Val = MI->getOperand(1).getImm();
+ // If it's a 64-bit register, break it up into subregisters.
+ if (!Is32) {
+ uint32_t VH = (Val >> 32), VL = (Val & 0xFFFFFFFFU);
+ setReg(TRI->getSubReg(DefR, isub_lo), VL, IMap);
+ setReg(TRI->getSubReg(DefR, isub_hi), VH, IMap);
+ } else {
+ setReg(DefR, Val, IMap);
+ }
+ return true;
+ }
+
+ // Not an A2_tfr[sp]i. Invalidate all modified registers in IMap.
+ for (MachineInstr::mop_iterator Mo = MI->operands_begin(),
+ E = MI->operands_end();
+ Mo != E; ++Mo) {
+ if (Mo->isRegMask()) {
+ IMap.clear();
+ return true;
+ }
+ if (!Mo->isReg() || !Mo->isDef())
+ continue;
+ unsigned R = Mo->getReg();
+ for (MCRegAliasIterator AR(R, TRI, true); AR.isValid(); ++AR) {
+ ImmediateMap::iterator F = IMap.find(*AR);
+ if (F != IMap.end())
+ IMap.erase(F);
+ }
+ }
+ return true;
+}
+
+// Rewrite the instruction as A2_tfrsi/A2_tfrpi if it is a copy of a source
+// that has a known constant value.
+bool HexagonTfrCleanup::rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap,
+ SlotIndexes *Indexes) {
+ using namespace Hexagon;
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case A2_tfr:
+ case A2_tfrp:
+ case COPY:
+ break;
+ default:
+ return false;
+ }
+
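+  // The copy must be between integer registers of the same width, and the
+  // source register must have a known constant value.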
+ unsigned DstR = MI->getOperand(0).getReg();
+ unsigned SrcR = MI->getOperand(1).getReg();
+ bool Tmp, Is32;
+ if (!isIntReg(DstR, Is32) || !isIntReg(SrcR, Tmp))
+ return false;
+ assert(Tmp == Is32 && "Register size mismatch");
+ uint64_t Val;
+ bool Found = getReg(SrcR, Val, IMap);
+ if (!Found)
+ return false;
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ int64_t SVal = Is32 ? int32_t(Val) : Val;
+ auto &HST = B.getParent()->getSubtarget<HexagonSubtarget>();
+ MachineInstr *NewMI;
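+  // Materialize the constant with the cheapest form available: A2_tfrsi for
+  // 32-bit registers, A2_tfrpi or A2_combineii for small 64-bit values, and
+  // CONST64 otherwise.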
+ if (Is32)
+ NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrsi), DstR).addImm(SVal);
+ else if (isInt<8>(SVal))
+ NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrpi), DstR).addImm(SVal);
+ else if (isInt<8>(SVal >> 32) && isInt<8>(int32_t(Val & 0xFFFFFFFFLL)))
+ NewMI = BuildMI(B, MI, DL, HII->get(A2_combineii), DstR)
+ .addImm(int32_t(SVal >> 32))
+ .addImm(int32_t(Val & 0xFFFFFFFFLL));
+ else if (HST.isTinyCore())
+    // Disable generating CONST64 since it requires a load resource.
+ return false;
+ else
+ NewMI = BuildMI(B, MI, DL, HII->get(CONST64), DstR).addImm(Val);
+
+  // Replace MI in the slot index maps so NewMI reuses the same slot index.
+ if (Indexes)
+ Indexes->replaceMachineInstrInMaps(*MI, *NewMI);
+ MI->eraseFromParent();
+ return true;
+}
+
+// Remove the instruction if it is a self-assignment.
+bool HexagonTfrCleanup::eraseIfRedundant(MachineInstr *MI,
+ SlotIndexes *Indexes) {
+ unsigned Opc = MI->getOpcode();
+ unsigned DefR, SrcR;
+ bool IsUndef = false;
+ switch (Opc) {
+ case Hexagon::A2_tfr:
+ // Rd = Rd
+ DefR = MI->getOperand(0).getReg();
+ SrcR = MI->getOperand(1).getReg();
+ IsUndef = MI->getOperand(1).isUndef();
+ break;
+ case Hexagon::A2_tfrt:
+ case Hexagon::A2_tfrf:
+ // if ([!]Pu) Rd = Rd
+ DefR = MI->getOperand(0).getReg();
+ SrcR = MI->getOperand(2).getReg();
+ IsUndef = MI->getOperand(2).isUndef();
+ break;
+ default:
+ return false;
+ }
+ if (DefR != SrcR)
+ return false;
+ if (IsUndef) {
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ auto DefI = BuildMI(B, MI, DL, HII->get(TargetOpcode::IMPLICIT_DEF), DefR);
+ for (auto &Op : MI->operands())
+ if (Op.isReg() && Op.isDef() && Op.isImplicit())
+ DefI->addOperand(Op);
+ }
+
+ if (Indexes)
+ Indexes->removeMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ return true;
+}
+
+bool HexagonTfrCleanup::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ // Map: 32-bit register -> immediate value.
+ // 64-bit registers are stored through their subregisters.
+ ImmediateMap IMap;
+ SlotIndexes *Indexes = this->getAnalysisIfAvailable<SlotIndexes>();
+
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ HII = HST.getInstrInfo();
+ TRI = HST.getRegisterInfo();
+
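+  // Walk each block, erasing redundant transfers, rewriting copies of known
+  // constants, and updating the immediate map as we go.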
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ MachineBasicBlock &B = *I;
+ MachineBasicBlock::iterator J, F, NextJ;
+ IMap.clear();
+ bool Inserted = false, Erased = false;
+ for (J = B.begin(), F = B.end(); J != F; J = NextJ) {
+ NextJ = std::next(J);
+ MachineInstr *MI = &*J;
+ bool E = eraseIfRedundant(MI, Indexes);
+ Erased |= E;
+ if (E)
+ continue;
+ Inserted |= rewriteIfImm(MI, IMap, Indexes);
+ MachineBasicBlock::iterator NewJ = std::prev(NextJ);
+ updateImmMap(&*NewJ, IMap);
+ }
+ bool BlockC = Inserted | Erased;
+ Changed |= BlockC;
+ if (BlockC && Indexes)
+ Indexes->repairIndexesInRange(&B, B.begin(), B.end());
+ }
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+INITIALIZE_PASS(HexagonTfrCleanup, "tfr-cleanup", "Hexagon TFR Cleanup", false,
+ false)
+
+FunctionPass *llvm::createHexagonTfrCleanup() {
+ return new HexagonTfrCleanup();
+}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index ca98269..9840412 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -18,6 +18,7 @@
#include "HexagonDepITypes.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/MC/MCInstrDesc.h"
namespace llvm {
@@ -48,7 +49,7 @@ namespace HexagonII {
// MCInstrDesc TSFlags
// *** Must match HexagonInstrFormat*.td ***
- enum {
+ enum HexagonTSFlagsVal {
// This 7-bit field describes the insn type.
TypePos = 0,
TypeMask = 0x7f,
@@ -173,6 +174,11 @@ namespace HexagonII {
hasUnaryRestrictionMask = 0x1,
};
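+  // Extract the field described by (Pos, Mask) from an instruction's TSFlags.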
+ inline unsigned getTSFlags(const MCInstrDesc &MID, HexagonTSFlagsVal Pos,
+ unsigned Mask) {
+ return (MID.TSFlags >> Pos) & Mask;
+ }
+
// *** The code above must match HexagonInstrFormat*.td *** //
// Hexagon specific MO operand flag mask.
@@ -275,6 +281,10 @@ namespace HexagonII {
INST_ICLASS_ALU32_3 = 0xf0000000
};
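+  // Return true if the TSFlags mark the instruction as a CVI (HVX)
+  // instruction.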
+ inline bool isCVI(const MCInstrDesc &MID) {
+ return getTSFlags(MID, isCVIPos, isCVIMask) != 0;
+ }
+
LLVM_ATTRIBUTE_UNUSED
static unsigned getMemAccessSizeInBytes(MemAccessSize S) {
switch (S) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ded2f25..3ff8994 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2135,6 +2135,21 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
+ if (Opcode == NVPTX::StoreRetvalI8) {
+    // Fine-tune the opcode depending on the size of the operand. This helps
+    // avoid creating redundant COPY instructions in
+    // InstrEmitter::AddRegisterOperand().
+ switch (Ops[0].getSimpleValueType().SimpleTy) {
+ default:
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreRetvalI8TruncI32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::StoreRetvalI8TruncI64;
+ break;
+ }
+ }
break;
case 2:
Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
@@ -2211,6 +2226,21 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
NVPTX::StoreParamI8, NVPTX::StoreParamI16,
NVPTX::StoreParamI32, NVPTX::StoreParamI64,
NVPTX::StoreParamF32, NVPTX::StoreParamF64);
+ if (Opcode == NVPTX::StoreParamI8) {
+    // Fine-tune the opcode depending on the size of the operand. This helps
+    // avoid creating redundant COPY instructions in
+    // InstrEmitter::AddRegisterOperand().
+ switch (Ops[0].getSimpleValueType().SimpleTy) {
+ default:
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreParamI8TruncI32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::StoreParamI8TruncI64;
+ break;
+ }
+ }
break;
case 2:
Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7d2fe78..66a1010 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -47,6 +47,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
@@ -59,6 +60,7 @@
#include <cmath>
#include <cstdint>
#include <iterator>
+#include <optional>
#include <sstream>
#include <string>
#include <utility>
@@ -1529,6 +1531,105 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
return DL.getABITypeAlign(Ty);
}
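+// If ElementType is a floating-point (or packed floating-point) type, rewrite
+// it to the integer type of the same width so the byte-wise helpers below can
+// operate on it with plain bit manipulation. Returns true if the type changed.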
+static bool adjustElementType(EVT &ElementType) {
+ switch (ElementType.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::f16:
+ case MVT::bf16:
+ ElementType = MVT::i16;
+ return true;
+ case MVT::f32:
+ case MVT::v2f16:
+ case MVT::v2bf16:
+ ElementType = MVT::i32;
+ return true;
+ case MVT::f64:
+ ElementType = MVT::i64;
+ return true;
+ }
+}
+
+// Use byte-store when the param address of the argument value is unaligned.
+// This may happen when the passed value is a field of a packed structure.
+//
+// This is called in LowerCall() when passing the param values.
+static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
+ uint64_t Offset, EVT ElementType,
+ SDValue StVal, SDValue &InGlue,
+ unsigned ArgID, const SDLoc &dl) {
+ // Bit logic only works on integer types
+ if (adjustElementType(ElementType))
+ StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
+
+ // Store each byte
+ SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+    // Shift byte i down into the least-significant byte position.
+ SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
+ DAG.getConstant(i * 8, dl, MVT::i32));
+ SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
+ DAG.getConstant(Offset + i, dl, MVT::i32),
+ ShiftVal, InGlue};
+    // Truncating store of only the least-significant byte using st.param.b8.
+    // The register type can be larger than b8.
+ Chain = DAG.getMemIntrinsicNode(
+ NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
+ MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
+ InGlue = Chain.getValue(1);
+ }
+ return Chain;
+}
+
+// Use byte-load when the param address of the returned value is unaligned.
+// This may happen when the returned value is a field of a packed structure.
+static SDValue
+LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
+ EVT ElementType, SDValue &InGlue,
+ SmallVectorImpl<SDValue> &TempProxyRegOps,
+ const SDLoc &dl) {
+ // Bit logic only works on integer types
+ EVT MergedType = ElementType;
+ adjustElementType(MergedType);
+
+  // Load each byte and construct the whole value, starting from 0.
+ SDValue RetVal = DAG.getConstant(0, dl, MergedType);
+ // LoadParamMemI8 loads into i16 register only
+ SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
+ for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+ SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(Offset + i, dl, MVT::i32),
+ InGlue};
+ // This will be selected to LoadParamMemI8
+ SDValue LdVal =
+ DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
+ MVT::i8, MachinePointerInfo(), Align(1));
+ SDValue TmpLdVal = LdVal.getValue(0);
+ Chain = LdVal.getValue(1);
+ InGlue = LdVal.getValue(2);
+
+ TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
+ TmpLdVal.getSimpleValueType(), TmpLdVal);
+ TempProxyRegOps.push_back(TmpLdVal);
+
+ SDValue CMask = DAG.getConstant(255, dl, MergedType);
+ SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
+ // Need to extend the i16 register to the whole width.
+ TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
+      // Mask off the high bits; leave only the lower 8 bits, since we are
+      // using loadparam.b8.
+ TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
+ // Shift and merge
+ TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
+ RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
+ }
+ if (ElementType != MergedType)
+ RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
+
+ return RetVal;
+}
+
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
@@ -1680,17 +1781,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (NeedAlign)
PartAlign = commonAlignment(ArgAlign, CurOffset);
- // New store.
- if (VectorInfo[j] & PVF_FIRST) {
- assert(StoreOperands.empty() && "Unfinished preceding store.");
- StoreOperands.push_back(Chain);
- StoreOperands.push_back(
- DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
- StoreOperands.push_back(DAG.getConstant(
- IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
- dl, MVT::i32));
- }
-
SDValue StVal = OutVals[OIdx];
MVT PromotedVT;
@@ -1723,6 +1813,35 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
}
+ // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
+ // scalar store. In such cases, fall back to byte stores.
+ if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
+ PartAlign.value() <
+ DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
+      assert(StoreOperands.empty() && "Unfinished preceding store.");
+ Chain = LowerUnalignedStoreParam(
+ DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
+ StVal, InGlue, ParamCount, dl);
+
+ // LowerUnalignedStoreParam took care of inserting the necessary nodes
+ // into the SDAG, so just move on to the next element.
+ if (!IsByVal)
+ ++OIdx;
+ continue;
+ }
+
+ // New store.
+ if (VectorInfo[j] & PVF_FIRST) {
+ assert(StoreOperands.empty() && "Unfinished preceding store.");
+ StoreOperands.push_back(Chain);
+ StoreOperands.push_back(
+ DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
+
+ StoreOperands.push_back(DAG.getConstant(
+ IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
+ dl, MVT::i32));
+ }
+
// Record the value to store.
StoreOperands.push_back(StVal);
@@ -1923,6 +2042,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 16> ProxyRegOps;
SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
+  // An entry of this vector is set if the corresponding element does not need
+  // a ProxyReg operation and should be added to InVals as is. ProxyRegOps and
+  // ProxyRegTruncates contain empty/none entries at the same index.
+ SmallVector<SDValue, 16> RetElts;
+  // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
+  // to consume the values of the `LoadParam`s; they are replaced once
+  // `CALLSEQ_END` has been added.
+ SmallVector<SDValue, 16> TempProxyRegOps;
// Generate loads from param memory/moves from registers for result
if (Ins.size() > 0) {
@@ -1966,6 +2093,22 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
EltType = MVT::i16;
}
+ // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
+ // scalar load. In such cases, fall back to byte loads.
+ if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
+ EltAlign < DL.getABITypeAlign(
+ TheLoadType.getTypeForEVT(*DAG.getContext()))) {
+ assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
+ SDValue Ret = LowerUnalignedLoadRetParam(
+ DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
+ ProxyRegOps.push_back(SDValue());
+ ProxyRegTruncates.push_back(std::optional<MVT>());
+ RetElts.resize(i);
+ RetElts.push_back(Ret);
+
+ continue;
+ }
+
// Record index of the very first element of the vector.
if (VectorInfo[i] & PVF_FIRST) {
assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
@@ -2028,6 +2171,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// will not get lost. Otherwise, during libcalls expansion, the nodes can become
// dangling.
for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
+ if (i < RetElts.size() && RetElts[i]) {
+ InVals.push_back(RetElts[i]);
+ continue;
+ }
+
SDValue Ret = DAG.getNode(
NVPTXISD::ProxyReg, dl,
DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
@@ -2044,6 +2192,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InVals.push_back(Ret);
}
+ for (SDValue &T : TempProxyRegOps) {
+ SDValue Repl = DAG.getNode(
+ NVPTXISD::ProxyReg, dl,
+ DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
+ {Chain, T.getOperand(0), InGlue});
+ DAG.ReplaceAllUsesWith(T, Repl);
+ DAG.RemoveDeadNode(T.getNode());
+
+ Chain = Repl.getValue(1);
+ InGlue = Repl.getValue(2);
+ }
+
// set isTailCall to false for now, until we figure out how to express
// tail call optimization in PTX
isTailCall = false;
@@ -3045,9 +3205,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
Value *srcValue = Constant::getNullValue(PointerType::get(
EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
+
+ const MaybeAlign PartAlign = [&]() -> MaybeAlign {
+ if (aggregateIsPacked)
+ return Align(1);
+ if (NumElts != 1)
+ return std::nullopt;
+ Align PartAlign =
+ (Offsets[parti] == 0 && PAL.getParamAlignment(i))
+ ? PAL.getParamAlignment(i).value()
+ : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
+ return commonAlignment(PartAlign, Offsets[parti]);
+ }();
SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
- MachinePointerInfo(srcValue),
- MaybeAlign(aggregateIsPacked ? 1 : 0),
+ MachinePointerInfo(srcValue), PartAlign,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
if (P.getNode())
@@ -3113,6 +3284,33 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
return Chain;
}
+// Use byte-store when the param address of the return value is unaligned.
+// This may happen when the return value is a field of a packed structure.
+static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
+ uint64_t Offset, EVT ElementType,
+ SDValue RetVal, const SDLoc &dl) {
+ // Bit logic only works on integer types
+ if (adjustElementType(ElementType))
+ RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
+
+ // Store each byte
+ for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+    // Shift byte i down into the least-significant byte position.
+ SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
+ DAG.getConstant(i * 8, dl, MVT::i32));
+ SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
+ ShiftVal};
+    // Truncating store of only the least-significant byte using st.param.b8.
+    // The register type can be larger than b8.
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+ DAG.getVTList(MVT::Other), StoreOperands,
+ MVT::i8, MachinePointerInfo(), std::nullopt,
+ MachineMemOperand::MOStore);
+ }
+ return Chain;
+}
+
SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -3162,13 +3360,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<SDValue, 6> StoreOperands;
for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
- // New load/store. Record chain and offset operands.
- if (VectorInfo[i] & PVF_FIRST) {
- assert(StoreOperands.empty() && "Orphaned operand list.");
- StoreOperands.push_back(Chain);
- StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
- }
-
SDValue OutVal = OutVals[i];
SDValue RetVal = PromotedOutVals[i];
@@ -3182,6 +3373,32 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
}
+ // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
+ // for a scalar store. In such cases, fall back to byte stores.
+ if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
+ EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
+ Align ElementTypeAlign =
+ DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
+ Align ElementAlign =
+ commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
+ if (ElementAlign < ElementTypeAlign) {
+ assert(StoreOperands.empty() && "Orphaned operand list.");
+ Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
+ RetVal, dl);
+
+ // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
+ // into the graph, so just move on to the next element.
+ continue;
+ }
+ }
+
+ // New load/store. Record chain and offset operands.
+ if (VectorInfo[i] & PVF_FIRST) {
+ assert(StoreOperands.empty() && "Orphaned operand list.");
+ StoreOperands.push_back(Chain);
+ StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
+ }
+
// Record the value to return.
StoreOperands.push_back(RetVal);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 55a1955..b3517ce 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2738,6 +2738,8 @@ def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">;
def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">;
def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">;
+def StoreParamI8TruncI32 : StoreParamInst<Int32Regs, ".b8">;
+def StoreParamI8TruncI64 : StoreParamInst<Int64Regs, ".b8">;
def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">;
def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
@@ -2757,6 +2759,8 @@ def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">;
+def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">;
+def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">;
def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 904f1d7..c922098 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2062,8 +2062,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering();
MVT SubVecContainerVT = SubVecVT;
// Establish the correct scalable-vector types for any fixed-length type.
- if (SubVecVT.isFixedLengthVector())
+ if (SubVecVT.isFixedLengthVector()) {
+ assert(Idx == 0 && V.isUndef());
SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
+ }
if (VT.isFixedLengthVector())
VT = TLI.getContainerForFixedLengthVector(VT);
@@ -2115,8 +2117,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering();
MVT SubVecContainerVT = VT;
// Establish the correct scalable-vector types for any fixed-length type.
- if (VT.isFixedLengthVector())
+ if (VT.isFixedLengthVector()) {
+ assert(Idx == 0);
SubVecContainerVT = TLI.getContainerForFixedLengthVector(VT);
+ }
if (InVT.isFixedLengthVector())
InVT = TLI.getContainerForFixedLengthVector(InVT);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f7275eb..540c2e7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -691,7 +691,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_FP_TO_UINT, ISD::VP_SETCC, ISD::VP_SIGN_EXTEND,
ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE, ISD::VP_SMIN,
ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX,
- ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE};
+ ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE,
+ ISD::VP_SADDSAT, ISD::VP_UADDSAT, ISD::VP_SSUBSAT,
+ ISD::VP_USUBSAT};
static const unsigned FloatingPointVPOps[] = {
ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL,
@@ -830,7 +832,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
VT, Custom);
setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT,
Custom);
- setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom);
setOperationAction({ISD::AVGFLOORU, ISD::AVGCEILU, ISD::SADDSAT,
ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT},
VT, Legal);
@@ -956,6 +957,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// between vXf16 and vXf64 must be lowered as sequences which convert via
// vXf32.
setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
+ setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom);
// Custom-lower insert/extract operations to simplify patterns.
setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT,
Custom);
@@ -3240,45 +3242,49 @@ static std::optional<uint64_t> getExactInteger(const APFloat &APF,
// Note that this method will also match potentially unappealing index
// sequences, like <i32 0, i32 50939494>, however it is left to the caller to
// determine whether this is worth generating code for.
-static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
- unsigned NumElts = Op.getNumOperands();
+static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
+ unsigned EltSizeInBits) {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR");
+ if (!cast<BuildVectorSDNode>(Op)->isConstant())
+ return std::nullopt;
bool IsInteger = Op.getValueType().isInteger();
std::optional<unsigned> SeqStepDenom;
std::optional<int64_t> SeqStepNum, SeqAddend;
std::optional<std::pair<uint64_t, unsigned>> PrevElt;
- unsigned EltSizeInBits = Op.getValueType().getScalarSizeInBits();
- for (unsigned Idx = 0; Idx < NumElts; Idx++) {
- // Assume undef elements match the sequence; we just have to be careful
- // when interpolating across them.
- if (Op.getOperand(Idx).isUndef())
+ assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits());
+
+ // First extract the ops into a list of constant integer values. This may not
+ // be possible for floats if they're not all representable as integers.
+ SmallVector<std::optional<uint64_t>> Elts(Op.getNumOperands());
+ const unsigned OpSize = Op.getScalarValueSizeInBits();
+ for (auto [Idx, Elt] : enumerate(Op->op_values())) {
+ if (Elt.isUndef()) {
+ Elts[Idx] = std::nullopt;
continue;
-
- uint64_t Val;
+ }
if (IsInteger) {
- // The BUILD_VECTOR must be all constants.
- if (!isa<ConstantSDNode>(Op.getOperand(Idx)))
- return std::nullopt;
- Val = Op.getConstantOperandVal(Idx) &
- maskTrailingOnes<uint64_t>(EltSizeInBits);
+ Elts[Idx] = Elt->getAsZExtVal() & maskTrailingOnes<uint64_t>(OpSize);
} else {
- // The BUILD_VECTOR must be all constants.
- if (!isa<ConstantFPSDNode>(Op.getOperand(Idx)))
- return std::nullopt;
- if (auto ExactInteger = getExactInteger(
- cast<ConstantFPSDNode>(Op.getOperand(Idx))->getValueAPF(),
- EltSizeInBits))
- Val = *ExactInteger;
- else
+ auto ExactInteger =
+ getExactInteger(cast<ConstantFPSDNode>(Elt)->getValueAPF(), OpSize);
+ if (!ExactInteger)
return std::nullopt;
+ Elts[Idx] = *ExactInteger;
}
+ }
+
+ for (auto [Idx, Elt] : enumerate(Elts)) {
+ // Assume undef elements match the sequence; we just have to be careful
+ // when interpolating across them.
+ if (!Elt)
+ continue;
if (PrevElt) {
// Calculate the step since the last non-undef element, and ensure
// it's consistent across the entire sequence.
unsigned IdxDiff = Idx - PrevElt->second;
- int64_t ValDiff = SignExtend64(Val - PrevElt->first, EltSizeInBits);
+ int64_t ValDiff = SignExtend64(*Elt - PrevElt->first, EltSizeInBits);
// A zero-value value difference means that we're somewhere in the middle
// of a fractional step, e.g. <0,0,0*,0,1,1,1,1>. Wait until we notice a
@@ -3308,8 +3314,8 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
}
// Record this non-undef element for later.
- if (!PrevElt || PrevElt->first != Val)
- PrevElt = std::make_pair(Val, Idx);
+ if (!PrevElt || PrevElt->first != *Elt)
+ PrevElt = std::make_pair(*Elt, Idx);
}
// We need to have logged a step for this to count as a legal index sequence.
@@ -3318,21 +3324,12 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
// Loop back through the sequence and validate elements we might have skipped
// while waiting for a valid step. While doing this, log any sequence addend.
- for (unsigned Idx = 0; Idx < NumElts; Idx++) {
- if (Op.getOperand(Idx).isUndef())
+ for (auto [Idx, Elt] : enumerate(Elts)) {
+ if (!Elt)
continue;
- uint64_t Val;
- if (IsInteger) {
- Val = Op.getConstantOperandVal(Idx) &
- maskTrailingOnes<uint64_t>(EltSizeInBits);
- } else {
- Val = *getExactInteger(
- cast<ConstantFPSDNode>(Op.getOperand(Idx))->getValueAPF(),
- EltSizeInBits);
- }
uint64_t ExpectedVal =
(int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom;
- int64_t Addend = SignExtend64(Val - ExpectedVal, EltSizeInBits);
+ int64_t Addend = SignExtend64(*Elt - ExpectedVal, EltSizeInBits);
if (!SeqAddend)
SeqAddend = Addend;
else if (Addend != SeqAddend)
@@ -3598,7 +3595,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
// Try and match index sequences, which we can lower to the vid instruction
// with optional modifications. An all-undef vector is matched by
// getSplatValue, above.
- if (auto SimpleVID = isSimpleVIDSequence(Op)) {
+ if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) {
int64_t StepNumerator = SimpleVID->StepNumerator;
unsigned StepDenominator = SimpleVID->StepDenominator;
int64_t Addend = SimpleVID->Addend;
@@ -3853,11 +3850,10 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// If we're compiling for an exact VLEN value, we can split our work per
// register in the register group.
- const unsigned MinVLen = Subtarget.getRealMinVLen();
- const unsigned MaxVLen = Subtarget.getRealMaxVLen();
- if (MinVLen == MaxVLen && VT.getSizeInBits().getKnownMinValue() > MinVLen) {
+ if (const auto VLen = Subtarget.getRealVLen();
+ VLen && VT.getSizeInBits().getKnownMinValue() > *VLen) {
MVT ElemVT = VT.getVectorElementType();
- unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+ unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
@@ -4768,9 +4764,8 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
// If we don't know exact data layout, not much we can do. If this
// is already m1 or smaller, no point in splitting further.
- const unsigned MinVLen = Subtarget.getRealMinVLen();
- const unsigned MaxVLen = Subtarget.getRealMaxVLen();
- if (MinVLen != MaxVLen || VT.getSizeInBits().getFixedValue() <= MinVLen)
+ const auto VLen = Subtarget.getRealVLen();
+ if (!VLen || VT.getSizeInBits().getFixedValue() <= *VLen)
return SDValue();
// Avoid picking up bitrotate patterns which we have a linear-in-lmul
@@ -4781,7 +4776,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
return SDValue();
MVT ElemVT = VT.getVectorElementType();
- unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+ unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
SmallVector<std::pair<int, SmallVector<int>>>
@@ -5759,6 +5754,10 @@ static unsigned getRISCVVLOp(SDValue Op) {
VP_CASE(SINT_TO_FP) // VP_SINT_TO_FP
VP_CASE(UINT_TO_FP) // VP_UINT_TO_FP
VP_CASE(BITREVERSE) // VP_BITREVERSE
+ VP_CASE(SADDSAT) // VP_SADDSAT
+ VP_CASE(UADDSAT) // VP_UADDSAT
+ VP_CASE(SSUBSAT) // VP_SSUBSAT
+ VP_CASE(USUBSAT) // VP_USUBSAT
VP_CASE(BSWAP) // VP_BSWAP
VP_CASE(CTLZ) // VP_CTLZ
VP_CASE(CTTZ) // VP_CTTZ
@@ -6798,6 +6797,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::VP_UDIV:
case ISD::VP_SREM:
case ISD::VP_UREM:
+ case ISD::VP_UADDSAT:
+ case ISD::VP_USUBSAT:
+ case ISD::VP_SADDSAT:
+ case ISD::VP_SSUBSAT:
return lowerVPOp(Op, DAG);
case ISD::VP_AND:
case ISD::VP_OR:
@@ -7384,6 +7387,26 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget))
return V;
+ // (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1)
+ // (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2)
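+  // Materialize the cheaper of the two constants as the addend; the other
+  // value is produced by conditionally adding the difference (czero + add).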
+ if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV)) {
+ const APInt &TrueVal = TrueV->getAsAPIntVal();
+ const APInt &FalseVal = FalseV->getAsAPIntVal();
+ const int TrueValCost = RISCVMatInt::getIntMatCost(
+ TrueVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
+ const int FalseValCost = RISCVMatInt::getIntMatCost(
+ FalseVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
+ bool IsCZERO_NEZ = TrueValCost <= FalseValCost;
+ SDValue LHSVal = DAG.getConstant(
+ IsCZERO_NEZ ? FalseVal - TrueVal : TrueVal - FalseVal, DL, VT);
+ SDValue RHSVal =
+ DAG.getConstant(IsCZERO_NEZ ? TrueVal : FalseVal, DL, VT);
+ SDValue CMOV =
+ DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ,
+ DL, VT, LHSVal, CondV);
+ return DAG.getNode(ISD::ADD, DL, VT, CMOV, RHSVal);
+ }
+
// (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c))
// Unless we have the short forward branch optimization.
if (!Subtarget.hasConditionalMoveFusion())
@@ -8313,15 +8336,13 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
// constant index, we can always perform the extract in m1 (or
// smaller) as we can determine the register corresponding to
// the index in the register group.
- const unsigned MinVLen = Subtarget.getRealMinVLen();
- const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+ const auto VLen = Subtarget.getRealVLen();
if (auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
- IdxC && MinVLen == MaxVLen &&
- VecVT.getSizeInBits().getKnownMinValue() > MinVLen) {
+ IdxC && VLen && VecVT.getSizeInBits().getKnownMinValue() > *VLen) {
MVT M1VT = getLMUL1VT(ContainerVT);
unsigned OrigIdx = IdxC->getZExtValue();
EVT ElemVT = VecVT.getVectorElementType();
- unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+ unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
unsigned RemIdx = OrigIdx % ElemsPerVReg;
unsigned SubRegIdx = OrigIdx / ElemsPerVReg;
unsigned ExtractIdx =
@@ -9782,15 +9803,14 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
if (OrigIdx == 0)
return Op;
- const unsigned MinVLen = Subtarget.getRealMinVLen();
- const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+ const auto VLen = Subtarget.getRealVLen();
// If the subvector vector is a fixed-length type and we don't know VLEN
// exactly, we cannot use subregister manipulation to simplify the codegen; we
// don't know which register of a LMUL group contains the specific subvector
// as we only know the minimum register size. Therefore we must slide the
// vector group down the full amount.
- if (SubVecVT.isFixedLengthVector() && MinVLen != MaxVLen) {
+ if (SubVecVT.isFixedLengthVector() && !VLen) {
MVT ContainerVT = VecVT;
if (VecVT.isFixedLengthVector()) {
ContainerVT = getContainerForFixedLengthVector(VecVT);
@@ -9837,8 +9857,8 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
// and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if
// we have a fixed length subvector, we need to adjust the index by 1/vscale.
if (SubVecVT.isFixedLengthVector()) {
- assert(MinVLen == MaxVLen);
- unsigned Vscale = MinVLen / RISCV::RVVBitsPerBlock;
+ assert(VLen);
+ unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
auto Decompose =
RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
VecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI);
@@ -12872,6 +12892,7 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineSubOfBoolean(N, DAG))
return V;
+ EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// fold (sub 0, (setcc x, 0, setlt)) -> (sra x, xlen - 1)
@@ -12879,7 +12900,6 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
isNullConstant(N1.getOperand(1))) {
ISD::CondCode CCVal = cast<CondCodeSDNode>(N1.getOperand(2))->get();
if (CCVal == ISD::SETLT) {
- EVT VT = N->getValueType(0);
SDLoc DL(N);
unsigned ShAmt = N0.getValueSizeInBits() - 1;
return DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0),
@@ -12887,6 +12907,29 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ // sub (zext, zext) -> sext (sub (zext, zext))
+ // where the sum of the extend widths match, and the inner zexts
+ // add at least one bit. (For profitability on rvv, we use a
+ // power of two for both inner and outer extend.)
+ if (VT.isVector() && Subtarget.getTargetLowering()->isTypeLegal(VT) &&
+ N0.getOpcode() == N1.getOpcode() && N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.hasOneUse() && N1.hasOneUse()) {
+ SDValue Src0 = N0.getOperand(0);
+ SDValue Src1 = N1.getOperand(0);
+ EVT SrcVT = Src0.getValueType();
+ if (Subtarget.getTargetLowering()->isTypeLegal(SrcVT) &&
+ SrcVT == Src1.getValueType() && SrcVT.getScalarSizeInBits() >= 8 &&
+ SrcVT.getScalarSizeInBits() < VT.getScalarSizeInBits() / 2) {
+ LLVMContext &C = *DAG.getContext();
+ EVT ElemVT = VT.getVectorElementType().getHalfSizedIntegerVT(C);
+ EVT NarrowVT = EVT::getVectorVT(C, ElemVT, VT.getVectorElementCount());
+ Src0 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src0), NarrowVT, Src0);
+ Src1 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src1), NarrowVT, Src1);
+ return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT,
+ DAG.getNode(ISD::SUB, SDLoc(N), NarrowVT, Src0, Src1));
+ }
+ }
+
// fold (sub x, (select lhs, rhs, cc, 0, y)) ->
// (select lhs, rhs, cc, x, (sub x, y))
return combineSelectAndUse(N, N1, N0, DAG, /*AllOnes*/ false, Subtarget);
@@ -15978,7 +16021,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (Index.getOpcode() == ISD::BUILD_VECTOR &&
MGN->getExtensionType() == ISD::NON_EXTLOAD && isTypeLegal(VT)) {
- if (std::optional<VIDSequence> SimpleVID = isSimpleVIDSequence(Index);
+ // The sequence will be XLenVT, not the type of Index. Tell
+ // isSimpleVIDSequence this so we avoid overflow.
+ if (std::optional<VIDSequence> SimpleVID =
+ isSimpleVIDSequence(Index, Subtarget.getXLen());
SimpleVID && SimpleVID->StepDenominator == 1) {
const int64_t StepNumerator = SimpleVID->StepNumerator;
const int64_t Addend = SimpleVID->Addend;
diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
index ff21fe1..af864ba 100644
--- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
@@ -143,19 +143,35 @@ static bool isCompressedReg(Register Reg) {
// Return true if MI is a load for which there exists a compressed version.
static bool isCompressibleLoad(const MachineInstr &MI) {
const RISCVSubtarget &STI = MI.getMF()->getSubtarget<RISCVSubtarget>();
- const unsigned Opcode = MI.getOpcode();
- return Opcode == RISCV::LW || (!STI.is64Bit() && Opcode == RISCV::FLW) ||
- Opcode == RISCV::LD || Opcode == RISCV::FLD;
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case RISCV::LW:
+ case RISCV::LD:
+ return STI.hasStdExtCOrZca();
+ case RISCV::FLW:
+ return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce();
+ case RISCV::FLD:
+ return STI.hasStdExtCOrZcd();
+ }
}
// Return true if MI is a store for which there exists a compressed version.
static bool isCompressibleStore(const MachineInstr &MI) {
const RISCVSubtarget &STI = MI.getMF()->getSubtarget<RISCVSubtarget>();
- const unsigned Opcode = MI.getOpcode();
- return Opcode == RISCV::SW || (!STI.is64Bit() && Opcode == RISCV::FSW) ||
- Opcode == RISCV::SD || Opcode == RISCV::FSD;
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case RISCV::SW:
+ case RISCV::SD:
+ return STI.hasStdExtCOrZca();
+ case RISCV::FSW:
+ return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce();
+ case RISCV::FSD:
+ return STI.hasStdExtCOrZcd();
+ }
}
// Find a single register and/or large offset which, if compressible, would
@@ -324,8 +340,7 @@ bool RISCVMakeCompressibleOpt::runOnMachineFunction(MachineFunction &Fn) {
const RISCVInstrInfo &TII = *STI.getInstrInfo();
// This optimization only makes sense if compressed instructions are emitted.
- // FIXME: Support Zca, Zcf, Zcd granularity.
- if (!STI.hasStdExtC())
+ if (!STI.hasStdExtCOrZca())
return false;
for (MachineBasicBlock &MBB : Fn) {
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index d15cb61..0be681d 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -88,20 +88,25 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred,
let ReleaseAtCycles = noPredReleaseCycles;
}
+ // Define SchedVars
+ def nameMX # PredSchedVar
+ : SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>;
+ def nameMX # NoPredSchedVar
+ : SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX #"_NoPred")]>;
+ // Allow multiclass to refer to SchedVars -- need to have NAME prefix.
+ defvar PredSchedVar = !cast<SchedVar>(NAME # nameMX # PredSchedVar);
+ defvar NoPredSchedVar = !cast<SchedVar>(NAME # nameMX # NoPredSchedVar);
+
// Tie behavior to predicate
- def NAME # nameMX # "_Variant" : SchedWriteVariant<[
- SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>,
- SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX # "_NoPred")]>
- ]>;
+ def NAME # nameMX # "_Variant"
+ : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>;
def : SchedAlias<
!cast<SchedReadWrite>(nameMX),
!cast<SchedReadWrite>(NAME # nameMX # "_Variant")>;
if IsWorstCase then {
- def NAME # name # "_WorstCase_Variant" : SchedWriteVariant<[
- SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>,
- SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX # "_NoPred")]>
- ]>;
+ def NAME # name # "_WorstCase_Variant"
+ : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>;
def : SchedAlias<
!cast<SchedReadWrite>(name # "_WorstCase"),
!cast<SchedReadWrite>(NAME # name # "_WorstCase_Variant")>;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 4b60d7a..9ebf278 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -143,6 +143,10 @@ public:
#include "RISCVGenSubtargetInfo.inc"
bool hasStdExtCOrZca() const { return HasStdExtC || HasStdExtZca; }
+ bool hasStdExtCOrZcd() const { return HasStdExtC || HasStdExtZcd; }
+ bool hasStdExtCOrZcfOrZce() const {
+ return HasStdExtC || HasStdExtZcf || HasStdExtZce;
+ }
bool hasStdExtZvl() const { return ZvlLen != 0; }
bool hasStdExtFOrZfinx() const { return HasStdExtF || HasStdExtZfinx; }
bool hasStdExtDOrZdinx() const { return HasStdExtD || HasStdExtZdinx; }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index adef40e..3e20e45 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -84,7 +84,7 @@ static cl::opt<bool> EnableRISCVDeadRegisterElimination(
static cl::opt<bool>
EnableSinkFold("riscv-enable-sink-fold",
cl::desc("Enable sinking and folding of instruction copies"),
- cl::init(false), cl::Hidden);
+ cl::init(true), cl::Hidden);
static cl::opt<bool>
EnableLoopDataPrefetch("riscv-enable-loop-data-prefetch", cl::Hidden,
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index e6e3560..28a63b9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -619,7 +619,8 @@ class GroupBuiltin<string name, Op operation> {
!eq(operation, OpGroupNonUniformShuffleDown),
!eq(operation, OpGroupBroadcast),
!eq(operation, OpGroupNonUniformBroadcast),
- !eq(operation, OpGroupNonUniformBroadcastFirst));
+ !eq(operation, OpGroupNonUniformBroadcastFirst),
+ !eq(operation, OpGroupNonUniformRotateKHR));
bit HasBoolArg = !or(!and(IsAllOrAny, !eq(IsAllEqual, false)), IsBallot, IsLogical);
}
@@ -877,6 +878,10 @@ defm : DemangledGroupBuiltin<"group_non_uniform_scan_inclusive_logical_xors", Wo
defm : DemangledGroupBuiltin<"group_non_uniform_scan_exclusive_logical_xors", WorkOrSub, OpGroupNonUniformLogicalXor>;
defm : DemangledGroupBuiltin<"group_clustered_reduce_logical_xor", WorkOrSub, OpGroupNonUniformLogicalXor>;
+// cl_khr_subgroup_rotate / SPV_KHR_subgroup_rotate
+defm : DemangledGroupBuiltin<"group_rotate", OnlySub, OpGroupNonUniformRotateKHR>;
+defm : DemangledGroupBuiltin<"group_clustered_rotate", OnlySub, OpGroupNonUniformRotateKHR>;
+
// cl_khr_work_group_uniform_arithmetic / SPV_KHR_uniform_group_instructions
defm : DemangledGroupBuiltin<"group_reduce_imul", OnlyWork, OpGroupIMulKHR>;
defm : DemangledGroupBuiltin<"group_reduce_mulu", OnlyWork, OpGroupIMulKHR>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index cc438b2..10569ef 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -150,7 +150,8 @@ getKernelArgTypeQual(const Function &F, unsigned ArgIdx) {
static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
SPIRVGlobalRegistry *GR,
- MachineIRBuilder &MIRBuilder) {
+ MachineIRBuilder &MIRBuilder,
+ const SPIRVSubtarget &ST) {
// Read argument's access qualifier from metadata or default.
SPIRV::AccessQualifier::AccessQualifier ArgAccessQual =
getArgAccessQual(F, ArgIdx);
@@ -169,8 +170,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
if (MDTypeStr.ends_with("*"))
ResArgType = GR->getOrCreateSPIRVTypeByName(
MDTypeStr, MIRBuilder,
- addressSpaceToStorageClass(
- OriginalArgType->getPointerAddressSpace()));
+ addressSpaceToStorageClass(OriginalArgType->getPointerAddressSpace(),
+ ST));
else if (MDTypeStr.ends_with("_t"))
ResArgType = GR->getOrCreateSPIRVTypeByName(
"opencl." + MDTypeStr.str(), MIRBuilder,
@@ -206,6 +207,10 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
assert(GR && "Must initialize the SPIRV type registry before lowering args.");
GR->setCurrentFunc(MIRBuilder.getMF());
+ // Get access to information about available extensions
+ const SPIRVSubtarget *ST =
+ static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
+
// Assign types and names to all args, and store their types for later.
FunctionType *FTy = getOriginalFunctionType(F);
SmallVector<SPIRVType *, 4> ArgTypeVRegs;
@@ -216,7 +221,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
// TODO: handle the case of multiple registers.
if (VRegs[i].size() > 1)
return false;
- auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder);
+ auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder, *ST);
GR->assignSPIRVTypeToVReg(SpirvTy, VRegs[i][0], MIRBuilder.getMF());
ArgTypeVRegs.push_back(SpirvTy);
@@ -318,10 +323,6 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (F.hasName())
buildOpName(FuncVReg, F.getName(), MIRBuilder);
- // Get access to information about available extensions
- const auto *ST =
- static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
-
// Handle entry points and function linkage.
if (isEntryPoint(F)) {
const auto &STI = MIRBuilder.getMF().getSubtarget<SPIRVSubtarget>();
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 47fec74..a1cb630 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -709,7 +709,10 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(
// TODO: change the implementation once opaque pointers are supported
// in the SPIR-V specification.
SpvElementType = getOrCreateSPIRVIntegerType(8, MIRBuilder);
- auto SC = addressSpaceToStorageClass(PType->getAddressSpace());
+ // Get access to information about available extensions
+ const SPIRVSubtarget *ST =
+ static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
+ auto SC = addressSpaceToStorageClass(PType->getAddressSpace(), *ST);
// Null pointer means we have a loop in type definitions, make and
// return corresponding OpTypeForwardPointer.
if (SpvElementType == nullptr) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
index f317b26..d34f802 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
@@ -31,6 +31,9 @@ public:
return true;
}
+  // Prevent creation of jump tables.
+ bool areJTsAllowed(const Function *) const override { return false; }
+
// This is to prevent sexts of non-i64 vector indices which are generated
// within general IRTranslator hence type generation for it is omitted.
MVT getVectorIdxTy(const DataLayout &DL) const override {
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index 0f11bc3..7c5252e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -430,6 +430,10 @@ def OpGenericCastToPtrExplicit : Op<123, (outs ID:$r), (ins TYPE:$t, ID:$p, Stor
"$r = OpGenericCastToPtrExplicit $t $p $s">;
def OpBitcast : UnOp<"OpBitcast", 124>;
+// SPV_INTEL_usm_storage_classes
+def OpPtrCastToCrossWorkgroupINTEL : UnOp<"OpPtrCastToCrossWorkgroupINTEL", 5934>;
+def OpCrossWorkgroupCastToPtrINTEL : UnOp<"OpCrossWorkgroupCastToPtrINTEL", 5938>;
+
// 3.42.12 Composite Instructions
def OpVectorExtractDynamic: Op<77, (outs ID:$res), (ins TYPE:$type, vID:$vec, ID:$idx),
@@ -765,6 +769,11 @@ def OpGroupNonUniformLogicalAnd: OpGroupNUGroup<"LogicalAnd", 362>;
def OpGroupNonUniformLogicalOr: OpGroupNUGroup<"LogicalOr", 363>;
def OpGroupNonUniformLogicalXor: OpGroupNUGroup<"LogicalXor", 364>;
+// SPV_KHR_subgroup_rotate
+def OpGroupNonUniformRotateKHR: Op<4431, (outs ID:$res),
+ (ins TYPE:$type, ID:$scope, ID:$value, ID:$delta, variable_ops),
+ "$res = OpGroupNonUniformRotateKHR $type $scope $value $delta">;
+
// 3.49.7, Constant-Creation Instructions
// - SPV_INTEL_function_pointers
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 53d19a1..7258d3b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -828,8 +828,18 @@ static bool isGenericCastablePtr(SPIRV::StorageClass::StorageClass SC) {
}
}
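+// Return true if SC is one of the USM storage classes introduced by the
+// SPV_INTEL_usm_storage_classes extension.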
+static bool isUSMStorageClass(SPIRV::StorageClass::StorageClass SC) {
+ switch (SC) {
+ case SPIRV::StorageClass::DeviceOnlyINTEL:
+ case SPIRV::StorageClass::HostOnlyINTEL:
+ return true;
+ default:
+ return false;
+ }
+}
+
// In SPIR-V address space casting can only happen to and from the Generic
-// storage class. We can also only case Workgroup, CrossWorkgroup, or Function
+// storage class. We can also only cast Workgroup, CrossWorkgroup, or Function
// pointers to and from Generic pointers. As such, we can convert e.g. from
// Workgroup to Function by going via a Generic pointer as an intermediary. All
// other combinations can only be done by a bitcast, and are probably not safe.
@@ -862,13 +872,17 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg,
SPIRV::StorageClass::StorageClass SrcSC = GR.getPointerStorageClass(SrcPtr);
SPIRV::StorageClass::StorageClass DstSC = GR.getPointerStorageClass(ResVReg);
- // Casting from an eligable pointer to Generic.
+  // Don't generate a cast between identical storage classes.
+ if (SrcSC == DstSC)
+ return true;
+
+ // Casting from an eligible pointer to Generic.
if (DstSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(SrcSC))
return selectUnOp(ResVReg, ResType, I, SPIRV::OpPtrCastToGeneric);
- // Casting from Generic to an eligable pointer.
+ // Casting from Generic to an eligible pointer.
if (SrcSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(DstSC))
return selectUnOp(ResVReg, ResType, I, SPIRV::OpGenericCastToPtr);
- // Casting between 2 eligable pointers using Generic as an intermediary.
+ // Casting between 2 eligible pointers using Generic as an intermediary.
if (isGenericCastablePtr(SrcSC) && isGenericCastablePtr(DstSC)) {
Register Tmp = MRI->createVirtualRegister(&SPIRV::IDRegClass);
SPIRVType *GenericPtrTy = GR.getOrCreateSPIRVPointerType(
@@ -886,6 +900,16 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg,
.addUse(Tmp)
.constrainAllUses(TII, TRI, RBI);
}
+
+ // Check if instructions from the SPV_INTEL_usm_storage_classes extension may
+ // be applied
+ if (isUSMStorageClass(SrcSC) && DstSC == SPIRV::StorageClass::CrossWorkgroup)
+ return selectUnOp(ResVReg, ResType, I,
+ SPIRV::OpPtrCastToCrossWorkgroupINTEL);
+ if (SrcSC == SPIRV::StorageClass::CrossWorkgroup && isUSMStorageClass(DstSC))
+ return selectUnOp(ResVReg, ResType, I,
+ SPIRV::OpCrossWorkgroupCastToPtrINTEL);
+
// TODO Should this case just be disallowed completely?
// We're casting 2 other arbitrary address spaces, so have to bitcast.
return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitcast);
@@ -1545,7 +1569,7 @@ bool SPIRVInstructionSelector::selectGlobalValue(
}
SPIRVType *ResType = GR.getOrCreateSPIRVPointerType(
PointerBaseType, I, TII,
- addressSpaceToStorageClass(GV->getAddressSpace()));
+ addressSpaceToStorageClass(GV->getAddressSpace(), STI));
std::string GlobalIdent;
if (!GV->hasName()) {
@@ -1618,7 +1642,7 @@ bool SPIRVInstructionSelector::selectGlobalValue(
unsigned AddrSpace = GV->getAddressSpace();
SPIRV::StorageClass::StorageClass Storage =
- addressSpaceToStorageClass(AddrSpace);
+ addressSpaceToStorageClass(AddrSpace, STI);
bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage &&
Storage != SPIRV::StorageClass::Function;
SPIRV::LinkageType::LinkageType LnkType =
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 011a550..4f2e7a2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -102,14 +102,16 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
const LLT p2 = LLT::pointer(2, PSize); // UniformConstant
const LLT p3 = LLT::pointer(3, PSize); // Workgroup
const LLT p4 = LLT::pointer(4, PSize); // Generic
- const LLT p5 = LLT::pointer(5, PSize); // Input
+ const LLT p5 =
+ LLT::pointer(5, PSize); // Input, SPV_INTEL_usm_storage_classes (Device)
+ const LLT p6 = LLT::pointer(6, PSize); // SPV_INTEL_usm_storage_classes (Host)
// TODO: remove copy-pasting here by using concatenation in some way.
auto allPtrsScalarsAndVectors = {
- p0, p1, p2, p3, p4, p5, s1, s8, s16,
- s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8,
- v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1,
- v8s8, v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64};
+ p0, p1, p2, p3, p4, p5, p6, s1, s8, s16,
+ s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, v3s16,
+ v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, v8s8, v8s16,
+ v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64};
auto allScalarsAndVectors = {
s1, s8, s16, s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64,
@@ -133,8 +135,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
auto allFloatAndIntScalars = allIntScalars;
- auto allPtrs = {p0, p1, p2, p3, p4, p5};
- auto allWritablePtrs = {p0, p1, p3, p4};
+ auto allPtrs = {p0, p1, p2, p3, p4, p5, p6};
+ auto allWritablePtrs = {p0, p1, p3, p4, p5, p6};
for (auto Opc : TypeFoldingSupportingOpcs)
getActionDefinitionsBuilder(Opc).custom();
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index dbda287..3be28c9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1063,12 +1063,28 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(SPIRV::Capability::ExpectAssumeKHR);
}
break;
+ case SPIRV::OpPtrCastToCrossWorkgroupINTEL:
+ case SPIRV::OpCrossWorkgroupCastToPtrINTEL:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes);
+ Reqs.addCapability(SPIRV::Capability::USMStorageClassesINTEL);
+ }
+ break;
case SPIRV::OpConstantFunctionPointerINTEL:
if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers)) {
Reqs.addExtension(SPIRV::Extension::SPV_INTEL_function_pointers);
Reqs.addCapability(SPIRV::Capability::FunctionPointersINTEL);
}
break;
+ case SPIRV::OpGroupNonUniformRotateKHR:
+ if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate))
+ report_fatal_error("OpGroupNonUniformRotateKHR instruction requires the "
+ "following SPIR-V extension: SPV_KHR_subgroup_rotate",
+ false);
+ Reqs.addExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate);
+ Reqs.addCapability(SPIRV::Capability::GroupNonUniformRotateKHR);
+ Reqs.addCapability(SPIRV::Capability::GroupNonUniform);
+ break;
case SPIRV::OpGroupIMulKHR:
case SPIRV::OpGroupFMulKHR:
case SPIRV::OpGroupBitwiseAndKHR:
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index cbc16fa..1442168 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -122,6 +122,9 @@ static void foldConstantsIntoIntrinsics(MachineFunction &MF) {
static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
MachineIRBuilder MIB) {
+ // Get access to information about available extensions
+ const SPIRVSubtarget *ST =
+ static_cast<const SPIRVSubtarget *>(&MIB.getMF().getSubtarget());
SmallVector<MachineInstr *, 10> ToErase;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
@@ -141,7 +144,7 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
getMDOperandAsType(MI.getOperand(3).getMetadata(), 0), MIB);
SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType(
BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(),
- addressSpaceToStorageClass(MI.getOperand(4).getImm()));
+ addressSpaceToStorageClass(MI.getOperand(4).getImm(), *ST));
// If the bitcast would be redundant, replace all uses with the source
// register.
@@ -250,6 +253,10 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy,
static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
MachineIRBuilder MIB) {
+ // Get access to information about available extensions
+ const SPIRVSubtarget *ST =
+ static_cast<const SPIRVSubtarget *>(&MIB.getMF().getSubtarget());
+
MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<MachineInstr *, 10> ToErase;
@@ -269,7 +276,7 @@ static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
getMDOperandAsType(MI.getOperand(2).getMetadata(), 0), MIB);
SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType(
BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(),
- addressSpaceToStorageClass(MI.getOperand(3).getImm()));
+ addressSpaceToStorageClass(MI.getOperand(3).getImm(), *ST));
MachineInstr *Def = MRI.getVRegDef(Reg);
assert(Def && "Expecting an instruction that defines the register");
insertAssignInstr(Reg, nullptr, AssignedPtrType, GR, MIB,
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index e186154..79f1614 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -49,6 +49,12 @@ cl::list<SPIRV::Extension::Extension> Extensions(
clEnumValN(SPIRV::Extension::SPV_INTEL_optnone, "SPV_INTEL_optnone",
"Adds OptNoneINTEL value for Function Control mask that "
"indicates a request to not optimize the function."),
+ clEnumValN(SPIRV::Extension::SPV_INTEL_usm_storage_classes,
+ "SPV_INTEL_usm_storage_classes",
+ "Introduces two new storage classes that are sub classes of "
+ "the CrossWorkgroup storage class "
+ "that provides additional information that can enable "
+ "optimization."),
clEnumValN(SPIRV::Extension::SPV_INTEL_subgroups, "SPV_INTEL_subgroups",
"Allows work items in a subgroup to share data without the "
"use of local memory and work group barriers, and to "
@@ -75,6 +81,10 @@ cl::list<SPIRV::Extension::Extension> Extensions(
"Allows to use the LinkOnceODR linkage type that is to let "
"a function or global variable to be merged with other functions "
"or global variables of the same name when linkage occurs."),
+ clEnumValN(SPIRV::Extension::SPV_KHR_subgroup_rotate,
+ "SPV_KHR_subgroup_rotate",
+ "Adds a new instruction that enables rotating values across "
+ "invocations within a subgroup."),
clEnumValN(SPIRV::Extension::SPV_INTEL_function_pointers,
"SPV_INTEL_function_pointers",
"Allows translation of function pointers.")));
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 4e5ac0d..b022b97 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -455,6 +455,7 @@ defm BitInstructions : CapabilityOperand<6025, 0, 0, [SPV_KHR_bit_instructions],
defm ExpectAssumeKHR : CapabilityOperand<5629, 0, 0, [SPV_KHR_expect_assume], []>;
defm FunctionPointersINTEL : CapabilityOperand<5603, 0, 0, [SPV_INTEL_function_pointers], []>;
defm IndirectReferencesINTEL : CapabilityOperand<5604, 0, 0, [SPV_INTEL_function_pointers], []>;
+defm GroupNonUniformRotateKHR : CapabilityOperand<6026, 0, 0, [SPV_KHR_subgroup_rotate], [GroupNonUniform]>;
defm AtomicFloat32AddEXT : CapabilityOperand<6033, 0, 0, [SPV_EXT_shader_atomic_float_add], []>;
defm AtomicFloat64AddEXT : CapabilityOperand<6034, 0, 0, [SPV_EXT_shader_atomic_float_add], []>;
defm AtomicFloat16AddEXT : CapabilityOperand<6095, 0, 0, [SPV_EXT_shader_atomic_float16_add], []>;
@@ -462,6 +463,7 @@ defm AtomicFloat16MinMaxEXT : CapabilityOperand<5616, 0, 0, [SPV_EXT_shader_atom
defm AtomicFloat32MinMaxEXT : CapabilityOperand<5612, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>;
defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>;
defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>;
+defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define SourceLanguage enum values and at the same time
@@ -699,6 +701,8 @@ defm IncomingRayPayloadNV : StorageClassOperand<5342, [RayTracingNV]>;
defm ShaderRecordBufferNV : StorageClassOperand<5343, [RayTracingNV]>;
defm PhysicalStorageBufferEXT : StorageClassOperand<5349, [PhysicalStorageBufferAddressesEXT]>;
defm CodeSectionINTEL : StorageClassOperand<5605, [FunctionPointersINTEL]>;
+defm DeviceOnlyINTEL : StorageClassOperand<5936, [USMStorageClassesINTEL]>;
+defm HostOnlyINTEL : StorageClassOperand<5937, [USMStorageClassesINTEL]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Dim enum values and at the same time
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 05f766d..169d7cc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -14,6 +14,7 @@
#include "MCTargetDesc/SPIRVBaseInfo.h"
#include "SPIRV.h"
#include "SPIRVInstrInfo.h"
+#include "SPIRVSubtarget.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -146,15 +147,19 @@ unsigned storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC) {
return 3;
case SPIRV::StorageClass::Generic:
return 4;
+ case SPIRV::StorageClass::DeviceOnlyINTEL:
+ return 5;
+ case SPIRV::StorageClass::HostOnlyINTEL:
+ return 6;
case SPIRV::StorageClass::Input:
return 7;
default:
- llvm_unreachable("Unable to get address space id");
+ report_fatal_error("Unable to get address space id");
}
}
SPIRV::StorageClass::StorageClass
-addressSpaceToStorageClass(unsigned AddrSpace) {
+addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI) {
switch (AddrSpace) {
case 0:
return SPIRV::StorageClass::Function;
@@ -166,10 +171,18 @@ addressSpaceToStorageClass(unsigned AddrSpace) {
return SPIRV::StorageClass::Workgroup;
case 4:
return SPIRV::StorageClass::Generic;
+ case 5:
+ return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)
+ ? SPIRV::StorageClass::DeviceOnlyINTEL
+ : SPIRV::StorageClass::CrossWorkgroup;
+ case 6:
+ return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)
+ ? SPIRV::StorageClass::HostOnlyINTEL
+ : SPIRV::StorageClass::CrossWorkgroup;
case 7:
return SPIRV::StorageClass::Input;
default:
- llvm_unreachable("Unknown address space");
+ report_fatal_error("Unknown address space");
}
}
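
The two helpers above now form an asymmetric pair: the storage-class-to-address-space direction is unconditional, while the reverse direction consults the subtarget so that address spaces 5 and 6 degrade gracefully to CrossWorkgroup when SPV_INTEL_usm_storage_classes is not enabled. The sketch below is illustrative only (not part of the patch) and assumes an already-constructed SPIRVSubtarget `STI`.

#include "SPIRVSubtarget.h"
#include "SPIRVUtils.h"
#include <cassert>

using namespace llvm;

static void checkUSMAddressSpaceMapping(const SPIRVSubtarget &STI) {
  bool HasUSM =
      STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes);
  // Address spaces 5 and 6 lower to the INTEL USM storage classes only when
  // the extension is enabled; otherwise they fall back to CrossWorkgroup.
  assert(addressSpaceToStorageClass(5, STI) ==
         (HasUSM ? SPIRV::StorageClass::DeviceOnlyINTEL
                 : SPIRV::StorageClass::CrossWorkgroup));
  assert(addressSpaceToStorageClass(6, STI) ==
         (HasUSM ? SPIRV::StorageClass::HostOnlyINTEL
                 : SPIRV::StorageClass::CrossWorkgroup));
  // The reverse mapping is unconditional: the new storage classes always map
  // back to address spaces 5 and 6.
  assert(storageClassToAddressSpace(SPIRV::StorageClass::DeviceOnlyINTEL) == 5);
  assert(storageClassToAddressSpace(SPIRV::StorageClass::HostOnlyINTEL) == 6);
}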
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index a33dc02..1af53dc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -27,6 +27,7 @@ class MachineRegisterInfo;
class Register;
class StringRef;
class SPIRVInstrInfo;
+class SPIRVSubtarget;
// Add the given string as a series of integer operand, inserting null
// terminators and padding to make sure the operands all have 32-bit
@@ -62,7 +63,7 @@ unsigned storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC);
// Convert an LLVM IR address space to a SPIR-V storage class.
SPIRV::StorageClass::StorageClass
-addressSpaceToStorageClass(unsigned AddrSpace);
+addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI);
SPIRV::MemorySemantics::MemorySemantics
getMemSemanticsForStorageClass(SPIRV::StorageClass::StorageClass SC);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 7c47790..36f0679 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -43,6 +43,8 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-lower"
+extern cl::opt<bool> WasmEmitMultiValue;
+
WebAssemblyTargetLowering::WebAssemblyTargetLowering(
const TargetMachine &TM, const WebAssemblySubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -1288,7 +1290,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn(
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext & /*Context*/) const {
// WebAssembly can only handle returning tuples with multivalue enabled
- return Subtarget->hasMultivalue() || Outs.size() <= 1;
+ return (Subtarget->hasMultivalue() && WasmEmitMultiValue) || Outs.size() <= 1;
}
SDValue WebAssemblyTargetLowering::LowerReturn(
@@ -1296,7 +1298,8 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const {
- assert((Subtarget->hasMultivalue() || Outs.size() <= 1) &&
+ assert(((Subtarget->hasMultivalue() && WasmEmitMultiValue) ||
+ Outs.size() <= 1) &&
"MVP WebAssembly can only return up to one value");
if (!callingConvSupported(CallConv))
fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index 1e95911..b969b83 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -22,6 +22,8 @@
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
+extern cl::opt<bool> WasmEmitMultiValue;
+
WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor.
MachineFunctionInfo *WebAssemblyFunctionInfo::clone(
@@ -71,7 +73,8 @@ void llvm::computeSignatureVTs(const FunctionType *Ty,
MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits());
if (Results.size() > 1 &&
- !TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue()) {
+ (!TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue() ||
+ !WasmEmitMultiValue)) {
// WebAssembly can't lower returns of multiple values without demoting to
// sret unless multivalue is enabled (see
// WebAssemblyTargetLowering::CanLowerReturn). So replace multiple return
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 3e2e029..2a84c90 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -24,6 +24,8 @@
using namespace llvm;
+extern cl::opt<bool> WasmEmitMultiValue;
+
namespace {
enum RuntimeLibcallSignature {
@@ -694,7 +696,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(PtrTy);
break;
case i64_i64_func_f32:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -703,7 +705,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::F32);
break;
case i64_i64_func_f64:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -712,7 +714,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::F64);
break;
case i16_i16_func_i16_i16:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I32);
Rets.push_back(wasm::ValType::I32);
} else {
@@ -722,7 +724,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I32);
break;
case i32_i32_func_i32_i32:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I32);
Rets.push_back(wasm::ValType::I32);
} else {
@@ -732,7 +734,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I32);
break;
case i64_i64_func_i64_i64:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -742,7 +744,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i64_i64_i64_i64:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -754,7 +756,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i64_i64_i64_i64_iPTR:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -767,7 +769,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(PtrTy);
break;
case i64_i64_i64_i64_func_i64_i64_i64_i64:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
@@ -781,7 +783,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i64_i64_i32:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -851,7 +853,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i64_i64_i64_i64_i64_i64:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -865,7 +867,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i32:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -874,7 +876,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I32);
break;
case i64_i64_func_i64:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 42043a7..3120b6b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -54,6 +54,15 @@ static cl::opt<bool> WasmDisableFixIrreducibleControlFlowPass(
" irreducible control flow optimization pass"),
cl::init(false));
+// A temporary option to control emission of multivalue until the multivalue
+// implementation is stable enough. We currently don't emit multivalue by
+// default even if the feature section allows it.
+// TODO: Stabilize multivalue and delete this option.
+cl::opt<bool>
+ WasmEmitMultiValue("wasm-emit-multivalue", cl::Hidden,
+ cl::desc("WebAssembly: Emit multivalue in the backend"),
+ cl::init(false));
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
// Register the target.
RegisterTargetMachine<WebAssemblyTargetMachine> X(
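
Because the new flag has to be consulted together with the subtarget feature at every multivalue decision point, the same `hasMultivalue() && WasmEmitMultiValue` expression is repeated across the lowering and runtime-libcall changes above. A hedged sketch of a helper that would centralize the gate is shown below; `shouldEmitMultiValue` is a hypothetical name introduced here for illustration and is not part of the patch.

#include "WebAssemblySubtarget.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

// Defined in WebAssemblyTargetMachine.cpp; declared extern at each use site in
// the patch above.
extern cl::opt<bool> WasmEmitMultiValue;

static bool shouldEmitMultiValue(const WebAssemblySubtarget &Subtarget) {
  // Multivalue results are emitted only when the subtarget feature is present
  // *and* the temporary -wasm-emit-multivalue flag is set.
  return Subtarget.hasMultivalue() && WasmEmitMultiValue;
}

A call such as `shouldEmitMultiValue(Subtarget)` would then replace each of the repeated conditions.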
diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
index 4a11dd2..a620ba9 100644
--- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
+++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
@@ -47,10 +47,9 @@ Error X86CodeGenPassBuilder::addInstSelector(AddMachinePass &) const {
} // namespace
Error X86TargetMachine::buildCodeGenPipeline(
- ModulePassManager &MPM, MachineFunctionPassManager &MFPM,
- MachineFunctionAnalysisManager &, raw_pwrite_stream &Out,
- raw_pwrite_stream *DwoOut, CodeGenFileType FileType,
- CGPassBuilderOption Opt, PassInstrumentationCallbacks *PIC) {
+ ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
+ CodeGenFileType FileType, CGPassBuilderOption Opt,
+ PassInstrumentationCallbacks *PIC) {
auto CGPB = X86CodeGenPassBuilder(*this, Opt, PIC);
- return CGPB.buildPipeline(MPM, MFPM, Out, DwoOut, FileType);
+ return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
}
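
For reference, a minimal sketch of how driver code might invoke the slimmed-down entry point: only the module-level pass manager is threaded through now that the MachineFunctionPassManager and MachineFunctionAnalysisManager parameters are gone. The helper below is illustrative only (`emitWithNewPM` is a hypothetical name) and assumes the caller already owns an X86TargetMachine and an output stream.

#include "X86TargetMachine.h"
#include "llvm/CodeGen/CGPassBuilderOption.h"
#include "llvm/IR/PassInstrumentation.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static Error emitWithNewPM(X86TargetMachine &TM, raw_pwrite_stream &Out) {
  ModulePassManager MPM;
  PassInstrumentationCallbacks PIC;
  // Build the codegen pipeline into MPM; the caller is then expected to run
  // MPM over the module with a suitably registered ModuleAnalysisManager.
  return TM.buildCodeGenPipeline(MPM, Out, /*DwoOut=*/nullptr,
                                 CodeGenFileType::ObjectFile,
                                 getCGPassBuilderOption(), &PIC);
}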
diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h
index f31c971..0fd3e47 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/llvm/lib/Target/X86/X86TargetMachine.h
@@ -58,10 +58,9 @@ public:
createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
const TargetSubtargetInfo *STI) const override;
- Error buildCodeGenPipeline(ModulePassManager &, MachineFunctionPassManager &,
- MachineFunctionAnalysisManager &,
- raw_pwrite_stream &, raw_pwrite_stream *,
- CodeGenFileType, CGPassBuilderOption,
+ Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &,
+ raw_pwrite_stream *, CodeGenFileType,
+ CGPassBuilderOption,
PassInstrumentationCallbacks *) override;
bool isJIT() const { return IsJIT; }