aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Analysis/AliasAnalysis.cpp2
-rwxr-xr-xllvm/lib/Analysis/ConstantFolding.cpp44
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp248
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp6
-rw-r--r--llvm/lib/BinaryFormat/Dwarf.cpp7
-rw-r--r--llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp4
-rw-r--r--llvm/lib/CGData/OutlinedHashTreeRecord.cpp4
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp3
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp2
-rw-r--r--llvm/lib/CodeGen/CodeGenPrepare.cpp2
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp4
-rw-r--r--llvm/lib/CodeGen/MachineOperand.cpp8
-rw-r--r--llvm/lib/CodeGen/MachineScheduler.cpp4
-rw-r--r--llvm/lib/CodeGen/MachineStableHash.cpp3
-rw-r--r--llvm/lib/CodeGen/RegAllocFast.cpp2
-rw-r--r--llvm/lib/CodeGen/RegisterCoalescer.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp7
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp20
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp8
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp25
-rw-r--r--llvm/lib/CodeGenTypes/LowLevelType.cpp6
-rw-r--r--llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp3
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt4
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp370
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp1161
-rw-r--r--llvm/lib/FileCheck/FileCheckImpl.h2
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp13
-rw-r--r--llvm/lib/MC/MCParser/AsmParser.cpp10
-rw-r--r--llvm/lib/MC/MCParser/ELFAsmParser.cpp18
-rw-r--r--llvm/lib/MC/MCParser/MasmParser.cpp16
-rw-r--r--llvm/lib/Object/WindowsMachineFlag.cpp4
-rw-r--r--llvm/lib/Remarks/RemarkFormat.cpp2
-rw-r--r--llvm/lib/Support/AArch64BuildAttributes.cpp4
-rw-r--r--llvm/lib/Support/APFloat.cpp6
-rw-r--r--llvm/lib/Support/BranchProbability.cpp7
-rw-r--r--llvm/lib/Support/Windows/Signals.inc7
-rw-r--r--llvm/lib/Support/raw_ostream.cpp11
-rw-r--r--llvm/lib/Support/raw_socket_stream.cpp2
-rw-r--r--llvm/lib/TableGen/TGLexer.cpp6
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp13
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp84
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp21
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp22
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/MIMGInstructions.td4
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp28
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp84
-rw-r--r--llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp13
-rw-r--r--llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp6
-rw-r--r--llvm/lib/Target/CSKY/CSKYISelLowering.cpp74
-rw-r--r--llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp3
-rw-r--r--llvm/lib/Target/DirectX/DirectXTargetMachine.cpp3
-rw-r--r--llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp3
-rw-r--r--llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepIICHVX.td132
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td391
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td116
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPatterns.td13
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp8
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td6
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td5
-rw-r--r--llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp6
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp2
-rw-r--r--llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp129
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h1
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td1
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td118
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXSubtarget.h21
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h2
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp70
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td4
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoD.td1
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoF.td1
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp31
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp8
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp9
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp3
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp61
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h3
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp80
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86Operand.h31
-rw-r--r--llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp5
-rw-r--r--llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h7
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp19
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h1
-rw-r--r--llvm/lib/Target/X86/X86.td6
-rw-r--r--llvm/lib/Target/X86/X86ExpandPseudo.cpp190
-rw-r--r--llvm/lib/Target/X86/X86FastPreTileConfig.cpp40
-rw-r--r--llvm/lib/Target/X86/X86FastTileConfig.cpp25
-rw-r--r--llvm/lib/Target/X86/X86ISelDAGToDAG.cpp78
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp344
-rw-r--r--llvm/lib/Target/X86/X86InstrAMX.td208
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp13
-rw-r--r--llvm/lib/Target/X86/X86InstrOperands.td7
-rw-r--r--llvm/lib/Target/X86/X86InstrPredicates.td1
-rw-r--r--llvm/lib/Target/X86/X86LowerAMXType.cpp203
-rw-r--r--llvm/lib/Target/X86/X86PreTileConfig.cpp26
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.cpp70
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.td9
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp2
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.h2
-rw-r--r--llvm/lib/Target/X86/X86TileConfig.cpp83
-rw-r--r--llvm/lib/TargetParser/Host.cpp1
-rw-r--r--llvm/lib/TargetParser/PPCTargetParser.cpp8
-rw-r--r--llvm/lib/TargetParser/X86TargetParser.cpp3
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemProfUse.cpp35
-rw-r--r--llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp143
-rw-r--r--llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp30
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnroll.cpp59
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp101
-rw-r--r--llvm/lib/Transforms/Utils/LoopUtils.cpp48
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp29
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h27
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp18
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp21
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h46
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp7
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp9
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp66
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp19
134 files changed, 3570 insertions, 2160 deletions
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp
index f2dc25f..26a5602 100644
--- a/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -75,7 +75,7 @@ AAResults::AAResults(const TargetLibraryInfo &TLI) : TLI(TLI) {}
AAResults::AAResults(AAResults &&Arg)
: TLI(Arg.TLI), AAs(std::move(Arg.AAs)), AADeps(std::move(Arg.AADeps)) {}
-AAResults::~AAResults() {}
+AAResults::~AAResults() = default;
bool AAResults::invalidate(Function &F, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &Inv) {
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index e9e2e7d..da32542 100755
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2163,18 +2163,42 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
}
Constant *constantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) {
- FixedVectorType *VT = dyn_cast<FixedVectorType>(Op->getType());
- if (!VT)
- return nullptr;
-
- // This isn't strictly necessary, but handle the special/common case of zero:
- // all integer reductions of a zero input produce zero.
- if (isa<ConstantAggregateZero>(Op))
- return ConstantInt::get(VT->getElementType(), 0);
+ auto *OpVT = cast<VectorType>(Op->getType());
// This is the same as the underlying binops - poison propagates.
- if (isa<PoisonValue>(Op) || Op->containsPoisonElement())
- return PoisonValue::get(VT->getElementType());
+ if (Op->containsPoisonElement())
+ return PoisonValue::get(OpVT->getElementType());
+
+ // Shortcut non-accumulating reductions.
+ if (Constant *SplatVal = Op->getSplatValue()) {
+ switch (IID) {
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_umax:
+ return SplatVal;
+ case Intrinsic::vector_reduce_add:
+ if (SplatVal->isNullValue())
+ return SplatVal;
+ break;
+ case Intrinsic::vector_reduce_mul:
+ if (SplatVal->isNullValue() || SplatVal->isOneValue())
+ return SplatVal;
+ break;
+ case Intrinsic::vector_reduce_xor:
+ if (SplatVal->isNullValue())
+ return SplatVal;
+ if (OpVT->getElementCount().isKnownMultipleOf(2))
+ return Constant::getNullValue(OpVT->getElementType());
+ break;
+ }
+ }
+
+ FixedVectorType *VT = dyn_cast<FixedVectorType>(OpVT);
+ if (!VT)
+ return nullptr;
// TODO: Handle undef.
auto *EltC = dyn_cast_or_null<ConstantInt>(Op->getAggregateElement(0U));
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 7597f3a..a31f17b 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -2424,10 +2424,10 @@ ScalarEvolution::getStrengthenedNoWrapFlagsFromBinOp(
// We're trying to construct a SCEV of type `Type' with `Ops' as operands and
// `OldFlags' as can't-wrap behavior. Infer a more aggressive set of
// can't-overflow flags for the operation if possible.
-static SCEV::NoWrapFlags
-StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
- const ArrayRef<const SCEV *> Ops,
- SCEV::NoWrapFlags Flags) {
+static SCEV::NoWrapFlags StrengthenNoWrapFlags(ScalarEvolution *SE,
+ SCEVTypes Type,
+ ArrayRef<const SCEV *> Ops,
+ SCEV::NoWrapFlags Flags) {
using namespace std::placeholders;
using OBO = OverflowingBinaryOperator;
@@ -2540,7 +2540,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
unsigned Idx = isa<SCEVConstant>(Ops[0]) ? 1 : 0;
// Delay expensive flag strengthening until necessary.
- auto ComputeFlags = [this, OrigFlags](const ArrayRef<const SCEV *> Ops) {
+ auto ComputeFlags = [this, OrigFlags](ArrayRef<const SCEV *> Ops) {
return StrengthenNoWrapFlags(this, scAddExpr, Ops, OrigFlags);
};
@@ -3125,7 +3125,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
return Folded;
// Delay expensive flag strengthening until necessary.
- auto ComputeFlags = [this, OrigFlags](const ArrayRef<const SCEV *> Ops) {
+ auto ComputeFlags = [this, OrigFlags](ArrayRef<const SCEV *> Ops) {
return StrengthenNoWrapFlags(this, scMulExpr, Ops, OrigFlags);
};
@@ -15510,6 +15510,78 @@ static const SCEV *getNextSCEVDivisibleByDivisor(const SCEV *Expr,
return SE.getConstant(*ExprVal + DivisorVal - Rem);
}
+static bool collectDivisibilityInformation(
+ ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS,
+ DenseMap<const SCEV *, const SCEV *> &DivInfo,
+ DenseMap<const SCEV *, APInt> &Multiples, ScalarEvolution &SE) {
+ // If we have LHS == 0, check if LHS is computing a property of some unknown
+ // SCEV %v which we can rewrite %v to express explicitly.
+ if (Predicate != CmpInst::ICMP_EQ || !match(RHS, m_scev_Zero()))
+ return false;
+ // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
+ // explicitly express that.
+ const SCEVUnknown *URemLHS = nullptr;
+ const SCEV *URemRHS = nullptr;
+ if (!match(LHS, m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE)))
+ return false;
+
+ const SCEV *Multiple =
+ SE.getMulExpr(SE.getUDivExpr(URemLHS, URemRHS), URemRHS);
+ DivInfo[URemLHS] = Multiple;
+ if (auto *C = dyn_cast<SCEVConstant>(URemRHS))
+ Multiples[URemLHS] = C->getAPInt();
+ return true;
+}
+
+// Check if the condition is a divisibility guard (A % B == 0).
+static bool isDivisibilityGuard(const SCEV *LHS, const SCEV *RHS,
+ ScalarEvolution &SE) {
+ const SCEV *X, *Y;
+ return match(LHS, m_scev_URem(m_SCEV(X), m_SCEV(Y), SE)) && RHS->isZero();
+}
+
+// Apply divisibility by \p Divisor on MinMaxExpr with constant values,
+// recursively. This is done by aligning up/down the constant value to the
+// Divisor.
+static const SCEV *applyDivisibilityOnMinMaxExpr(const SCEV *MinMaxExpr,
+ APInt Divisor,
+ ScalarEvolution &SE) {
+ // Return true if \p Expr is a MinMax SCEV expression with a non-negative
+ // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS
+ // the non-constant operand and in \p LHS the constant operand.
+ auto IsMinMaxSCEVWithNonNegativeConstant =
+ [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS,
+ const SCEV *&RHS) {
+ if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) {
+ if (MinMax->getNumOperands() != 2)
+ return false;
+ if (auto *C = dyn_cast<SCEVConstant>(MinMax->getOperand(0))) {
+ if (C->getAPInt().isNegative())
+ return false;
+ SCTy = MinMax->getSCEVType();
+ LHS = MinMax->getOperand(0);
+ RHS = MinMax->getOperand(1);
+ return true;
+ }
+ }
+ return false;
+ };
+
+ const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr;
+ SCEVTypes SCTy;
+ if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS,
+ MinMaxRHS))
+ return MinMaxExpr;
+ auto IsMin = isa<SCEVSMinExpr>(MinMaxExpr) || isa<SCEVUMinExpr>(MinMaxExpr);
+ assert(SE.isKnownNonNegative(MinMaxLHS) && "Expected non-negative operand!");
+ auto *DivisibleExpr =
+ IsMin ? getPreviousSCEVDivisibleByDivisor(MinMaxLHS, Divisor, SE)
+ : getNextSCEVDivisibleByDivisor(MinMaxLHS, Divisor, SE);
+ SmallVector<const SCEV *> Ops = {
+ applyDivisibilityOnMinMaxExpr(MinMaxRHS, Divisor, SE), DivisibleExpr};
+ return SE.getMinMaxExpr(SCTy, Ops);
+}
+
void ScalarEvolution::LoopGuards::collectFromBlock(
ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
const BasicBlock *Block, const BasicBlock *Pred,
@@ -15520,19 +15592,13 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
SmallVector<const SCEV *> ExprsToRewrite;
auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
const SCEV *RHS,
- DenseMap<const SCEV *, const SCEV *>
- &RewriteMap) {
+ DenseMap<const SCEV *, const SCEV *> &RewriteMap,
+ const LoopGuards &DivGuards) {
// WARNING: It is generally unsound to apply any wrap flags to the proposed
// replacement SCEV which isn't directly implied by the structure of that
// SCEV. In particular, using contextual facts to imply flags is *NOT*
// legal. See the scoping rules for flags in the header to understand why.
- // If LHS is a constant, apply information to the other expression.
- if (isa<SCEVConstant>(LHS)) {
- std::swap(LHS, RHS);
- Predicate = CmpInst::getSwappedPredicate(Predicate);
- }
-
// Check for a condition of the form (-C1 + X < C2). InstCombine will
// create this form when combining two checks of the form (X u< C2 + C1) and
// (X >=u C1).
@@ -15565,67 +15631,6 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
if (MatchRangeCheckIdiom())
return;
- // Return true if \p Expr is a MinMax SCEV expression with a non-negative
- // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS
- // the non-constant operand and in \p LHS the constant operand.
- auto IsMinMaxSCEVWithNonNegativeConstant =
- [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS,
- const SCEV *&RHS) {
- const APInt *C;
- SCTy = Expr->getSCEVType();
- return match(Expr, m_scev_MinMax(m_SCEV(LHS), m_SCEV(RHS))) &&
- match(LHS, m_scev_APInt(C)) && C->isNonNegative();
- };
-
- // Apply divisibilty by \p Divisor on MinMaxExpr with constant values,
- // recursively. This is done by aligning up/down the constant value to the
- // Divisor.
- std::function<const SCEV *(const SCEV *, const SCEV *)>
- ApplyDivisibiltyOnMinMaxExpr = [&](const SCEV *MinMaxExpr,
- const SCEV *Divisor) {
- auto *ConstDivisor = dyn_cast<SCEVConstant>(Divisor);
- if (!ConstDivisor)
- return MinMaxExpr;
- const APInt &DivisorVal = ConstDivisor->getAPInt();
-
- const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr;
- SCEVTypes SCTy;
- if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS,
- MinMaxRHS))
- return MinMaxExpr;
- auto IsMin =
- isa<SCEVSMinExpr>(MinMaxExpr) || isa<SCEVUMinExpr>(MinMaxExpr);
- assert(SE.isKnownNonNegative(MinMaxLHS) &&
- "Expected non-negative operand!");
- auto *DivisibleExpr =
- IsMin
- ? getPreviousSCEVDivisibleByDivisor(MinMaxLHS, DivisorVal, SE)
- : getNextSCEVDivisibleByDivisor(MinMaxLHS, DivisorVal, SE);
- SmallVector<const SCEV *> Ops = {
- ApplyDivisibiltyOnMinMaxExpr(MinMaxRHS, Divisor), DivisibleExpr};
- return SE.getMinMaxExpr(SCTy, Ops);
- };
-
- // If we have LHS == 0, check if LHS is computing a property of some unknown
- // SCEV %v which we can rewrite %v to express explicitly.
- if (Predicate == CmpInst::ICMP_EQ && match(RHS, m_scev_Zero())) {
- // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
- // explicitly express that.
- const SCEVUnknown *URemLHS = nullptr;
- const SCEV *URemRHS = nullptr;
- if (match(LHS,
- m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE))) {
- auto I = RewriteMap.find(URemLHS);
- const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : URemLHS;
- RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS);
- const auto *Multiple =
- SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS);
- RewriteMap[URemLHS] = Multiple;
- ExprsToRewrite.push_back(URemLHS);
- return;
- }
- }
-
// Do not apply information for constants or if RHS contains an AddRec.
if (isa<SCEVConstant>(LHS) || SE.containsAddRecurrence(RHS))
return;
@@ -15655,7 +15660,9 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
};
const SCEV *RewrittenLHS = GetMaybeRewritten(LHS);
- const APInt &DividesBy = SE.getConstantMultiple(RewrittenLHS);
+ // Apply divisibility information when computing the constant multiple.
+ const APInt &DividesBy =
+ SE.getConstantMultiple(DivGuards.rewrite(RewrittenLHS));
// Collect rewrites for LHS and its transitive operands based on the
// condition.
@@ -15670,31 +15677,31 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
// predicate.
const SCEV *One = SE.getOne(RHS->getType());
switch (Predicate) {
- case CmpInst::ICMP_ULT:
- if (RHS->getType()->isPointerTy())
- return;
- RHS = SE.getUMaxExpr(RHS, One);
- [[fallthrough]];
- case CmpInst::ICMP_SLT: {
- RHS = SE.getMinusSCEV(RHS, One);
- RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE);
- break;
- }
- case CmpInst::ICMP_UGT:
- case CmpInst::ICMP_SGT:
- RHS = SE.getAddExpr(RHS, One);
- RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE);
- break;
- case CmpInst::ICMP_ULE:
- case CmpInst::ICMP_SLE:
- RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE);
- break;
- case CmpInst::ICMP_UGE:
- case CmpInst::ICMP_SGE:
- RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE);
- break;
- default:
- break;
+ case CmpInst::ICMP_ULT:
+ if (RHS->getType()->isPointerTy())
+ return;
+ RHS = SE.getUMaxExpr(RHS, One);
+ [[fallthrough]];
+ case CmpInst::ICMP_SLT: {
+ RHS = SE.getMinusSCEV(RHS, One);
+ RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE);
+ break;
+ }
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_SGT:
+ RHS = SE.getAddExpr(RHS, One);
+ RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE);
+ break;
+ case CmpInst::ICMP_ULE:
+ case CmpInst::ICMP_SLE:
+ RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE);
+ break;
+ case CmpInst::ICMP_UGE:
+ case CmpInst::ICMP_SGE:
+ RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE);
+ break;
+ default:
+ break;
}
SmallVector<const SCEV *, 16> Worklist(1, LHS);
@@ -15840,8 +15847,11 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
// Now apply the information from the collected conditions to
// Guards.RewriteMap. Conditions are processed in reverse order, so the
- // earliest conditions is processed first. This ensures the SCEVs with the
+ // earliest conditions is processed first, except guards with divisibility
+ // information, which are moved to the back. This ensures the SCEVs with the
// shortest dependency chains are constructed first.
+ SmallVector<std::tuple<CmpInst::Predicate, const SCEV *, const SCEV *>>
+ GuardsToProcess;
for (auto [Term, EnterIfTrue] : reverse(Terms)) {
SmallVector<Value *, 8> Worklist;
SmallPtrSet<Value *, 8> Visited;
@@ -15856,7 +15866,14 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
EnterIfTrue ? Cmp->getPredicate() : Cmp->getInversePredicate();
const auto *LHS = SE.getSCEV(Cmp->getOperand(0));
const auto *RHS = SE.getSCEV(Cmp->getOperand(1));
- CollectCondition(Predicate, LHS, RHS, Guards.RewriteMap);
+ // If LHS is a constant, apply information to the other expression.
+ // TODO: If LHS is not a constant, check if using CompareSCEVComplexity
+ // can improve results.
+ if (isa<SCEVConstant>(LHS)) {
+ std::swap(LHS, RHS);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ GuardsToProcess.emplace_back(Predicate, LHS, RHS);
continue;
}
@@ -15869,6 +15886,31 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
}
}
+ // Process divisibility guards in reverse order to populate DivGuards early.
+ DenseMap<const SCEV *, APInt> Multiples;
+ LoopGuards DivGuards(SE);
+ for (const auto &[Predicate, LHS, RHS] : GuardsToProcess) {
+ if (!isDivisibilityGuard(LHS, RHS, SE))
+ continue;
+ collectDivisibilityInformation(Predicate, LHS, RHS, DivGuards.RewriteMap,
+ Multiples, SE);
+ }
+
+ for (const auto &[Predicate, LHS, RHS] : GuardsToProcess)
+ CollectCondition(Predicate, LHS, RHS, Guards.RewriteMap, DivGuards);
+
+ // Apply divisibility information last. This ensures it is applied to the
+ // outermost expression after other rewrites for the given value.
+ for (const auto &[K, Divisor] : Multiples) {
+ const SCEV *DivisorSCEV = SE.getConstant(Divisor);
+ Guards.RewriteMap[K] =
+ SE.getMulExpr(SE.getUDivExpr(applyDivisibilityOnMinMaxExpr(
+ Guards.rewrite(K), Divisor, SE),
+ DivisorSCEV),
+ DivisorSCEV);
+ ExprsToRewrite.push_back(K);
+ }
+
// Let the rewriter preserve NUW/NSW flags if the unsigned/signed ranges of
// the replacement expressions are contained in the ranges of the replaced
// expressions.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c47a1c1..0426ac7 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1353,9 +1353,9 @@ TargetTransformInfo::getInlineCallPenalty(const Function *F,
return TTIImpl->getInlineCallPenalty(F, Call, DefaultCallPenalty);
}
-bool TargetTransformInfo::areTypesABICompatible(
- const Function *Caller, const Function *Callee,
- const ArrayRef<Type *> &Types) const {
+bool TargetTransformInfo::areTypesABICompatible(const Function *Caller,
+ const Function *Callee,
+ ArrayRef<Type *> Types) const {
return TTIImpl->areTypesABICompatible(Caller, Callee, Types);
}
diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp
index 55fa2df..a6c7e6a 100644
--- a/llvm/lib/BinaryFormat/Dwarf.cpp
+++ b/llvm/lib/BinaryFormat/Dwarf.cpp
@@ -1076,10 +1076,3 @@ StringRef (*const llvm::dwarf::EnumTraits<LineNumberOps>::StringFn)(unsigned) =
LNStandardString;
StringRef (*const llvm::dwarf::EnumTraits<Index>::StringFn)(unsigned) =
IndexString;
-
-constexpr char llvm::dwarf::EnumTraits<Attribute>::Type[];
-constexpr char llvm::dwarf::EnumTraits<Form>::Type[];
-constexpr char llvm::dwarf::EnumTraits<Index>::Type[];
-constexpr char llvm::dwarf::EnumTraits<Tag>::Type[];
-constexpr char llvm::dwarf::EnumTraits<LineNumberOps>::Type[];
-constexpr char llvm::dwarf::EnumTraits<LocationAtom>::Type[];
diff --git a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp
index 3de3dcc..80b421d 100644
--- a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp
+++ b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp
@@ -209,12 +209,12 @@ template <> struct CustomMappingTraits<MapDocNode> {
static void inputOne(IO &IO, StringRef Key, MapDocNode &M) {
ScalarDocNode KeyObj = M.getDocument()->getNode();
KeyObj.fromString(Key, "");
- IO.mapRequired(Key.str().c_str(), M.getMap()[KeyObj]);
+ IO.mapRequired(Key, M.getMap()[KeyObj]);
}
static void output(IO &IO, MapDocNode &M) {
for (auto I : M.getMap()) {
- IO.mapRequired(I.first.toString().c_str(), I.second);
+ IO.mapRequired(I.first.toString(), I.second);
}
}
};
diff --git a/llvm/lib/CGData/OutlinedHashTreeRecord.cpp b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
index cc76063..2b6e2f0 100644
--- a/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
+++ b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
@@ -37,7 +37,7 @@ template <> struct MappingTraits<HashNodeStable> {
template <> struct CustomMappingTraits<IdHashNodeStableMapTy> {
static void inputOne(IO &io, StringRef Key, IdHashNodeStableMapTy &V) {
HashNodeStable NodeStable;
- io.mapRequired(Key.str().c_str(), NodeStable);
+ io.mapRequired(Key, NodeStable);
unsigned Id;
if (Key.getAsInteger(0, Id)) {
io.setError("Id not an integer");
@@ -48,7 +48,7 @@ template <> struct CustomMappingTraits<IdHashNodeStableMapTy> {
static void output(IO &io, IdHashNodeStableMapTy &V) {
for (auto Iter = V.begin(); Iter != V.end(); ++Iter)
- io.mapRequired(utostr(Iter->first).c_str(), Iter->second);
+ io.mapRequired(utostr(Iter->first), Iter->second);
}
};
diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
index 171fb83..98cdada 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
@@ -112,8 +112,7 @@ void DbgValueHistoryMap::Entry::endEntry(EntryIndex Index) {
/// to the first intersecting scope range if one exists.
static std::optional<ArrayRef<InsnRange>::iterator>
intersects(const MachineInstr *StartMI, const MachineInstr *EndMI,
- const ArrayRef<InsnRange> &Ranges,
- const InstructionOrdering &Ordering) {
+ ArrayRef<InsnRange> Ranges, const InstructionOrdering &Ordering) {
for (auto RangesI = Ranges.begin(), RangesE = Ranges.end();
RangesI != RangesE; ++RangesI) {
if (EndMI && Ordering.isBefore(EndMI, RangesI->first))
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 555c56f..b16e1315 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1120,7 +1120,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
constructMemberDIE(Buffer, DDTy);
}
} else if (auto *Property = dyn_cast<DIObjCProperty>(Element)) {
- DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer);
+ DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer, Property);
StringRef PropertyName = Property->getName();
addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName);
if (Property->getType())
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 8ea1326..0309e22 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -368,7 +368,7 @@ class CodeGenPrepare {
std::unique_ptr<DominatorTree> DT;
public:
- CodeGenPrepare(){};
+ CodeGenPrepare() = default;
CodeGenPrepare(const TargetMachine *TM) : TM(TM){};
/// If encounter huge function, we need to limit the build time.
bool IsHugeFunc = false;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 9ace7d6..ec4d13f 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -589,8 +589,8 @@ bool CombinerHelper::matchCombineShuffleVector(
return true;
}
-void CombinerHelper::applyCombineShuffleVector(
- MachineInstr &MI, const ArrayRef<Register> Ops) const {
+void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
+ ArrayRef<Register> Ops) const {
Register DstReg = MI.getOperand(0).getReg();
Builder.setInsertPt(*MI.getParent(), MI);
Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index bb9c76f..8c6d219 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -363,8 +363,9 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
case MachineOperand::MO_RegisterMask:
case MachineOperand::MO_RegisterLiveOut: {
// Shallow compare of the two RegMasks
- const uint32_t *RegMask = getRegMask();
- const uint32_t *OtherRegMask = Other.getRegMask();
+ const uint32_t *RegMask = isRegMask() ? getRegMask() : getRegLiveOut();
+ const uint32_t *OtherRegMask =
+ isRegMask() ? Other.getRegMask() : Other.getRegLiveOut();
if (RegMask == OtherRegMask)
return true;
@@ -434,7 +435,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) {
if (const MachineFunction *MF = getMFIfAvailable(MO)) {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
- const uint32_t *RegMask = MO.getRegMask();
+ const uint32_t *RegMask =
+ MO.isRegMask() ? MO.getRegMask() : MO.getRegLiveOut();
std::vector<stable_hash> RegMaskHashes(RegMask, RegMask + RegMaskSize);
return hash_combine(MO.getType(), MO.getTargetFlags(),
stable_hash_combine(RegMaskHashes));
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 3ed1045..f18c051 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -334,7 +334,7 @@ public:
LiveIntervals &LIS;
};
- MachineSchedulerImpl() {}
+ MachineSchedulerImpl() = default;
// Migration only
void setLegacyPass(MachineFunctionPass *P) { this->P = P; }
void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; }
@@ -358,7 +358,7 @@ public:
MachineLoopInfo &MLI;
AAResults &AA;
};
- PostMachineSchedulerImpl() {}
+ PostMachineSchedulerImpl() = default;
// Migration only
void setLegacyPass(MachineFunctionPass *P) { this->P = P; }
void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; }
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp
index 9d56696..6da708d 100644
--- a/llvm/lib/CodeGen/MachineStableHash.cpp
+++ b/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -136,7 +136,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
unsigned RegMaskSize =
MachineOperand::getRegMaskSize(TRI->getNumRegs());
- const uint32_t *RegMask = MO.getRegMask();
+ const uint32_t *RegMask =
+ MO.isRegMask() ? MO.getRegMask() : MO.getRegLiveOut();
std::vector<llvm::stable_hash> RegMaskHashes(RegMask,
RegMask + RegMaskSize);
return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 697b779..ec6ffd4 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -206,7 +206,7 @@ private:
bool Error = false; ///< Could not allocate.
explicit LiveReg(Register VirtReg) : VirtReg(VirtReg) {}
- explicit LiveReg() {}
+ explicit LiveReg() = default;
unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); }
};
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index e17a214b..38f6deb 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -378,7 +378,7 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate {
public:
// For legacy pass only.
- RegisterCoalescer() {}
+ RegisterCoalescer() = default;
RegisterCoalescer &operator=(RegisterCoalescer &&Other) = default;
RegisterCoalescer(LiveIntervals *LIS, SlotIndexes *SI,
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 893556b..46c4bb8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9374,7 +9374,7 @@ static unsigned bigEndianByteAt(unsigned BW, unsigned i) {
// Check if the bytes offsets we are looking at match with either big or
// little endian value loaded. Return true for big endian, false for little
// endian, and std::nullopt if match failed.
-static std::optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets,
+static std::optional<bool> isBigEndian(ArrayRef<int64_t> ByteOffsets,
int64_t FirstOffset) {
// The endian can be decided only when it is 2 bytes at least.
unsigned Width = ByteOffsets.size();
@@ -22760,7 +22760,10 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) {
NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(COffset), DL);
PointerInfo = ST->getPointerInfo().getWithOffset(COffset);
} else {
- NewPtr = TLI.getVectorElementPointer(DAG, Ptr, Value.getValueType(), Idx);
+ // The original DAG loaded the entire vector from memory, so arithmetic
+ // within it must be inbounds.
+ NewPtr = TLI.getInboundsVectorElementPointer(DAG, Ptr, Value.getValueType(),
+ Idx);
}
return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index bf1abfe..58983cb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1172,6 +1172,12 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
case ISD::FAKE_USE:
Res = SoftenFloatOp_FAKE_USE(N);
break;
+ case ISD::STACKMAP:
+ Res = SoftenFloatOp_STACKMAP(N, OpNo);
+ break;
+ case ISD::PATCHPOINT:
+ Res = SoftenFloatOp_PATCHPOINT(N, OpNo);
+ break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -1512,6 +1518,20 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FAKE_USE(SDNode *N) {
N->getOperand(0), Op1);
}
+SDValue DAGTypeLegalizer::SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo) {
+ assert(OpNo > 1); // Because the first two arguments are guaranteed legal.
+ SmallVector<SDValue> NewOps(N->ops());
+ NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]);
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
+ assert(OpNo >= 7);
+ SmallVector<SDValue> NewOps(N->ops());
+ NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]);
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
//===----------------------------------------------------------------------===//
// Float Result Expansion
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index b1776ea..44e5a18 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2871,18 +2871,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SET_ROUNDING(SDNode *N) {
SDValue DAGTypeLegalizer::PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo) {
assert(OpNo > 1); // Because the first two arguments are guaranteed legal.
SmallVector<SDValue> NewOps(N->ops());
- SDValue Operand = N->getOperand(OpNo);
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType());
- NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand);
+ NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]);
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
assert(OpNo >= 7);
SmallVector<SDValue> NewOps(N->ops());
- SDValue Operand = N->getOperand(OpNo);
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType());
- NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand);
+ NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]);
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9656a30..ede522e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -658,6 +658,8 @@ private:
SDValue SoftenFloatOp_ATOMIC_STORE(SDNode *N, unsigned OpNo);
SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N);
SDValue SoftenFloatOp_FAKE_USE(SDNode *N);
+ SDValue SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo);
+ SDValue SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo);
//===--------------------------------------------------------------------===//
// Float Expansion Support: LegalizeFloatTypes.cpp
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index da4e409..9bdf822 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10668,19 +10668,20 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx,
DAG.getConstant(MaxIndex, dl, IdxVT));
}
-SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
- SDValue VecPtr, EVT VecVT,
- SDValue Index) const {
+SDValue
+TargetLowering::getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr,
+ EVT VecVT, SDValue Index,
+ const SDNodeFlags PtrArithFlags) const {
return getVectorSubVecPointer(
DAG, VecPtr, VecVT,
EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1),
- Index);
+ Index, PtrArithFlags);
}
-SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,
- SDValue VecPtr, EVT VecVT,
- EVT SubVecVT,
- SDValue Index) const {
+SDValue
+TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr,
+ EVT VecVT, EVT SubVecVT, SDValue Index,
+ const SDNodeFlags PtrArithFlags) const {
SDLoc dl(Index);
// Make sure the index type is big enough to compute in.
Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType());
@@ -10704,7 +10705,7 @@ SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,
Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index,
DAG.getConstant(EltSize, dl, IdxVT));
- return DAG.getMemBasePlusOffset(VecPtr, Index, dl);
+ return DAG.getMemBasePlusOffset(VecPtr, Index, dl, PtrArithFlags);
}
//===----------------------------------------------------------------------===//
@@ -12382,8 +12383,10 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT,
!IsFast)
return SDValue();
- SDValue NewPtr =
- getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo);
+ // The original DAG loaded the entire vector from memory, so arithmetic
+ // within it must be inbounds.
+ SDValue NewPtr = getInboundsVectorElementPointer(
+ DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo);
// We are replacing a vector load with a scalar load. The new load must have
// identical memory op ordering to the original.
diff --git a/llvm/lib/CodeGenTypes/LowLevelType.cpp b/llvm/lib/CodeGenTypes/LowLevelType.cpp
index 4785f26..92b7fad 100644
--- a/llvm/lib/CodeGenTypes/LowLevelType.cpp
+++ b/llvm/lib/CodeGenTypes/LowLevelType.cpp
@@ -54,9 +54,3 @@ LLVM_DUMP_METHOD void LLT::dump() const {
dbgs() << '\n';
}
#endif
-
-const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo;
-const constexpr LLT::BitFieldInfo LLT::PointerSizeFieldInfo;
-const constexpr LLT::BitFieldInfo LLT::PointerAddressSpaceFieldInfo;
-const constexpr LLT::BitFieldInfo LLT::VectorElementsFieldInfo;
-const constexpr LLT::BitFieldInfo LLT::VectorScalableFieldInfo;
diff --git a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
index 6c23ba8..23ab534 100644
--- a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
+++ b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
@@ -102,7 +102,8 @@ std::optional<CVType> LazyRandomTypeCollection::tryGetType(TypeIndex Index) {
return std::nullopt;
}
- assert(contains(Index));
+ if (!contains(Index))
+ return std::nullopt;
return Records[Index.toArrayIndex()].Type;
}
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
index 9275586..ca8192b 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
@@ -16,9 +16,11 @@ add_llvm_component_library(LLVMOrcTargetProcess
ExecutorSharedMemoryMapperService.cpp
DefaultHostBootstrapValues.cpp
ExecutorResolver.cpp
+ LibraryResolver.cpp
JITLoaderGDB.cpp
JITLoaderPerf.cpp
JITLoaderVTune.cpp
+ LibraryScanner.cpp
OrcRTBootstrap.cpp
RegisterEHFrames.cpp
SimpleExecutorDylibManager.cpp
@@ -36,6 +38,8 @@ add_llvm_component_library(LLVMOrcTargetProcess
LINK_COMPONENTS
${intel_jit_profiling}
+ BinaryFormat
+ Object
OrcShared
Support
TargetParser
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
new file mode 100644
index 0000000..35da82a
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
@@ -0,0 +1,370 @@
+//===- LibraryResolver.cpp - Library Resolution of Unresolved Symbols ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Library resolution impl for unresolved symbols
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+
+#include "llvm/ADT/StringSet.h"
+
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Error.h"
+
+#include <mutex>
+#include <thread>
+
+#define DEBUG_TYPE "orc-resolver"
+
+namespace llvm::orc {
+
+LibraryResolver::LibraryResolver(const LibraryResolver::Setup &S)
+ : LibPathCache(S.Cache ? S.Cache : std::make_shared<LibraryPathCache>()),
+ LibPathResolver(S.PResolver
+ ? S.PResolver
+ : std::make_shared<PathResolver>(LibPathCache)),
+ ScanHelper(S.BasePaths, LibPathCache, LibPathResolver),
+ FB(S.FilterBuilder), LibMgr(),
+ ShouldScanCall(S.ShouldScanCall ? S.ShouldScanCall
+ : [](StringRef) -> bool { return true; }),
+ scanBatchSize(S.ScanBatchSize) {
+
+ if (ScanHelper.getAllUnits().empty()) {
+ LLVM_DEBUG(dbgs() << "Warning: No base paths provided for scanning.\n");
+ }
+}
+
+std::unique_ptr<LibraryResolutionDriver>
+LibraryResolutionDriver::create(const LibraryResolver::Setup &S) {
+ auto LR = std::make_unique<LibraryResolver>(S);
+ return std::unique_ptr<LibraryResolutionDriver>(
+ new LibraryResolutionDriver(std::move(LR)));
+}
+
+void LibraryResolutionDriver::addScanPath(const std::string &Path, PathType K) {
+ LR->ScanHelper.addBasePath(Path, K);
+}
+
+bool LibraryResolutionDriver::markLibraryLoaded(StringRef Path) {
+ auto Lib = LR->LibMgr.getLibrary(Path);
+ if (!Lib)
+ return false;
+
+ Lib->setState(LibraryManager::LibState::Loaded);
+
+ return true;
+}
+
+bool LibraryResolutionDriver::markLibraryUnLoaded(StringRef Path) {
+ auto Lib = LR->LibMgr.getLibrary(Path);
+ if (!Lib)
+ return false;
+
+ Lib->setState(LibraryManager::LibState::Unloaded);
+
+ return true;
+}
+
+void LibraryResolutionDriver::resolveSymbols(
+ std::vector<std::string> Syms,
+ LibraryResolver::OnSearchComplete OnCompletion,
+ const SearchConfig &Config) {
+ LR->searchSymbolsInLibraries(Syms, std::move(OnCompletion), Config);
+}
+
+static bool shouldIgnoreSymbol(const object::SymbolRef &Sym,
+ uint32_t IgnoreFlags) {
+ Expected<uint32_t> FlagsOrErr = Sym.getFlags();
+ if (!FlagsOrErr) {
+ consumeError(FlagsOrErr.takeError());
+ return true;
+ }
+
+ uint32_t Flags = *FlagsOrErr;
+
+ using Filter = SymbolEnumeratorOptions;
+ if ((IgnoreFlags & Filter::IgnoreUndefined) &&
+ (Flags & object::SymbolRef::SF_Undefined))
+ return true;
+ if ((IgnoreFlags & Filter::IgnoreIndirect) &&
+ (Flags & object::SymbolRef::SF_Indirect))
+ return true;
+ if ((IgnoreFlags & Filter::IgnoreWeak) &&
+ (Flags & object::SymbolRef::SF_Weak))
+ return true;
+
+ return false;
+}
+
+bool SymbolEnumerator::enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach,
+ const SymbolEnumeratorOptions &Opts) {
+ if (Path.empty())
+ return false;
+
+ ObjectFileLoader ObjLoader(Path);
+
+ auto ObjOrErr = ObjLoader.getObjectFile();
+ if (!ObjOrErr) {
+ std::string ErrMsg;
+ handleAllErrors(ObjOrErr.takeError(),
+ [&](const ErrorInfoBase &EIB) { ErrMsg = EIB.message(); });
+ LLVM_DEBUG(dbgs() << "Failed loading object file: " << Path
+ << "\nError: " << ErrMsg << "\n");
+ return false;
+ }
+
+ object::ObjectFile *Obj = &ObjOrErr.get();
+
+ auto processSymbolRange =
+ [&](object::ObjectFile::symbol_iterator_range Range) -> EnumerateResult {
+ for (const auto &Sym : Range) {
+ if (shouldIgnoreSymbol(Sym, Opts.FilterFlags))
+ continue;
+
+ auto NameOrErr = Sym.getName();
+ if (!NameOrErr) {
+ consumeError(NameOrErr.takeError());
+ continue;
+ }
+
+ StringRef Name = *NameOrErr;
+ if (Name.empty())
+ continue;
+
+ EnumerateResult Res = OnEach(Name);
+ if (Res != EnumerateResult::Continue)
+ return Res;
+ }
+ return EnumerateResult::Continue;
+ };
+
+ EnumerateResult Res = processSymbolRange(Obj->symbols());
+ if (Res != EnumerateResult::Continue)
+ return Res == EnumerateResult::Stop;
+
+ if (Obj->isELF()) {
+ const auto *ElfObj = cast<object::ELFObjectFileBase>(Obj);
+ Res = processSymbolRange(ElfObj->getDynamicSymbolIterators());
+ if (Res != EnumerateResult::Continue)
+ return Res == EnumerateResult::Stop;
+ } else if (Obj->isCOFF()) {
+ const auto *CoffObj = cast<object::COFFObjectFile>(Obj);
+ for (auto I = CoffObj->export_directory_begin(),
+ E = CoffObj->export_directory_end();
+ I != E; ++I) {
+ StringRef Name;
+ if (I->getSymbolName(Name))
+ continue;
+ if (Name.empty())
+ continue;
+
+ EnumerateResult Res = OnEach(Name);
+ if (Res != EnumerateResult::Continue)
+ return Res == EnumerateResult::Stop;
+ }
+ } else if (Obj->isMachO()) {
+ }
+
+ return true;
+}
+
+class SymbolSearchContext {
+public:
+ SymbolSearchContext(SymbolQuery &Q) : Q(Q) {}
+
+ bool hasSearched(LibraryInfo *Lib) const { return Searched.count(Lib); }
+
+ void markSearched(LibraryInfo *Lib) { Searched.insert(Lib); }
+
+ inline bool allResolved() const { return Q.allResolved(); }
+
+ SymbolQuery &query() { return Q; }
+
+private:
+ SymbolQuery &Q;
+ DenseSet<LibraryInfo *> Searched;
+};
+
+void LibraryResolver::resolveSymbolsInLibrary(
+ LibraryInfo &Lib, SymbolQuery &UnresolvedSymbols,
+ const SymbolEnumeratorOptions &Opts) {
+ LLVM_DEBUG(dbgs() << "Checking unresolved symbols "
+ << " in library : " << Lib.getFileName() << "\n";);
+ StringSet<> DiscoveredSymbols;
+
+ if (!UnresolvedSymbols.hasUnresolved()) {
+ LLVM_DEBUG(dbgs() << "Skipping library: " << Lib.getFullPath()
+ << " — unresolved symbols exist.\n";);
+ return;
+ }
+
+ bool HasEnumerated = false;
+ auto enumerateSymbolsIfNeeded = [&]() {
+ if (HasEnumerated)
+ return;
+
+ HasEnumerated = true;
+
+ LLVM_DEBUG(dbgs() << "Enumerating symbols in library: " << Lib.getFullPath()
+ << "\n";);
+ SymbolEnumerator::enumerateSymbols(
+ Lib.getFullPath(),
+ [&](StringRef sym) {
+ DiscoveredSymbols.insert(sym);
+ return EnumerateResult::Continue;
+ },
+ Opts);
+
+ if (DiscoveredSymbols.empty()) {
+ LLVM_DEBUG(dbgs() << " No symbols and remove library : "
+ << Lib.getFullPath() << "\n";);
+ LibMgr.removeLibrary(Lib.getFullPath());
+ return;
+ }
+ };
+
+ if (!Lib.hasFilter()) {
+ LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
+ << "\n";);
+ enumerateSymbolsIfNeeded();
+ SmallVector<StringRef> SymbolVec;
+ SymbolVec.reserve(DiscoveredSymbols.size());
+ for (const auto &KV : DiscoveredSymbols)
+ SymbolVec.push_back(KV.first());
+
+ Lib.ensureFilterBuilt(FB, SymbolVec);
+ LLVM_DEBUG({
+ dbgs() << "DiscoveredSymbols : " << DiscoveredSymbols.size() << "\n";
+ for (const auto &KV : DiscoveredSymbols)
+ dbgs() << "DiscoveredSymbols : " << KV.first() << "\n";
+ });
+ }
+
+ const auto &Unresolved = UnresolvedSymbols.getUnresolvedSymbols();
+ bool HadAnySym = false;
+ LLVM_DEBUG(dbgs() << "Total unresolved symbols : " << Unresolved.size()
+ << "\n";);
+ for (const auto &Sym : Unresolved) {
+ if (Lib.mayContain(Sym)) {
+ LLVM_DEBUG(dbgs() << "Checking symbol '" << Sym
+ << "' in library: " << Lib.getFullPath() << "\n";);
+ enumerateSymbolsIfNeeded();
+ if (DiscoveredSymbols.count(Sym) > 0) {
+ LLVM_DEBUG(dbgs() << " Resolved symbol: " << Sym
+ << " in library: " << Lib.getFullPath() << "\n";);
+ UnresolvedSymbols.resolve(Sym, Lib.getFullPath());
+ HadAnySym = true;
+ }
+ }
+ }
+
+ using LibraryState = LibraryManager::LibState;
+ if (HadAnySym && Lib.getState() != LibraryState::Loaded)
+ Lib.setState(LibraryState::Queried);
+}
+
+void LibraryResolver::searchSymbolsInLibraries(
+ std::vector<std::string> &SymbolList, OnSearchComplete OnComplete,
+ const SearchConfig &Config) {
+ SymbolQuery Q(SymbolList);
+
+ using LibraryState = LibraryManager::LibState;
+ using LibraryType = PathType;
+ auto tryResolveFrom = [&](LibraryState S, LibraryType K) {
+ LLVM_DEBUG(dbgs() << "Trying resolve from state=" << static_cast<int>(S)
+ << " type=" << static_cast<int>(K) << "\n";);
+
+ SymbolSearchContext Ctx(Q);
+ while (!Ctx.allResolved()) {
+
+ for (auto &Lib : LibMgr.getView(S, K)) {
+ if (Ctx.hasSearched(Lib.get()))
+ continue;
+
+ // can use Async here?
+ resolveSymbolsInLibrary(*Lib, Ctx.query(), Config.Options);
+ Ctx.markSearched(Lib.get());
+
+ if (Ctx.allResolved())
+ return;
+ }
+
+ if (Ctx.allResolved())
+ return;
+
+ if (!scanLibrariesIfNeeded(K, scanBatchSize))
+ break; // no more new libs to scan
+ }
+ };
+
+ for (const auto &[St, Ty] : Config.Policy.Plan) {
+ tryResolveFrom(St, Ty);
+ if (Q.allResolved())
+ break;
+ }
+
+ // done:
+ LLVM_DEBUG({
+ dbgs() << "Search complete.\n";
+ for (const auto &r : Q.getAllResults())
+ dbgs() << "Resolved Symbol:" << r->Name << " -> " << r->ResolvedLibPath
+ << "\n";
+ });
+
+ OnComplete(Q);
+}
+
+bool LibraryResolver::scanLibrariesIfNeeded(PathType PK, size_t BatchSize) {
+ LLVM_DEBUG(dbgs() << "LibraryResolver::scanLibrariesIfNeeded: Scanning for "
+ << (PK == PathType::User ? "User" : "System")
+ << " libraries\n";);
+ if (!ScanHelper.leftToScan(PK))
+ return false;
+
+ LibraryScanner Scanner(ScanHelper, LibMgr, ShouldScanCall);
+ Scanner.scanNext(PK, BatchSize);
+ return true;
+}
+
+bool LibraryResolver::symbolExistsInLibrary(const LibraryInfo &Lib,
+ StringRef SymName,
+ std::vector<std::string> *AllSyms) {
+ SymbolEnumeratorOptions Opts;
+ return symbolExistsInLibrary(Lib, SymName, AllSyms, Opts);
+}
+
+bool LibraryResolver::symbolExistsInLibrary(
+ const LibraryInfo &Lib, StringRef SymName,
+ std::vector<std::string> *AllSyms, const SymbolEnumeratorOptions &Opts) {
+ bool Found = false;
+
+ SymbolEnumerator::enumerateSymbols(
+ Lib.getFullPath(),
+ [&](StringRef Sym) {
+ if (AllSyms)
+ AllSyms->emplace_back(Sym.str());
+
+ if (Sym == SymName) {
+ Found = true;
+ }
+
+ return EnumerateResult::Continue;
+ },
+ Opts);
+
+ return Found;
+}
+
+} // end namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
new file mode 100644
index 0000000..d93f686
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
@@ -0,0 +1,1161 @@
+//===- LibraryScanner.cpp - Provide Library Scanning Implementation ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/MachOUniversal.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/TargetParser/Host.h"
+#include "llvm/TargetParser/Triple.h"
+
+#ifdef LLVM_ON_UNIX
+#include <sys/stat.h>
+#include <unistd.h>
+#endif // LLVM_ON_UNIX
+
+#ifdef __APPLE__
+#include <sys/stat.h>
+#undef LC_LOAD_DYLIB
+#undef LC_RPATH
+#endif // __APPLE__
+
+#define DEBUG_TYPE "orc-scanner"
+
+namespace llvm::orc {
+
+void handleError(Error Err, StringRef context = "") {
+ consumeError(handleErrors(std::move(Err), [&](const ErrorInfoBase &EIB) {
+ dbgs() << "LLVM Error";
+ if (!context.empty())
+ dbgs() << " [" << context << "]";
+ dbgs() << ": " << EIB.message() << "\n";
+ }));
+}
+
+bool ObjectFileLoader::isArchitectureCompatible(const object::ObjectFile &Obj) {
+ Triple HostTriple(sys::getDefaultTargetTriple());
+ Triple ObjTriple = Obj.makeTriple();
+
+ LLVM_DEBUG({
+ dbgs() << "Host triple: " << HostTriple.str()
+ << ", Object triple: " << ObjTriple.str() << "\n";
+ });
+
+ if (ObjTriple.getArch() != Triple::UnknownArch &&
+ HostTriple.getArch() != ObjTriple.getArch())
+ return false;
+
+ if (ObjTriple.getOS() != Triple::UnknownOS &&
+ HostTriple.getOS() != ObjTriple.getOS())
+ return false;
+
+ if (ObjTriple.getEnvironment() != Triple::UnknownEnvironment &&
+ HostTriple.getEnvironment() != Triple::UnknownEnvironment &&
+ HostTriple.getEnvironment() != ObjTriple.getEnvironment())
+ return false;
+
+ return true;
+}
+
+Expected<object::OwningBinary<object::ObjectFile>>
+ObjectFileLoader::loadObjectFileWithOwnership(StringRef FilePath) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Attempting to open file " << FilePath
+ << "\n";);
+ auto BinOrErr = object::createBinary(FilePath);
+ if (!BinOrErr) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to open file " << FilePath
+ << "\n";);
+ return BinOrErr.takeError();
+ }
+
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Successfully opened file " << FilePath
+ << "\n";);
+
+ auto OwningBin = BinOrErr->takeBinary();
+ object::Binary *Bin = OwningBin.first.get();
+
+ if (Bin->isArchive()) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: File is an archive, not supported: "
+ << FilePath << "\n";);
+ return createStringError(std::errc::invalid_argument,
+ "Archive files are not supported: %s",
+ FilePath.str().c_str());
+ }
+
+#if defined(__APPLE__)
+ if (auto *UB = dyn_cast<object::MachOUniversalBinary>(Bin)) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected Mach-O universal binary: "
+ << FilePath << "\n";);
+ for (auto ObjForArch : UB->objects()) {
+ auto ObjOrErr = ObjForArch.getAsObjectFile();
+ if (!ObjOrErr) {
+ LLVM_DEBUG(
+ dbgs()
+ << "ObjectFileLoader: Skipping invalid architecture slice\n";);
+
+ consumeError(ObjOrErr.takeError());
+ continue;
+ }
+
+ std::unique_ptr<object::ObjectFile> Obj = std::move(ObjOrErr.get());
+ if (isArchitectureCompatible(*Obj)) {
+ LLVM_DEBUG(
+ dbgs() << "ObjectFileLoader: Found compatible object slice\n";);
+
+ return object::OwningBinary<object::ObjectFile>(
+ std::move(Obj), std::move(OwningBin.second));
+
+ } else {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture "
+ "slice skipped\n";);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: No compatible slices found in "
+ "universal binary\n";);
+ return createStringError(inconvertibleErrorCode(),
+ "No compatible object found in fat binary: %s",
+ FilePath.str().c_str());
+ }
+#endif
+
+ auto ObjOrErr =
+ object::ObjectFile::createObjectFile(Bin->getMemoryBufferRef());
+ if (!ObjOrErr) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to create object file\n";);
+ return ObjOrErr.takeError();
+ }
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected object file\n";);
+
+ std::unique_ptr<object::ObjectFile> Obj = std::move(*ObjOrErr);
+ if (!isArchitectureCompatible(*Obj)) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture: "
+ << FilePath << "\n";);
+ return createStringError(inconvertibleErrorCode(),
+ "Incompatible object file: %s",
+ FilePath.str().c_str());
+ }
+
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Object file is compatible\n";);
+
+ return object::OwningBinary<object::ObjectFile>(std::move(Obj),
+ std::move(OwningBin.second));
+}
+
+template <class ELFT>
+bool isELFSharedLibrary(const object::ELFFile<ELFT> &ELFObj) {
+ if (ELFObj.getHeader().e_type != ELF::ET_DYN)
+ return false;
+
+ auto PHOrErr = ELFObj.program_headers();
+ if (!PHOrErr) {
+ consumeError(PHOrErr.takeError());
+ return true;
+ }
+
+ for (auto Phdr : *PHOrErr) {
+ if (Phdr.p_type == ELF::PT_INTERP)
+ return false;
+ }
+
+ return true;
+}
+
+bool isSharedLibraryObject(object::ObjectFile &Obj) {
+ if (Obj.isELF()) {
+ if (auto *ELF32LE = dyn_cast<object::ELF32LEObjectFile>(&Obj))
+ return isELFSharedLibrary(ELF32LE->getELFFile());
+ if (auto *ELF64LE = dyn_cast<object::ELF64LEObjectFile>(&Obj))
+ return isELFSharedLibrary(ELF64LE->getELFFile());
+ if (auto *ELF32BE = dyn_cast<object::ELF32BEObjectFile>(&Obj))
+ return isELFSharedLibrary(ELF32BE->getELFFile());
+ if (auto *ELF64BE = dyn_cast<object::ELF64BEObjectFile>(&Obj))
+ return isELFSharedLibrary(ELF64BE->getELFFile());
+ } else if (Obj.isMachO()) {
+ const object::MachOObjectFile *MachO =
+ dyn_cast<object::MachOObjectFile>(&Obj);
+ if (!MachO) {
+ LLVM_DEBUG(dbgs() << "Failed to cast to MachOObjectFile.\n";);
+ return false;
+ }
+ LLVM_DEBUG({
+ bool Result =
+ MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB;
+ dbgs() << "Mach-O filetype: " << MachO->getHeader().filetype
+ << " (MH_DYLIB == " << MachO::HeaderFileType::MH_DYLIB
+ << "), shared: " << Result << "\n";
+ });
+
+ return MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB;
+ } else if (Obj.isCOFF()) {
+ const object::COFFObjectFile *coff = dyn_cast<object::COFFObjectFile>(&Obj);
+ if (!coff)
+ return false;
+ return coff->getCharacteristics() & COFF::IMAGE_FILE_DLL;
+ } else {
+ LLVM_DEBUG(dbgs() << "Binary is not an ObjectFile.\n";);
+ }
+
+ return false;
+}
+
+bool DylibPathValidator::isSharedLibrary(StringRef Path) {
+ LLVM_DEBUG(dbgs() << "Checking if path is a shared library: " << Path
+ << "\n";);
+
+ auto FileType = sys::fs::get_file_type(Path, /*Follow*/ true);
+ if (FileType != sys::fs::file_type::regular_file) {
+ LLVM_DEBUG(dbgs() << "File type is not a regular file for path: " << Path
+ << "\n";);
+ return false;
+ }
+
+ file_magic MagicCode;
+ identify_magic(Path, MagicCode);
+
+ // Skip archives.
+ if (MagicCode == file_magic::archive)
+ return false;
+
+ // Universal binary handling.
+#if defined(__APPLE__)
+ if (MagicCode == file_magic::macho_universal_binary) {
+ ObjectFileLoader ObjLoader(Path);
+ auto ObjOrErr = ObjLoader.getObjectFile();
+ if (!ObjOrErr) {
+ consumeError(ObjOrErr.takeError());
+ return false;
+ }
+ return isSharedLibraryObject(ObjOrErr.get());
+ }
+#endif
+
+ // Object file inspection for PE/COFF, ELF, and Mach-O
+ bool NeedsObjectInspection =
+#if defined(_WIN32)
+ (MagicCode == file_magic::pecoff_executable);
+#elif defined(__APPLE__)
+ (MagicCode == file_magic::macho_fixed_virtual_memory_shared_lib ||
+ MagicCode == file_magic::macho_dynamically_linked_shared_lib ||
+ MagicCode == file_magic::macho_dynamically_linked_shared_lib_stub);
+#elif defined(LLVM_ON_UNIX)
+#ifdef __CYGWIN__
+ (MagicCode == file_magic::pecoff_executable);
+#else
+ (MagicCode == file_magic::elf_shared_object);
+#endif
+#else
+#error "Unsupported platform."
+#endif
+
+ if (NeedsObjectInspection) {
+ ObjectFileLoader ObjLoader(Path);
+ auto ObjOrErr = ObjLoader.getObjectFile();
+ if (!ObjOrErr) {
+ consumeError(ObjOrErr.takeError());
+ return false;
+ }
+ return isSharedLibraryObject(ObjOrErr.get());
+ }
+
+ LLVM_DEBUG(dbgs() << "Path is not identified as a shared library: " << Path
+ << "\n";);
+ return false;
+}
+
+void DylibSubstitutor::configure(StringRef LoaderPath) {
+ SmallString<512> ExecPath(sys::fs::getMainExecutable(nullptr, nullptr));
+ sys::path::remove_filename(ExecPath);
+
+ SmallString<512> LoaderDir;
+ if (LoaderPath.empty()) {
+ LoaderDir = ExecPath;
+ } else {
+ LoaderDir = LoaderPath.str();
+ if (!sys::fs::is_directory(LoaderPath))
+ sys::path::remove_filename(LoaderDir);
+ }
+
+#ifdef __APPLE__
+ Placeholders["@loader_path"] = std::string(LoaderDir);
+ Placeholders["@executable_path"] = std::string(ExecPath);
+#else
+ Placeholders["$origin"] = std::string(LoaderDir);
+#endif
+}
+
+std::optional<std::string>
+SearchPathResolver::resolve(StringRef Stem, const DylibSubstitutor &Subst,
+ DylibPathValidator &Validator) const {
+ for (const auto &SP : Paths) {
+ std::string Base = Subst.substitute(SP);
+
+ SmallString<512> FullPath(Base);
+ if (!PlaceholderPrefix.empty() &&
+ Stem.starts_with_insensitive(PlaceholderPrefix))
+ FullPath.append(Stem.drop_front(PlaceholderPrefix.size()));
+ else
+ sys::path::append(FullPath, Stem);
+
+ LLVM_DEBUG(dbgs() << "SearchPathResolver::resolve FullPath = " << FullPath
+ << "\n";);
+
+ if (auto Valid = Validator.validate(FullPath.str()))
+ return Valid;
+ }
+
+ return std::nullopt;
+}
+
+std::optional<std::string>
+DylibResolverImpl::tryWithExtensions(StringRef LibStem) const {
+ LLVM_DEBUG(dbgs() << "tryWithExtensions: baseName = " << LibStem << "\n";);
+ SmallVector<SmallString<256>, 8> Candidates;
+
+ // Add extensions by platform
+#if defined(__APPLE__)
+ Candidates.emplace_back(LibStem);
+ Candidates.back() += ".dylib";
+#elif defined(_WIN32)
+ Candidates.emplace_back(LibStem);
+ Candidates.back() += ".dll";
+#else
+ Candidates.emplace_back(LibStem);
+ Candidates.back() += ".so";
+#endif
+
+ // Optionally try "lib" prefix if not already there
+ StringRef FileName = sys::path::filename(LibStem);
+ StringRef Base = sys::path::parent_path(LibStem);
+ if (!FileName.starts_with("lib")) {
+ SmallString<256> WithPrefix(Base);
+ if (!WithPrefix.empty())
+ sys::path::append(WithPrefix, ""); // ensure separator if needed
+ WithPrefix += "lib";
+ WithPrefix += FileName;
+
+#if defined(__APPLE__)
+ WithPrefix += ".dylib";
+#elif defined(_WIN32)
+ WithPrefix += ".dll";
+#else
+ WithPrefix += ".so";
+#endif
+
+ Candidates.push_back(std::move(WithPrefix));
+ }
+
+ LLVM_DEBUG({
+ dbgs() << " Candidates to try:\n";
+ for (const auto &C : Candidates)
+ dbgs() << " " << C << "\n";
+ });
+
+ // Try all variants using tryAllPaths
+ for (const auto &Name : Candidates) {
+
+ LLVM_DEBUG(dbgs() << " Trying candidate: " << Name << "\n";);
+
+ for (const auto &R : Resolvers) {
+ if (auto Res = R.resolve(Name, Substitutor, Validator))
+ return Res;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << " -> No candidate Resolved.\n";);
+
+ return std::nullopt;
+}
+
+std::optional<std::string>
+DylibResolverImpl::resolve(StringRef LibStem, bool VariateLibStem) const {
+ LLVM_DEBUG(dbgs() << "Resolving library stem: " << LibStem << "\n";);
+
+ // If it is an absolute path, don't try iterate over the paths.
+ if (sys::path::is_absolute(LibStem)) {
+ LLVM_DEBUG(dbgs() << " -> Absolute path detected.\n";);
+ return Validator.validate(LibStem);
+ }
+
+ if (!LibStem.starts_with_insensitive("@rpath")) {
+ if (auto norm = Validator.validate(Substitutor.substitute(LibStem))) {
+ LLVM_DEBUG(dbgs() << " -> Resolved after substitution: " << *norm
+ << "\n";);
+
+ return norm;
+ }
+ }
+
+ for (const auto &R : Resolvers) {
+ LLVM_DEBUG(dbgs() << " -> Resolving via search path ... \n";);
+ if (auto Result = R.resolve(LibStem, Substitutor, Validator)) {
+ LLVM_DEBUG(dbgs() << " -> Resolved via search path: " << *Result
+ << "\n";);
+
+ return Result;
+ }
+ }
+
+ // Expand libStem with paths, extensions, etc.
+ // std::string foundName;
+ if (VariateLibStem) {
+ LLVM_DEBUG(dbgs() << " -> Trying with extensions...\n";);
+
+ if (auto Norm = tryWithExtensions(LibStem)) {
+ LLVM_DEBUG(dbgs() << " -> Resolved via tryWithExtensions: " << *Norm
+ << "\n";);
+
+ return Norm;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << " -> Could not resolve: " << LibStem << "\n";);
+
+ return std::nullopt;
+}
+
+#ifndef _WIN32
+mode_t PathResolver::lstatCached(StringRef Path) {
+ // If already cached - retun cached result
+ if (auto Cache = LibPathCache->read_lstat(Path))
+ return *Cache;
+
+ // Not cached: perform lstat and store
+ struct stat buf{};
+ mode_t st_mode = (lstat(Path.str().c_str(), &buf) == -1) ? 0 : buf.st_mode;
+
+ LibPathCache->insert_lstat(Path, st_mode);
+
+ return st_mode;
+}
+
+std::optional<std::string> PathResolver::readlinkCached(StringRef Path) {
+ // If already cached - retun cached result
+ if (auto Cache = LibPathCache->read_link(Path))
+ return Cache;
+
+ // If result not in cache - call system function and cache result
+ char buf[PATH_MAX];
+ ssize_t len;
+ if ((len = readlink(Path.str().c_str(), buf, sizeof(buf))) != -1) {
+ buf[len] = '\0';
+ std::string s(buf);
+ LibPathCache->insert_link(Path, s);
+ return s;
+ }
+ return std::nullopt;
+}
+
+void createComponent(StringRef Path, StringRef BasePath, bool BaseIsResolved,
+ SmallVector<StringRef, 16> &Component) {
+ StringRef Separator = sys::path::get_separator();
+ if (!BaseIsResolved) {
+ if (Path[0] == '~' &&
+ (Path.size() == 1 || sys::path::is_separator(Path[1]))) {
+ static SmallString<128> HomeP;
+ if (HomeP.str().empty())
+ sys::path::home_directory(HomeP);
+ StringRef(HomeP).split(Component, Separator, /*MaxSplit*/ -1,
+ /*KeepEmpty*/ false);
+ } else if (BasePath.empty()) {
+ static SmallString<256> CurrentPath;
+ if (CurrentPath.str().empty())
+ sys::fs::current_path(CurrentPath);
+ StringRef(CurrentPath)
+ .split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+ } else {
+ BasePath.split(Component, Separator, /*MaxSplit*/ -1,
+ /*KeepEmpty*/ false);
+ }
+ }
+
+ Path.split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+}
+
+void normalizePathSegments(SmallVector<StringRef, 16> &PathParts) {
+ SmallVector<StringRef, 16> NormalizedPath;
+ for (auto &Part : PathParts) {
+ if (Part == ".") {
+ continue;
+ } else if (Part == "..") {
+ if (!NormalizedPath.empty() && NormalizedPath.back() != "..") {
+ NormalizedPath.pop_back();
+ } else {
+ NormalizedPath.push_back("..");
+ }
+ } else {
+ NormalizedPath.push_back(Part);
+ }
+ }
+ PathParts.swap(NormalizedPath);
+}
+#endif
+
+std::optional<std::string> PathResolver::realpathCached(StringRef Path,
+ std::error_code &EC,
+ StringRef Base,
+ bool BaseIsResolved,
+ long SymLoopLevel) {
+ EC.clear();
+
+ if (Path.empty()) {
+ EC = std::make_error_code(std::errc::no_such_file_or_directory);
+ LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Empty path\n";);
+
+ return std::nullopt;
+ }
+
+ if (SymLoopLevel <= 0) {
+ EC = std::make_error_code(std::errc::too_many_symbolic_link_levels);
+ LLVM_DEBUG(
+ dbgs() << "PathResolver::realpathCached: Too many Symlink levels: "
+ << Path << "\n";);
+
+ return std::nullopt;
+ }
+
+ // If already cached - retun cached result
+ bool isRelative = sys::path::is_relative(Path);
+ if (!isRelative) {
+ if (auto Cached = LibPathCache->read_realpath(Path)) {
+ EC = Cached->ErrnoCode;
+ if (EC) {
+ LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Cached (error) for "
+ << Path << "\n";);
+ } else {
+ LLVM_DEBUG(
+ dbgs() << "PathResolver::realpathCached: Cached (success) for "
+ << Path << " => " << Cached->canonicalPath << "\n";);
+ }
+ return Cached->canonicalPath.empty()
+ ? std::nullopt
+ : std::make_optional(Cached->canonicalPath);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Resolving path: " << Path
+ << "\n";);
+
+ // If result not in cache - call system function and cache result
+
+ StringRef Separator(sys::path::get_separator());
+ SmallString<256> Resolved(Separator);
+#ifndef _WIN32
+ SmallVector<StringRef, 16> Components;
+
+ if (isRelative) {
+ if (BaseIsResolved) {
+ Resolved.assign(Base);
+ LLVM_DEBUG(dbgs() << " Using Resolved base: " << Base << "\n";);
+ }
+ createComponent(Path, Base, BaseIsResolved, Components);
+ } else {
+ Path.split(Components, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+ }
+
+ normalizePathSegments(Components);
+ LLVM_DEBUG({
+ for (auto &C : Components)
+ dbgs() << " " << C << " ";
+
+ dbgs() << "\n";
+ });
+
+ // Handle path list items
+ for (const auto &Component : Components) {
+ if (Component == ".")
+ continue;
+ if (Component == "..") {
+ // collapse "a/b/../c" to "a/c"
+ size_t S = Resolved.rfind(Separator);
+ if (S != llvm::StringRef::npos)
+ Resolved.resize(S);
+ if (Resolved.empty())
+ Resolved = Separator;
+ continue;
+ }
+
+ size_t oldSize = Resolved.size();
+ sys::path::append(Resolved, Component);
+ const char *ResolvedPath = Resolved.c_str();
+ LLVM_DEBUG(dbgs() << " Processing Component: " << Component << " => "
+ << ResolvedPath << "\n";);
+ mode_t st_mode = lstatCached(ResolvedPath);
+
+ if (S_ISLNK(st_mode)) {
+ LLVM_DEBUG(dbgs() << " Found symlink: " << ResolvedPath << "\n";);
+
+ auto SymlinkOpt = readlinkCached(ResolvedPath);
+ if (!SymlinkOpt) {
+ EC = std::make_error_code(std::errc::no_such_file_or_directory);
+ LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+ LLVM_DEBUG(dbgs() << " Failed to read symlink: " << ResolvedPath
+ << "\n";);
+
+ return std::nullopt;
+ }
+
+ StringRef Symlink = *SymlinkOpt;
+ LLVM_DEBUG(dbgs() << " Symlink points to: " << Symlink << "\n";);
+
+ std::string resolvedBase = "";
+ if (sys::path::is_relative(Symlink)) {
+ Resolved.resize(oldSize);
+ resolvedBase = Resolved.str().str();
+ }
+
+ auto RealSymlink =
+ realpathCached(Symlink, EC, resolvedBase,
+ /*BaseIsResolved=*/true, SymLoopLevel - 1);
+ if (!RealSymlink) {
+ LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+ LLVM_DEBUG(dbgs() << " Failed to resolve symlink target: " << Symlink
+ << "\n";);
+
+ return std::nullopt;
+ }
+
+ Resolved.assign(*RealSymlink);
+ LLVM_DEBUG(dbgs() << " Symlink Resolved to: " << Resolved << "\n";);
+
+ } else if (st_mode == 0) {
+ EC = std::make_error_code(std::errc::no_such_file_or_directory);
+ LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+ LLVM_DEBUG(dbgs() << " Component does not exist: " << ResolvedPath
+ << "\n";);
+
+ return std::nullopt;
+ }
+ }
+#else
+ EC = sys::fs::real_path(Path, Resolved); // Windows fallback
+#endif
+
+ std::string Canonical = Resolved.str().str();
+ {
+ LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{
+ Canonical,
+ std::error_code() // success
+ });
+ }
+ LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Final Resolved: " << Path
+ << " => " << Canonical << "\n";);
+ return Canonical;
+}
+
+void LibraryScanHelper::addBasePath(const std::string &Path, PathType K) {
+ std::error_code EC;
+ std::string Canon = resolveCanonical(Path, EC);
+ if (EC) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LibraryScanHelper::addBasePath: Failed to canonicalize path: "
+ << Path << "\n";);
+ return;
+ }
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (LibSearchPaths.count(Canon)) {
+ LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Already added: "
+ << Canon << "\n";);
+ return;
+ }
+ K = K == PathType::Unknown ? classifyKind(Canon) : K;
+ auto SP = std::make_shared<LibrarySearchPath>(Canon, K);
+ LibSearchPaths[Canon] = SP;
+
+ if (K == PathType::User) {
+ LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added User path: "
+ << Canon << "\n";);
+ UnscannedUsr.push_back(StringRef(SP->BasePath));
+ } else {
+ LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added System path: "
+ << Canon << "\n";);
+ UnscannedSys.push_back(StringRef(SP->BasePath));
+ }
+}
+
+std::vector<std::shared_ptr<LibrarySearchPath>>
+LibraryScanHelper::getNextBatch(PathType K, size_t BatchSize) {
+ std::vector<std::shared_ptr<LibrarySearchPath>> Result;
+ auto &Queue = (K == PathType::User) ? UnscannedUsr : UnscannedSys;
+
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+
+ while (!Queue.empty() && (BatchSize == 0 || Result.size() < BatchSize)) {
+ StringRef Base = Queue.front();
+ auto It = LibSearchPaths.find(Base);
+ if (It != LibSearchPaths.end()) {
+ auto &SP = It->second;
+ ScanState Expected = ScanState::NotScanned;
+ if (SP->State.compare_exchange_strong(Expected, ScanState::Scanning)) {
+ Result.push_back(SP);
+ }
+ }
+ Queue.pop_front();
+ }
+
+ return Result;
+}
+
+bool LibraryScanHelper::isTrackedBasePath(StringRef Path) const {
+ std::error_code EC;
+ std::string Canon = resolveCanonical(Path, EC);
+ if (EC)
+ return false;
+
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ return LibSearchPaths.count(Canon) > 0;
+}
+
+bool LibraryScanHelper::leftToScan(PathType K) const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ for (const auto &KV : LibSearchPaths) {
+ const auto &SP = KV.second;
+ if (SP->Kind == K && SP->State == ScanState::NotScanned)
+ return true;
+ }
+ return false;
+}
+
+void LibraryScanHelper::resetToScan() {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+
+ for (auto &[_, SP] : LibSearchPaths) {
+ ScanState Expected = ScanState::Scanned;
+
+ if (!SP->State.compare_exchange_strong(Expected, ScanState::NotScanned))
+ continue;
+
+ auto &TargetList =
+ (SP->Kind == PathType::User) ? UnscannedUsr : UnscannedSys;
+ TargetList.emplace_back(SP->BasePath);
+ }
+}
+
+std::vector<std::shared_ptr<LibrarySearchPath>>
+LibraryScanHelper::getAllUnits() const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ std::vector<std::shared_ptr<LibrarySearchPath>> Result;
+ Result.reserve(LibSearchPaths.size());
+ for (const auto &[_, SP] : LibSearchPaths) {
+ Result.push_back(SP);
+ }
+ return Result;
+}
+
+std::string LibraryScanHelper::resolveCanonical(StringRef Path,
+ std::error_code &EC) const {
+ auto Canon = LibPathResolver->resolve(Path, EC);
+ return EC ? Path.str() : *Canon;
+}
+
+PathType LibraryScanHelper::classifyKind(StringRef Path) const {
+ // Detect home directory
+ const char *Home = getenv("HOME");
+ if (Home && Path.find(Home) == 0)
+ return PathType::User;
+
+ static const std::array<std::string, 5> UserPrefixes = {
+ "/usr/local", // often used by users for manual installs
+ "/opt/homebrew", // common on macOS
+ "/opt/local", // MacPorts
+ "/home", // Linux home dirs
+ "/Users", // macOS user dirs
+ };
+
+ for (const auto &Prefix : UserPrefixes) {
+ if (Path.find(Prefix) == 0)
+ return PathType::User;
+ }
+
+ return PathType::System;
+}
+
+Expected<LibraryDepsInfo> parseMachODeps(const object::MachOObjectFile &Obj) {
+ LibraryDepsInfo Libdeps;
+ LLVM_DEBUG(dbgs() << "Parsing Mach-O dependencies...\n";);
+ for (const auto &Command : Obj.load_commands()) {
+ switch (Command.C.cmd) {
+ case MachO::LC_LOAD_DYLIB: {
+ MachO::dylib_command dylibCmd = Obj.getDylibIDLoadCommand(Command);
+ const char *name = Command.Ptr + dylibCmd.dylib.name;
+ Libdeps.addDep(name);
+ LLVM_DEBUG(dbgs() << " Found LC_LOAD_DYLIB: " << name << "\n";);
+ } break;
+ case MachO::LC_LOAD_WEAK_DYLIB:
+ case MachO::LC_REEXPORT_DYLIB:
+ case MachO::LC_LOAD_UPWARD_DYLIB:
+ case MachO::LC_LAZY_LOAD_DYLIB:
+ break;
+ case MachO::LC_RPATH: {
+ // Extract RPATH
+ MachO::rpath_command rpathCmd = Obj.getRpathCommand(Command);
+ const char *rpath = Command.Ptr + rpathCmd.path;
+ LLVM_DEBUG(dbgs() << " Found LC_RPATH: " << rpath << "\n";);
+
+ SmallVector<StringRef, 4> RawPaths;
+ SplitString(StringRef(rpath), RawPaths,
+ sys::EnvPathSeparator == ':' ? ":" : ";");
+
+ for (const auto &raw : RawPaths) {
+ Libdeps.addRPath(raw.str()); // Convert to std::string
+ LLVM_DEBUG(dbgs() << " Parsed RPATH entry: " << raw << "\n";);
+ }
+ break;
+ }
+ }
+ }
+
+ return Expected<LibraryDepsInfo>(std::move(Libdeps));
+}
+
+template <class ELFT>
+static Expected<StringRef> getDynamicStrTab(const object::ELFFile<ELFT> &Elf) {
+ auto DynamicEntriesOrError = Elf.dynamicEntries();
+ if (!DynamicEntriesOrError)
+ return DynamicEntriesOrError.takeError();
+
+ for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) {
+ if (Dyn.d_tag == ELF::DT_STRTAB) {
+ auto MappedAddrOrError = Elf.toMappedAddr(Dyn.getPtr());
+ if (!MappedAddrOrError)
+ return MappedAddrOrError.takeError();
+ return StringRef(reinterpret_cast<const char *>(*MappedAddrOrError));
+ }
+ }
+
+ // If the dynamic segment is not present, we fall back on the sections.
+ auto SectionsOrError = Elf.sections();
+ if (!SectionsOrError)
+ return SectionsOrError.takeError();
+
+ for (const typename ELFT::Shdr &Sec : *SectionsOrError) {
+ if (Sec.sh_type == ELF::SHT_DYNSYM)
+ return Elf.getStringTableForSymtab(Sec);
+ }
+
+ return make_error<StringError>("dynamic string table not found",
+ inconvertibleErrorCode());
+}
+
+template <typename ELFT>
+Expected<LibraryDepsInfo> parseELF(const object::ELFFile<ELFT> &Elf) {
+ LibraryDepsInfo Deps;
+ Expected<StringRef> StrTabOrErr = getDynamicStrTab(Elf);
+ if (!StrTabOrErr)
+ return StrTabOrErr.takeError();
+
+ const char *Data = StrTabOrErr->data();
+
+ auto DynamicEntriesOrError = Elf.dynamicEntries();
+ if (!DynamicEntriesOrError) {
+ return DynamicEntriesOrError.takeError();
+ }
+
+ for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) {
+ switch (Dyn.d_tag) {
+ case ELF::DT_NEEDED:
+ Deps.addDep(Data + Dyn.d_un.d_val);
+ break;
+ case ELF::DT_RPATH: {
+ SmallVector<StringRef, 4> RawPaths;
+ SplitString(Data + Dyn.d_un.d_val, RawPaths,
+ sys::EnvPathSeparator == ':' ? ":" : ";");
+ for (const auto &raw : RawPaths)
+ Deps.addRPath(raw.str());
+ break;
+ }
+ case ELF::DT_RUNPATH: {
+ SmallVector<StringRef, 4> RawPaths;
+ SplitString(Data + Dyn.d_un.d_val, RawPaths,
+ sys::EnvPathSeparator == ':' ? ":" : ";");
+ for (const auto &raw : RawPaths)
+ Deps.addRunPath(raw.str());
+ break;
+ }
+ case ELF::DT_FLAGS_1:
+ // Check if this is not a pie executable.
+ if (Dyn.d_un.d_val & ELF::DF_1_PIE)
+ Deps.isPIE = true;
+ break;
+ // (Dyn.d_tag == ELF::DT_NULL) continue;
+ // (Dyn.d_tag == ELF::DT_AUXILIARY || Dyn.d_tag == ELF::DT_FILTER)
+ default:
+ break;
+ }
+ }
+
+ return Expected<LibraryDepsInfo>(std::move(Deps));
+}
+
+Expected<LibraryDepsInfo> parseELFDeps(const object::ELFObjectFileBase &Obj) {
+ using namespace object;
+ LLVM_DEBUG(dbgs() << "parseELFDeps: Detected ELF object\n";);
+ if (const auto *ELF = dyn_cast<ELF32LEObjectFile>(&Obj))
+ return parseELF(ELF->getELFFile());
+ else if (const auto *ELF = dyn_cast<ELF32BEObjectFile>(&Obj))
+ return parseELF(ELF->getELFFile());
+ else if (const auto *ELF = dyn_cast<ELF64LEObjectFile>(&Obj))
+ return parseELF(ELF->getELFFile());
+ else if (const auto *ELF = dyn_cast<ELF64BEObjectFile>(&Obj))
+ return parseELF(ELF->getELFFile());
+
+ LLVM_DEBUG(dbgs() << "parseELFDeps: Unknown ELF format\n";);
+ return createStringError(std::errc::not_supported, "Unknown ELF format");
+}
+
+Expected<LibraryDepsInfo> LibraryScanner::extractDeps(StringRef FilePath) {
+ LLVM_DEBUG(dbgs() << "extractDeps: Attempting to open file " << FilePath
+ << "\n";);
+
+ ObjectFileLoader ObjLoader(FilePath);
+ auto ObjOrErr = ObjLoader.getObjectFile();
+ if (!ObjOrErr) {
+ LLVM_DEBUG(dbgs() << "extractDeps: Failed to open " << FilePath << "\n";);
+ return ObjOrErr.takeError();
+ }
+
+ object::ObjectFile *Obj = &ObjOrErr.get();
+
+ if (auto *elfObj = dyn_cast<object::ELFObjectFileBase>(Obj)) {
+ LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath
+ << " is an ELF object\n";);
+
+ return parseELFDeps(*elfObj);
+ }
+
+ if (auto *macho = dyn_cast<object::MachOObjectFile>(Obj)) {
+ LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath
+ << " is a Mach-O object\n";);
+ return parseMachODeps(*macho);
+ }
+
+ if (Obj->isCOFF()) {
+ // TODO: COFF support
+ return LibraryDepsInfo();
+ }
+
+ LLVM_DEBUG(dbgs() << "extractDeps: Unsupported binary format for file "
+ << FilePath << "\n";);
+ return createStringError(inconvertibleErrorCode(),
+ "Unsupported binary format: %s",
+ FilePath.str().c_str());
+}
+
+std::optional<std::string> LibraryScanner::shouldScan(StringRef FilePath) {
+ std::error_code EC;
+
+ LLVM_DEBUG(dbgs() << "[shouldScan] Checking: " << FilePath << "\n";);
+
+ // [1] Check file existence early
+ if (!sys::fs::exists(FilePath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: file does not exist.\n";);
+
+ return std::nullopt;
+ }
+
+ // [2] Resolve to canonical path
+ auto CanonicalPathOpt = ScanHelper.resolve(FilePath, EC);
+ if (EC || !CanonicalPathOpt) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: failed to resolve path (EC="
+ << EC.message() << ").\n";);
+
+ return std::nullopt;
+ }
+
+ const std::string &CanonicalPath = *CanonicalPathOpt;
+ LLVM_DEBUG(dbgs() << " -> Canonical path: " << CanonicalPath << "\n");
+
+ // [3] Check if it's a directory — skip directories
+ if (sys::fs::is_directory(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: path is a directory.\n";);
+
+ return std::nullopt;
+ }
+
+ // [4] Skip if it's not a shared library.
+ if (!DylibPathValidator::isSharedLibrary(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: not a shared library.\n";);
+ return std::nullopt;
+ }
+
+ // [5] Skip if we've already seen this path (via cache)
+ if (ScanHelper.hasSeenOrMark(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: already seen.\n";);
+
+ return std::nullopt;
+ }
+
+ // [6] Already tracked in LibraryManager?
+ if (LibMgr.hasLibrary(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: already tracked by LibraryManager.\n";);
+
+ return std::nullopt;
+ }
+
+ // [7] Run user-defined hook (default: always true)
+ if (!ShouldScanCall(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: user-defined hook rejected.\n";);
+
+ return std::nullopt;
+ }
+
+ LLVM_DEBUG(dbgs() << " -> Accepted: ready to scan " << CanonicalPath
+ << "\n";);
+ return CanonicalPath;
+}
+
+void LibraryScanner::handleLibrary(StringRef FilePath, PathType K, int level) {
+ LLVM_DEBUG(dbgs() << "LibraryScanner::handleLibrary: Scanning: " << FilePath
+ << ", level=" << level << "\n";);
+ auto CanonPathOpt = shouldScan(FilePath);
+ if (!CanonPathOpt) {
+ LLVM_DEBUG(dbgs() << " Skipped (shouldScan returned false): " << FilePath
+ << "\n";);
+
+ return;
+ }
+ const std::string CanonicalPath = *CanonPathOpt;
+
+ auto DepsOrErr = extractDeps(CanonicalPath);
+ if (!DepsOrErr) {
+ LLVM_DEBUG(dbgs() << " Failed to extract deps for: " << CanonicalPath
+ << "\n";);
+ handleError(DepsOrErr.takeError());
+ return;
+ }
+
+ LibraryDepsInfo &Deps = *DepsOrErr;
+
+ LLVM_DEBUG({
+ dbgs() << " Found deps : \n";
+ for (const auto &dep : Deps.deps)
+ dbgs() << " : " << dep << "\n";
+ dbgs() << " Found @rpath : " << Deps.rpath.size() << "\n";
+ for (const auto &r : Deps.rpath)
+ dbgs() << " : " << r << "\n";
+ dbgs() << " Found @runpath : \n";
+ for (const auto &r : Deps.runPath)
+ dbgs() << " : " << r << "\n";
+ });
+
+ if (Deps.isPIE && level == 0) {
+ LLVM_DEBUG(dbgs() << " Skipped PIE executable at top level: "
+ << CanonicalPath << "\n";);
+
+ return;
+ }
+
+ bool Added = LibMgr.addLibrary(CanonicalPath, K);
+ if (!Added) {
+ LLVM_DEBUG(dbgs() << " Already added: " << CanonicalPath << "\n";);
+ return;
+ }
+
+ // Heuristic 1: No RPATH/RUNPATH, skip deps
+ if (Deps.rpath.empty() && Deps.runPath.empty()) {
+ LLVM_DEBUG(
+ dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic1): "
+ << CanonicalPath << "\n";);
+ return;
+ }
+
+ // Heuristic 2: All RPATH and RUNPATH already tracked
+ auto allTracked = [&](const auto &Paths) {
+ LLVM_DEBUG(dbgs() << " Checking : " << Paths.size() << "\n";);
+ return std::all_of(Paths.begin(), Paths.end(), [&](StringRef P) {
+ LLVM_DEBUG(dbgs() << " Checking isTrackedBasePath : " << P << "\n";);
+ return ScanHelper.isTrackedBasePath(
+ DylibResolver::resolvelinkerFlag(P, CanonicalPath));
+ });
+ };
+
+ if (allTracked(Deps.rpath) && allTracked(Deps.runPath)) {
+ LLVM_DEBUG(
+ dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic2): "
+ << CanonicalPath << "\n";);
+ return;
+ }
+
+ DylibPathValidator Validator(ScanHelper.getPathResolver());
+ DylibResolver Resolver(Validator);
+ Resolver.configure(CanonicalPath,
+ {{Deps.rpath, SearchPathType::RPath},
+ {ScanHelper.getSearchPaths(), SearchPathType::UsrOrSys},
+ {Deps.runPath, SearchPathType::RunPath}});
+ for (StringRef Dep : Deps.deps) {
+ LLVM_DEBUG(dbgs() << " Resolving dep: " << Dep << "\n";);
+ auto DepFullOpt = Resolver.resolve(Dep);
+ if (!DepFullOpt) {
+ LLVM_DEBUG(dbgs() << " Failed to resolve dep: " << Dep << "\n";);
+
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << " Resolved dep to: " << *DepFullOpt << "\n";);
+
+ handleLibrary(*DepFullOpt, K, level + 1);
+ }
+}
+
+void LibraryScanner::scanBaseDir(std::shared_ptr<LibrarySearchPath> SP) {
+ if (!sys::fs::is_directory(SP->BasePath) || SP->BasePath.empty()) {
+ LLVM_DEBUG(
+ dbgs() << "LibraryScanner::scanBaseDir: Invalid or empty basePath: "
+ << SP->BasePath << "\n";);
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "LibraryScanner::scanBaseDir: Scanning directory: "
+ << SP->BasePath << "\n";);
+ std::error_code EC;
+
+ SP->State.store(ScanState::Scanning);
+
+ for (sys::fs::directory_iterator It(SP->BasePath, EC), end; It != end && !EC;
+ It.increment(EC)) {
+ auto Entry = *It;
+ if (!Entry.status())
+ continue;
+
+ auto Status = *Entry.status();
+ if (sys::fs::is_regular_file(Status) || sys::fs::is_symlink_file(Status)) {
+ LLVM_DEBUG(dbgs() << " Found file: " << Entry.path() << "\n";);
+ // async support ?
+ handleLibrary(Entry.path(), SP->Kind);
+ }
+ }
+
+ SP->State.store(ScanState::Scanned);
+}
+
+void LibraryScanner::scanNext(PathType K, size_t BatchSize) {
+ LLVM_DEBUG(dbgs() << "LibraryScanner::scanNext: Scanning next batch of size "
+ << BatchSize << " for kind "
+ << (K == PathType::User ? "User" : "System") << "\n";);
+
+ auto SearchPaths = ScanHelper.getNextBatch(K, BatchSize);
+ for (auto &SP : SearchPaths) {
+ LLVM_DEBUG(dbgs() << " Scanning unit with basePath: " << SP->BasePath
+ << "\n";);
+
+ scanBaseDir(SP);
+ }
+}
+
+} // end namespace llvm::orc
diff --git a/llvm/lib/FileCheck/FileCheckImpl.h b/llvm/lib/FileCheck/FileCheckImpl.h
index a08502e..5851cfc 100644
--- a/llvm/lib/FileCheck/FileCheckImpl.h
+++ b/llvm/lib/FileCheck/FileCheckImpl.h
@@ -528,7 +528,7 @@ public:
SMRange getRange() const { return Range; }
static Error get(const SourceMgr &SM, SMLoc Loc, const Twine &ErrMsg,
- SMRange Range = std::nullopt) {
+ SMRange Range = {}) {
return make_error<ErrorDiagnostic>(
SM.GetMessage(Loc, SourceMgr::DK_Error, ErrMsg), Range);
}
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index b838e36..58b7ddd 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -730,7 +730,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
// (arm|aarch64).neon.bfdot.*'.
Intrinsic::ID ID =
StringSwitch<Intrinsic::ID>(Name)
- .Cases("v2f32.v8i8", "v4f32.v16i8",
+ .Cases({"v2f32.v8i8", "v4f32.v16i8"},
IsArm ? (Intrinsic::ID)Intrinsic::arm_neon_bfdot
: (Intrinsic::ID)Intrinsic::aarch64_neon_bfdot)
.Default(Intrinsic::not_intrinsic);
@@ -1456,7 +1456,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
if (F->arg_size() == 1) {
Intrinsic::ID IID =
StringSwitch<Intrinsic::ID>(Name)
- .Cases("brev32", "brev64", Intrinsic::bitreverse)
+ .Cases({"brev32", "brev64"}, Intrinsic::bitreverse)
.Case("clz.i", Intrinsic::ctlz)
.Case("popc.i", Intrinsic::ctpop)
.Default(Intrinsic::not_intrinsic);
@@ -1504,6 +1504,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
else if (Name.consume_front("fabs."))
// nvvm.fabs.{f,ftz.f,d}
Expand = Name == "f" || Name == "ftz.f" || Name == "d";
+ else if (Name.consume_front("ex2.approx."))
+ // nvvm.ex2.approx.{f,ftz.f,d,f16x2}
+ Expand =
+ Name == "f" || Name == "ftz.f" || Name == "d" || Name == "f16x2";
else if (Name.consume_front("max.") || Name.consume_front("min."))
// nvvm.{min,max}.{i,ii,ui,ull}
Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
@@ -2550,6 +2554,11 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
Intrinsic::ID IID = (Name == "fabs.ftz.f") ? Intrinsic::nvvm_fabs_ftz
: Intrinsic::nvvm_fabs;
Rep = Builder.CreateUnaryIntrinsic(IID, CI->getArgOperand(0));
+ } else if (Name.consume_front("ex2.approx.")) {
+ // nvvm.ex2.approx.{f,ftz.f,d,f16x2}
+ Intrinsic::ID IID = Name.starts_with("ftz") ? Intrinsic::nvvm_ex2_approx_ftz
+ : Intrinsic::nvvm_ex2_approx;
+ Rep = Builder.CreateUnaryIntrinsic(IID, CI->getArgOperand(0));
} else if (Name.starts_with("atomic.load.add.f32.p") ||
Name.starts_with("atomic.load.add.f64.p")) {
Value *Ptr = CI->getArgOperand(0);
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index dd1bc2b..3c9ab8e 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -228,11 +228,9 @@ public:
AssemblerDialect = i;
}
- void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) override;
- bool Warning(SMLoc L, const Twine &Msg,
- SMRange Range = std::nullopt) override;
- bool printError(SMLoc L, const Twine &Msg,
- SMRange Range = std::nullopt) override;
+ void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) override;
+ bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) override;
+ bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) override;
const AsmToken &Lex() override;
@@ -312,7 +310,7 @@ private:
void printMacroInstantiations();
void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
- SMRange Range = std::nullopt) const {
+ SMRange Range = {}) const {
ArrayRef<SMRange> Ranges(Range);
SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges);
}
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index 1a3752f..911d92c 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -695,15 +695,15 @@ bool ELFAsmParser::parseDirectivePrevious(StringRef DirName, SMLoc) {
static MCSymbolAttr MCAttrForString(StringRef Type) {
return StringSwitch<MCSymbolAttr>(Type)
- .Cases("STT_FUNC", "function", MCSA_ELF_TypeFunction)
- .Cases("STT_OBJECT", "object", MCSA_ELF_TypeObject)
- .Cases("STT_TLS", "tls_object", MCSA_ELF_TypeTLS)
- .Cases("STT_COMMON", "common", MCSA_ELF_TypeCommon)
- .Cases("STT_NOTYPE", "notype", MCSA_ELF_TypeNoType)
- .Cases("STT_GNU_IFUNC", "gnu_indirect_function",
- MCSA_ELF_TypeIndFunction)
- .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
- .Default(MCSA_Invalid);
+ .Cases({"STT_FUNC", "function"}, MCSA_ELF_TypeFunction)
+ .Cases({"STT_OBJECT", "object"}, MCSA_ELF_TypeObject)
+ .Cases({"STT_TLS", "tls_object"}, MCSA_ELF_TypeTLS)
+ .Cases({"STT_COMMON", "common"}, MCSA_ELF_TypeCommon)
+ .Cases({"STT_NOTYPE", "notype"}, MCSA_ELF_TypeNoType)
+ .Cases({"STT_GNU_IFUNC", "gnu_indirect_function"},
+ MCSA_ELF_TypeIndFunction)
+ .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
+ .Default(MCSA_Invalid);
}
/// parseDirectiveELFType
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
index 8a8f111..3a85770 100644
--- a/llvm/lib/MC/MCParser/MasmParser.cpp
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -483,11 +483,9 @@ public:
AssemblerDialect = i;
}
- void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) override;
- bool Warning(SMLoc L, const Twine &Msg,
- SMRange Range = std::nullopt) override;
- bool printError(SMLoc L, const Twine &Msg,
- SMRange Range = std::nullopt) override;
+ void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) override;
+ bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) override;
+ bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) override;
enum ExpandKind { ExpandMacros, DoNotExpandMacros };
const AsmToken &Lex(ExpandKind ExpandNextToken);
@@ -592,7 +590,7 @@ private:
bool expandStatement(SMLoc Loc);
void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
- SMRange Range = std::nullopt) const {
+ SMRange Range = {}) const {
ArrayRef<SMRange> Ranges(Range);
SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges);
}
@@ -5325,10 +5323,10 @@ void MasmParser::initializeDirectiveKindMap() {
bool MasmParser::isMacroLikeDirective() {
if (getLexer().is(AsmToken::Identifier)) {
bool IsMacroLike = StringSwitch<bool>(getTok().getIdentifier())
- .CasesLower("repeat", "rept", true)
+ .CasesLower({"repeat", "rept"}, true)
.CaseLower("while", true)
- .CasesLower("for", "irp", true)
- .CasesLower("forc", "irpc", true)
+ .CasesLower({"for", "irp"}, true)
+ .CasesLower({"forc", "irpc"}, true)
.Default(false);
if (IsMacroLike)
return true;
diff --git a/llvm/lib/Object/WindowsMachineFlag.cpp b/llvm/lib/Object/WindowsMachineFlag.cpp
index caf357e8..14c14f6 100644
--- a/llvm/lib/Object/WindowsMachineFlag.cpp
+++ b/llvm/lib/Object/WindowsMachineFlag.cpp
@@ -23,8 +23,8 @@ using namespace llvm;
COFF::MachineTypes llvm::getMachineType(StringRef S) {
// Flags must be a superset of Microsoft lib.exe /machine flags.
return StringSwitch<COFF::MachineTypes>(S.lower())
- .Cases("x64", "amd64", COFF::IMAGE_FILE_MACHINE_AMD64)
- .Cases("x86", "i386", COFF::IMAGE_FILE_MACHINE_I386)
+ .Cases({"x64", "amd64"}, COFF::IMAGE_FILE_MACHINE_AMD64)
+ .Cases({"x86", "i386"}, COFF::IMAGE_FILE_MACHINE_I386)
.Case("arm", COFF::IMAGE_FILE_MACHINE_ARMNT)
.Case("arm64", COFF::IMAGE_FILE_MACHINE_ARM64)
.Case("arm64ec", COFF::IMAGE_FILE_MACHINE_ARM64EC)
diff --git a/llvm/lib/Remarks/RemarkFormat.cpp b/llvm/lib/Remarks/RemarkFormat.cpp
index 1c52e35..f9fd4af 100644
--- a/llvm/lib/Remarks/RemarkFormat.cpp
+++ b/llvm/lib/Remarks/RemarkFormat.cpp
@@ -19,7 +19,7 @@ using namespace llvm::remarks;
Expected<Format> llvm::remarks::parseFormat(StringRef FormatStr) {
auto Result = StringSwitch<Format>(FormatStr)
- .Cases("", "yaml", Format::YAML)
+ .Cases({"", "yaml"}, Format::YAML)
.Case("bitstream", Format::Bitstream)
.Default(Format::Unknown);
diff --git a/llvm/lib/Support/AArch64BuildAttributes.cpp b/llvm/lib/Support/AArch64BuildAttributes.cpp
index 4a6b2fd..be4d1f1 100644
--- a/llvm/lib/Support/AArch64BuildAttributes.cpp
+++ b/llvm/lib/Support/AArch64BuildAttributes.cpp
@@ -67,8 +67,8 @@ StringRef AArch64BuildAttributes::getTypeStr(unsigned Type) {
}
SubsectionType AArch64BuildAttributes::getTypeID(StringRef Type) {
return StringSwitch<SubsectionType>(Type)
- .Cases("uleb128", "ULEB128", ULEB128)
- .Cases("ntbs", "NTBS", NTBS)
+ .Cases({"uleb128", "ULEB128"}, ULEB128)
+ .Cases({"ntbs", "NTBS"}, NTBS)
.Default(TYPE_NOT_FOUND);
}
StringRef AArch64BuildAttributes::getSubsectionTypeUnknownError() {
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index e21cf8e..e2645fa 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -269,12 +269,6 @@ bool APFloatBase::isRepresentableBy(const fltSemantics &A,
A.precision <= B.precision;
}
-constexpr RoundingMode APFloatBase::rmNearestTiesToEven;
-constexpr RoundingMode APFloatBase::rmTowardPositive;
-constexpr RoundingMode APFloatBase::rmTowardNegative;
-constexpr RoundingMode APFloatBase::rmTowardZero;
-constexpr RoundingMode APFloatBase::rmNearestTiesToAway;
-
/* A tight upper bound on number of parts required to hold the value
pow(5, power) is
diff --git a/llvm/lib/Support/BranchProbability.cpp b/llvm/lib/Support/BranchProbability.cpp
index e376344..ea42f34 100644
--- a/llvm/lib/Support/BranchProbability.cpp
+++ b/llvm/lib/Support/BranchProbability.cpp
@@ -111,3 +111,10 @@ uint64_t BranchProbability::scale(uint64_t Num) const {
uint64_t BranchProbability::scaleByInverse(uint64_t Num) const {
return ::scale<0>(Num, D, N);
}
+
+BranchProbability BranchProbability::pow(unsigned N) const {
+ BranchProbability Res = BranchProbability::getOne();
+ for (unsigned I = 0; I < N; ++I)
+ Res *= *this;
+ return Res;
+}
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index 648d6a5..da68994 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -421,8 +421,13 @@ bool sys::RemoveFileOnSignal(StringRef Filename, std::string *ErrMsg) {
return true;
}
- if (FilesToRemove == NULL)
+ if (FilesToRemove == NULL) {
FilesToRemove = new std::vector<std::string>;
+ std::atexit([]() {
+ delete FilesToRemove;
+ FilesToRemove = NULL;
+ });
+ }
FilesToRemove->push_back(std::string(Filename));
diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp
index 07b9989..d6f27fb 100644
--- a/llvm/lib/Support/raw_ostream.cpp
+++ b/llvm/lib/Support/raw_ostream.cpp
@@ -61,17 +61,6 @@
using namespace llvm;
-constexpr raw_ostream::Colors raw_ostream::BLACK;
-constexpr raw_ostream::Colors raw_ostream::RED;
-constexpr raw_ostream::Colors raw_ostream::GREEN;
-constexpr raw_ostream::Colors raw_ostream::YELLOW;
-constexpr raw_ostream::Colors raw_ostream::BLUE;
-constexpr raw_ostream::Colors raw_ostream::MAGENTA;
-constexpr raw_ostream::Colors raw_ostream::CYAN;
-constexpr raw_ostream::Colors raw_ostream::WHITE;
-constexpr raw_ostream::Colors raw_ostream::SAVEDCOLOR;
-constexpr raw_ostream::Colors raw_ostream::RESET;
-
raw_ostream::~raw_ostream() {
// raw_ostream's subclasses should take care to flush the buffer
// in their destructors.
diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp
index 3b510d3..f716317 100644
--- a/llvm/lib/Support/raw_socket_stream.cpp
+++ b/llvm/lib/Support/raw_socket_stream.cpp
@@ -332,7 +332,7 @@ ListeningSocket::~ListeningSocket() {
raw_socket_stream::raw_socket_stream(int SocketFD)
: raw_fd_stream(SocketFD, true) {}
-raw_socket_stream::~raw_socket_stream() {}
+raw_socket_stream::~raw_socket_stream() = default;
Expected<std::unique_ptr<raw_socket_stream>>
raw_socket_stream::createConnectedUnix(StringRef SocketPath) {
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index 30eae6e..e8e6469 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -682,8 +682,10 @@ tgtok::TokKind TGLexer::LexExclaim() {
.Case("instances", tgtok::XInstances)
.Case("substr", tgtok::XSubstr)
.Case("find", tgtok::XFind)
- .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
- .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
+ .Cases({"setdagop", "setop"},
+ tgtok::XSetDagOp) // !setop is deprecated.
+ .Cases({"getdagop", "getop"},
+ tgtok::XGetDagOp) // !getop is deprecated.
.Case("setdagopname", tgtok::XSetDagOpName)
.Case("getdagopname", tgtok::XGetDagOpName)
.Case("getdagarg", tgtok::XGetDagArg)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index fede586..655e818 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -308,9 +308,9 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
}
-bool AArch64TTIImpl::areTypesABICompatible(
- const Function *Caller, const Function *Callee,
- const ArrayRef<Type *> &Types) const {
+bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller,
+ const Function *Callee,
+ ArrayRef<Type *> Types) const {
if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
return false;
@@ -1032,6 +1032,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::experimental_vector_extract_last_active:
+ if (ST->isSVEorStreamingSVEAvailable()) {
+ auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
+ // This should turn into chained clastb instructions.
+ return LegalCost;
+ }
+ break;
default:
break;
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index fe2e849..b39546a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -84,7 +84,7 @@ public:
const Function *Callee) const override;
bool areTypesABICompatible(const Function *Caller, const Function *Callee,
- const ArrayRef<Type *> &Types) const override;
+ ArrayRef<Type *> Types) const override;
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
unsigned DefaultCallPenalty) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index aed325c..0c97741 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -224,13 +224,12 @@ bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
Register VCCReg = I.getOperand(1).getReg();
MachineInstr *Cmp;
- if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // Set SCC as a side effect with S_CMP or S_OR.
+ if (STI.hasScalarCompareEq64()) {
unsigned CmpOpc =
STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
} else {
- // For gfx7 and earlier, S_CMP_LG_U64 doesn't exist, so we use S_OR_B64
- // which sets SCC as a side effect.
Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
.addReg(VCCReg)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
index 1e6589e..d7d0292 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -58,6 +58,8 @@ class AMDGPULowerVGPREncoding {
static constexpr unsigned BitsPerField = 2;
static constexpr unsigned NumFields = 4;
static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
+ static constexpr unsigned ModeWidth = NumFields * BitsPerField;
+ static constexpr unsigned ModeMask = (1 << ModeWidth) - 1;
using ModeType = PackedVector<unsigned, BitsPerField,
std::bitset<BitsPerField * NumFields>>;
@@ -82,12 +84,12 @@ private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
+ // Current basic block.
+ MachineBasicBlock *MBB;
+
/// Most recent s_set_* instruction.
MachineInstr *MostRecentModeSet;
- /// Whether the current mode is known.
- bool CurrentModeKnown;
-
/// Current mode bits.
ModeTy CurrentMode;
@@ -108,10 +110,13 @@ private:
MachineInstr *Clause;
/// Insert mode change before \p I. \returns true if mode was changed.
- bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I);
+ bool setMode(ModeTy NewMode, ModeTy Mask,
+ MachineBasicBlock::instr_iterator I);
/// Reset mode to default.
- void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); }
+ void resetMode(MachineBasicBlock::instr_iterator I) {
+ setMode(ModeTy(), ModeTy::fullMask(), I);
+ }
/// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
@@ -130,38 +135,43 @@ private:
/// Check if an instruction \p I is within a clause and returns a suitable
/// iterator to insert mode change. It may also modify the S_CLAUSE
/// instruction to extend it or drop the clause if it cannot be adjusted.
- MachineInstr *handleClause(MachineInstr *I);
+ MachineBasicBlock::instr_iterator
+ handleClause(MachineBasicBlock::instr_iterator I);
};
bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
- MachineInstr *I) {
+ MachineBasicBlock::instr_iterator I) {
assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());
- if (CurrentModeKnown) {
- auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();
+ auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();
- if ((Delta & Mask.raw_bits()).none()) {
- CurrentMask |= Mask;
- return false;
- }
+ if ((Delta & Mask.raw_bits()).none()) {
+ CurrentMask |= Mask;
+ return false;
+ }
- if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
- CurrentMode |= NewMode;
- CurrentMask |= Mask;
+ if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
+ CurrentMode |= NewMode;
+ CurrentMask |= Mask;
- MostRecentModeSet->getOperand(0).setImm(CurrentMode);
- return true;
- }
+ MachineOperand &Op = MostRecentModeSet->getOperand(0);
+
+ // Carry old mode bits from the existing instruction.
+ int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth);
+
+ Op.setImm(CurrentMode | OldModeBits);
+ return true;
}
+ // Record previous mode into high 8 bits of the immediate.
+ int64_t OldModeBits = CurrentMode << ModeWidth;
+
I = handleClause(I);
- MostRecentModeSet =
- BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
- .addImm(NewMode);
+ MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
+ .addImm(NewMode | OldModeBits);
CurrentMode = NewMode;
CurrentMask = Mask;
- CurrentModeKnown = true;
return true;
}
@@ -233,21 +243,22 @@ bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
if (Ops.first) {
ModeTy NewMode, Mask;
computeMode(NewMode, Mask, MI, Ops.first, Ops.second);
- return setMode(NewMode, Mask, &MI);
+ return setMode(NewMode, Mask, MI.getIterator());
}
assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
return false;
}
-MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) {
+MachineBasicBlock::instr_iterator
+AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) {
if (!ClauseRemaining)
return I;
// A clause cannot start with a special instruction, place it right before
// the clause.
if (ClauseRemaining == ClauseLen) {
- I = Clause->getPrevNode();
+ I = Clause->getPrevNode()->getIterator();
assert(I->isBundle());
return I;
}
@@ -284,9 +295,9 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
ClauseLen = ClauseRemaining = 0;
CurrentMode.reset();
CurrentMask.reset();
- CurrentModeKnown = true;
for (auto &MBB : MF) {
MostRecentModeSet = nullptr;
+ this->MBB = &MBB;
for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) {
if (MI.isMetaInstruction())
@@ -294,17 +305,16 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
if (MI.isTerminator() || MI.isCall()) {
if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
- MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+ MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED)
CurrentMode.reset();
- CurrentModeKnown = true;
- } else
- resetMode(&MI);
+ else
+ resetMode(MI.getIterator());
continue;
}
if (MI.isInlineAsm()) {
if (TII->hasVGPRUses(MI))
- resetMode(&MI);
+ resetMode(MI.getIterator());
continue;
}
@@ -323,14 +333,8 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
--ClauseRemaining;
}
- // If we're falling through to a block that has at least one other
- // predecessor, we no longer know the mode.
- MachineBasicBlock *Next = MBB.getNextNode();
- if (Next && Next->pred_size() >= 2 &&
- llvm::is_contained(Next->predecessors(), &MBB)) {
- if (CurrentMode.raw_bits().any())
- CurrentModeKnown = false;
- }
+ // Reset the mode if we are falling through.
+ resetMode(MBB.instr_end());
}
return Changed;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 680e7eb..844649ebb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -412,7 +412,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
*OutStreamer);
if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
- unsigned V = MI->getOperand(0).getImm();
+ unsigned V = MI->getOperand(0).getImm() & 0xff;
OutStreamer->AddComment(
" msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) +
" src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index e187959..907f830 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
@@ -34,9 +35,17 @@
using namespace llvm;
using namespace AMDGPU;
+using namespace llvm::MIPatternMatch;
namespace {
+// AMDGPU-specific pattern matchers
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>
+m_GAMDGPUReadAnyLane(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src);
+}
+
class AMDGPURegBankLegalize : public MachineFunctionPass {
public:
static char ID;
@@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
// Src = G_AMDGPU_READANYLANE RALSrc
- auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
- if (RAL)
+ Register RALSrc;
+ if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc))))
return RALSrc;
+ // TruncSrc = G_AMDGPU_READANYLANE RALSrc
+ // AextSrc = G_TRUNC TruncSrc
+ // Src = G_ANYEXT AextSrc
+ if (mi_match(Src, MRI,
+ m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) {
+ return RALSrc;
+ }
+
// LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
// LoSgpr = G_AMDGPU_READANYLANE LoVgpr
// HiSgpr = G_AMDGPU_READANYLANE HiVgpr
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index b84c30e..dc8fa7f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -626,6 +626,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
MI.eraseFromParent();
}
+void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+ assert(MRI.getType(Dst) == V2S16);
+ auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg());
+ auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg());
+ unsigned Opc = MI.getOpcode();
+ auto Flags = MI.getFlags();
+ auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32);
+ auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32);
+ auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32);
+ auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32);
+ auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
+ auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
+ B.buildMergeLikeInstr(Dst, {Lo, Hi});
+ MI.eraseFromParent();
+}
+
void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
@@ -698,6 +715,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
return lowerUnpackBitShift(MI);
case UnpackMinMax:
return lowerUnpackMinMax(MI);
+ case ScalarizeToS16:
+ return lowerSplitTo16(MI);
case Ext32To64: {
const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
MachineInstrBuilder Hi;
@@ -849,6 +868,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
return LLT::scalar(32);
case Sgpr64:
case Vgpr64:
+ case UniInVgprS64:
return LLT::scalar(64);
case Sgpr128:
case Vgpr128:
@@ -972,6 +992,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case UniInVcc:
case UniInVgprS16:
case UniInVgprS32:
+ case UniInVgprS64:
case UniInVgprV2S16:
case UniInVgprV4S32:
case UniInVgprB32:
@@ -1104,6 +1125,7 @@ void RegBankLegalizeHelper::applyMappingDst(
break;
}
case UniInVgprS32:
+ case UniInVgprS64:
case UniInVgprV2S16:
case UniInVgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index ad3ff1d..e7598f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -72,6 +72,7 @@ class RegBankLegalizeHelper {
static constexpr LLT P6 = LLT::pointer(6, 32);
MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32};
+ MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16};
MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32};
MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1};
@@ -121,6 +122,7 @@ private:
void lowerV_BFE(MachineInstr &MI);
void lowerS_BFE(MachineInstr &MI);
void lowerSplitTo32(MachineInstr &MI);
+ void lowerSplitTo16(MachineInstr &MI);
void lowerSplitTo32Select(MachineInstr &MI);
void lowerSplitTo32SExtInReg(MachineInstr &MI);
void lowerUnpackMinMax(MachineInstr &MI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 01abd35..103cdec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -913,14 +913,26 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
- addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}});
+ addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
+ .Uni(S64, {{Sgpr64}, {}});
bool hasSALUFloat = ST->hasSALUFloatInsts();
addRulesForGOpcs({G_FADD}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
+ .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
.Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
- .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16},
+ hasSALUFloat)
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
+ .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});
addRulesForGOpcs({G_FPTOUI})
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 030bd75..e6df5d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID {
V4S32,
UniV2S16,
+ UniV2S32,
DivV2S16,
+ DivV2S32,
// B types
B32,
@@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID {
UniInVcc,
UniInVgprS16,
UniInVgprS32,
+ UniInVgprS64,
UniInVgprV2S16,
+ UniInVgprV2S32,
UniInVgprV4S32,
UniInVgprB32,
UniInVgprB64,
@@ -217,6 +221,7 @@ enum LoweringMethodID {
V_BFE,
VgprToVccCopy,
SplitTo32,
+ ScalarizeToS16,
SplitTo32Select,
SplitTo32SExtInReg,
Ext32To64,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b28c50e..b87b54f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -816,7 +816,7 @@ parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
Params.consume_front("strategy=");
auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
.Case("dpp", ScanOptions::DPP)
- .Cases("iterative", "", ScanOptions::Iterative)
+ .Cases({"iterative", ""}, ScanOptions::Iterative)
.Case("none", ScanOptions::None)
.Default(std::nullopt);
if (Result)
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 013cfeb..28b4da8 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -168,7 +168,7 @@ bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) {
if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12)
- VgprMSBs = Inst.getOperand(0).getImm();
+ VgprMSBs = Inst.getOperand(0).getImm() & 0xff;
else if (isTerminator(Inst))
VgprMSBs = 0;
}
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index d950131..65dce74 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -2116,8 +2116,10 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p
let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
+ // Set VADDR4 to NULL
+ let vaddr4 = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+
// set to 0 based on SPG.
- let vaddr4 = 0;
let rsrc = 0;
let vdata = 0;
let d16 = 0;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6dcbced..b7fa899 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1288,18 +1288,38 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
}
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+ // On entry to a block with multiple predescessors, there may
+ // be pending SMEM and VMEM events active at the same time.
+ // In such cases, only clear one active event at a time.
+ auto applyPendingXcntGroup = [this](unsigned E) {
+ unsigned LowerBound = getScoreLB(X_CNT);
+ applyWaitcnt(X_CNT, 0);
+ PendingEvents |= (1 << E);
+ setScoreLB(X_CNT, LowerBound);
+ };
+
// Wait on XCNT is redundant if we are already waiting for a load to complete.
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
// zero.
- if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
- return applyWaitcnt(X_CNT, 0);
+ if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
+ if (hasPendingEvent(VMEM_GROUP))
+ applyPendingXcntGroup(VMEM_GROUP);
+ else
+ applyWaitcnt(X_CNT, 0);
+ return;
+ }
// If we have pending store we cannot optimize XCnt because we do not wait for
// stores. VMEM loads retun in order, so if we only have loads XCnt is
// decremented to the same number as LOADCnt.
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
- !hasPendingEvent(STORE_CNT))
- return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ !hasPendingEvent(STORE_CNT)) {
+ if (hasPendingEvent(SMEM_GROUP))
+ applyPendingXcntGroup(SMEM_GROUP);
+ else
+ applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ return;
+ }
applyWaitcnt(X_CNT, Wait.XCnt);
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d930a21..d9f76c9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10618,6 +10618,42 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
return false;
}
+// SCC is already valid after SCCValid.
+// SCCRedefine will redefine SCC to the same value already available after
+// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
+// update kill/dead flags if necessary.
+static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
+ const SIRegisterInfo &RI) {
+ MachineInstr *KillsSCC = nullptr;
+ for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
+ SCCRedefine->getIterator())) {
+ if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+ return false;
+ if (MI.killsRegister(AMDGPU::SCC, &RI))
+ KillsSCC = &MI;
+ }
+ if (MachineOperand *SccDef =
+ SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
+ SccDef->setIsDead(false);
+ if (KillsSCC)
+ KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
+ SCCRedefine->eraseFromParent();
+ return true;
+}
+
+static bool foldableSelect(const MachineInstr &Def) {
+ if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
+ Def.getOpcode() != AMDGPU::S_CSELECT_B64)
+ return false;
+ bool Op1IsNonZeroImm =
+ Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
+ bool Op2IsZeroImm =
+ Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
+ if (!Op1IsNonZeroImm || !Op2IsZeroImm)
+ return false;
+ return true;
+}
+
bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
Register SrcReg2, int64_t CmpMask,
int64_t CmpValue,
@@ -10637,19 +10673,6 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (!Def || Def->getParent() != CmpInstr.getParent())
return false;
- const auto foldableSelect = [](MachineInstr *Def) -> bool {
- if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
- Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
- bool Op1IsNonZeroImm =
- Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
- bool Op2IsZeroImm =
- Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
- if (Op1IsNonZeroImm && Op2IsZeroImm)
- return true;
- }
- return false;
- };
-
// For S_OP that set SCC = DST!=0, do the transformation
//
// s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
@@ -10660,24 +10683,12 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
//
// s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
// imm), 0)
- if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def))
+ if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
return false;
- MachineInstr *KillsSCC = nullptr;
- for (MachineInstr &MI :
- make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
- if (MI.modifiesRegister(AMDGPU::SCC, &RI))
- return false;
- if (MI.killsRegister(AMDGPU::SCC, &RI))
- KillsSCC = &MI;
- }
+ if (!optimizeSCC(Def, &CmpInstr, RI))
+ return false;
- if (MachineOperand *SccDef =
- Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
- SccDef->setIsDead(false);
- if (KillsSCC)
- KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
- CmpInstr.eraseFromParent();
return true;
};
@@ -10755,21 +10766,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- MachineInstr *KillsSCC = nullptr;
- for (MachineInstr &MI :
- make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
- if (MI.modifiesRegister(AMDGPU::SCC, &RI))
- return false;
- if (MI.killsRegister(AMDGPU::SCC, &RI))
- KillsSCC = &MI;
- }
-
- MachineOperand *SccDef =
- Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
- SccDef->setIsDead(false);
- if (KillsSCC)
- KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
- CmpInstr.eraseFromParent();
+ if (!optimizeSCC(Def, &CmpInstr, RI))
+ return false;
if (!MRI->use_nodbg_empty(DefReg)) {
assert(!IsReversedCC);
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index ebfa593..bf7c962f 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -47,9 +47,7 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
// Only use a specialized AEABI function if the default version of this
// Libcall is an AEABI function.
- if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
- return SDValue();
-
+ //
// Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
// able to translate memset to memclr and use the value to index the function
// name array.
@@ -61,12 +59,21 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
} AEABILibcall;
switch (LC) {
case RTLIB::MEMCPY:
+ if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memcpy)
+ return SDValue();
+
AEABILibcall = AEABI_MEMCPY;
break;
case RTLIB::MEMMOVE:
+ if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memmove)
+ return SDValue();
+
AEABILibcall = AEABI_MEMMOVE;
break;
case RTLIB::MEMSET:
+ if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memset)
+ return SDValue();
+
AEABILibcall = AEABI_MEMSET;
if (isNullConstant(Src))
AEABILibcall = AEABI_MEMCLR;
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index f60660b..1bb670d 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -426,15 +426,15 @@ class ARMAsmParser : public MCTargetAsmParser {
VPTState.CurPosition = ~0U;
}
- void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) {
+ void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) {
return getParser().Note(L, Msg, Range);
}
- bool Warning(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) {
+ bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) {
return getParser().Warning(L, Msg, Range);
}
- bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) {
+ bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}) {
return getParser().Error(L, Msg, Range);
}
diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
index e5b4f6e..08f196b 100644
--- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
+++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
@@ -884,13 +884,13 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
.Case("{t4}", CSKY::R20)
.Case("{t5}", CSKY::R21)
.Case("{t6}", CSKY::R22)
- .Cases("{t7}", "{fp}", CSKY::R23)
- .Cases("{t8}", "{top}", CSKY::R24)
- .Cases("{t9}", "{bsp}", CSKY::R25)
+ .Cases({"{t7}", "{fp}"}, CSKY::R23)
+ .Cases({"{t8}", "{top}"}, CSKY::R24)
+ .Cases({"{t9}", "{bsp}"}, CSKY::R25)
.Case("{r26}", CSKY::R26)
.Case("{r27}", CSKY::R27)
- .Cases("{gb}", "{rgb}", "{rdb}", CSKY::R28)
- .Cases("{tb}", "{rtb}", CSKY::R29)
+ .Cases({"{gb}", "{rgb}", "{rdb}"}, CSKY::R28)
+ .Cases({"{tb}", "{rtb}"}, CSKY::R29)
.Case("{svbr}", CSKY::R30)
.Case("{tls}", CSKY::R31)
.Default(CSKY::NoRegister);
@@ -907,38 +907,38 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// use the ABI names in register constraint lists.
if (Subtarget.useHardFloat()) {
unsigned FReg = StringSwitch<unsigned>(Constraint.lower())
- .Cases("{fr0}", "{vr0}", CSKY::F0_32)
- .Cases("{fr1}", "{vr1}", CSKY::F1_32)
- .Cases("{fr2}", "{vr2}", CSKY::F2_32)
- .Cases("{fr3}", "{vr3}", CSKY::F3_32)
- .Cases("{fr4}", "{vr4}", CSKY::F4_32)
- .Cases("{fr5}", "{vr5}", CSKY::F5_32)
- .Cases("{fr6}", "{vr6}", CSKY::F6_32)
- .Cases("{fr7}", "{vr7}", CSKY::F7_32)
- .Cases("{fr8}", "{vr8}", CSKY::F8_32)
- .Cases("{fr9}", "{vr9}", CSKY::F9_32)
- .Cases("{fr10}", "{vr10}", CSKY::F10_32)
- .Cases("{fr11}", "{vr11}", CSKY::F11_32)
- .Cases("{fr12}", "{vr12}", CSKY::F12_32)
- .Cases("{fr13}", "{vr13}", CSKY::F13_32)
- .Cases("{fr14}", "{vr14}", CSKY::F14_32)
- .Cases("{fr15}", "{vr15}", CSKY::F15_32)
- .Cases("{fr16}", "{vr16}", CSKY::F16_32)
- .Cases("{fr17}", "{vr17}", CSKY::F17_32)
- .Cases("{fr18}", "{vr18}", CSKY::F18_32)
- .Cases("{fr19}", "{vr19}", CSKY::F19_32)
- .Cases("{fr20}", "{vr20}", CSKY::F20_32)
- .Cases("{fr21}", "{vr21}", CSKY::F21_32)
- .Cases("{fr22}", "{vr22}", CSKY::F22_32)
- .Cases("{fr23}", "{vr23}", CSKY::F23_32)
- .Cases("{fr24}", "{vr24}", CSKY::F24_32)
- .Cases("{fr25}", "{vr25}", CSKY::F25_32)
- .Cases("{fr26}", "{vr26}", CSKY::F26_32)
- .Cases("{fr27}", "{vr27}", CSKY::F27_32)
- .Cases("{fr28}", "{vr28}", CSKY::F28_32)
- .Cases("{fr29}", "{vr29}", CSKY::F29_32)
- .Cases("{fr30}", "{vr30}", CSKY::F30_32)
- .Cases("{fr31}", "{vr31}", CSKY::F31_32)
+ .Cases({"{fr0}", "{vr0}"}, CSKY::F0_32)
+ .Cases({"{fr1}", "{vr1}"}, CSKY::F1_32)
+ .Cases({"{fr2}", "{vr2}"}, CSKY::F2_32)
+ .Cases({"{fr3}", "{vr3}"}, CSKY::F3_32)
+ .Cases({"{fr4}", "{vr4}"}, CSKY::F4_32)
+ .Cases({"{fr5}", "{vr5}"}, CSKY::F5_32)
+ .Cases({"{fr6}", "{vr6}"}, CSKY::F6_32)
+ .Cases({"{fr7}", "{vr7}"}, CSKY::F7_32)
+ .Cases({"{fr8}", "{vr8}"}, CSKY::F8_32)
+ .Cases({"{fr9}", "{vr9}"}, CSKY::F9_32)
+ .Cases({"{fr10}", "{vr10}"}, CSKY::F10_32)
+ .Cases({"{fr11}", "{vr11}"}, CSKY::F11_32)
+ .Cases({"{fr12}", "{vr12}"}, CSKY::F12_32)
+ .Cases({"{fr13}", "{vr13}"}, CSKY::F13_32)
+ .Cases({"{fr14}", "{vr14}"}, CSKY::F14_32)
+ .Cases({"{fr15}", "{vr15}"}, CSKY::F15_32)
+ .Cases({"{fr16}", "{vr16}"}, CSKY::F16_32)
+ .Cases({"{fr17}", "{vr17}"}, CSKY::F17_32)
+ .Cases({"{fr18}", "{vr18}"}, CSKY::F18_32)
+ .Cases({"{fr19}", "{vr19}"}, CSKY::F19_32)
+ .Cases({"{fr20}", "{vr20}"}, CSKY::F20_32)
+ .Cases({"{fr21}", "{vr21}"}, CSKY::F21_32)
+ .Cases({"{fr22}", "{vr22}"}, CSKY::F22_32)
+ .Cases({"{fr23}", "{vr23}"}, CSKY::F23_32)
+ .Cases({"{fr24}", "{vr24}"}, CSKY::F24_32)
+ .Cases({"{fr25}", "{vr25}"}, CSKY::F25_32)
+ .Cases({"{fr26}", "{vr26}"}, CSKY::F26_32)
+ .Cases({"{fr27}", "{vr27}"}, CSKY::F27_32)
+ .Cases({"{fr28}", "{vr28}"}, CSKY::F28_32)
+ .Cases({"{fr29}", "{vr29}"}, CSKY::F29_32)
+ .Cases({"{fr30}", "{vr30}"}, CSKY::F30_32)
+ .Cases({"{fr31}", "{vr31}"}, CSKY::F31_32)
.Default(CSKY::NoRegister);
if (FReg != CSKY::NoRegister) {
assert(CSKY::F0_32 <= FReg && FReg <= CSKY::F31_32 && "Unknown fp-reg");
diff --git a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp
index 15def36..b6bbb20 100644
--- a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp
+++ b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp
@@ -52,6 +52,7 @@ void DXILAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
emitGlobalConstant(GV->getDataLayout(), GV->getInitializer());
}
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeDirectXAsmPrinter() {
RegisterAsmPrinter<DXILAsmPrinter> X(getTheDirectXTarget());
}
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index bcf8440..84b1a31 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -53,7 +53,8 @@
using namespace llvm;
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeDirectXTarget() {
RegisterTargetMachine<DirectXTargetMachine> X(getTheDirectXTarget());
auto *PR = PassRegistry::getPassRegistry();
initializeDXILIntrinsicExpansionLegacyPass(*PR);
diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
index 9a14c01..62ad014 100644
--- a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
+++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
@@ -132,7 +132,8 @@ static MCRegisterInfo *createDirectXMCRegisterInfo(const Triple &Triple) {
static MCInstrInfo *createDirectXMCInstrInfo() { return new MCInstrInfo(); }
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeDirectXTargetMC() {
Target &T = getTheDirectXTarget();
RegisterMCAsmInfo<DirectXMCAsmInfo> X(T);
TargetRegistry::RegisterMCInstrInfo(T, createDirectXMCInstrInfo);
diff --git a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp
index ae01626..934bd1b 100644
--- a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp
+++ b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp
@@ -24,7 +24,8 @@ Target &getTheDirectXTarget() {
using namespace llvm;
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeDirectXTargetInfo() {
RegisterTarget<Triple::dxil, /*HasJIT=*/false> X(
getTheDirectXTarget(), "dxil", "DirectX Intermediate Language", "DXIL");
}
diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
index f4e36fa7..e661c94 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
@@ -26,6 +26,7 @@ def tc_20a4bbec : InstrItinClass;
def tc_227864f7 : InstrItinClass;
def tc_257f6f7c : InstrItinClass;
def tc_26a377fe : InstrItinClass;
+def tc_2a698a03 : InstrItinClass;
def tc_2b4c548e : InstrItinClass;
def tc_2c745bb8 : InstrItinClass;
def tc_2d4051cd : InstrItinClass;
@@ -52,6 +53,7 @@ def tc_561aaa58 : InstrItinClass;
def tc_56c4f9fe : InstrItinClass;
def tc_56e64202 : InstrItinClass;
def tc_58d21193 : InstrItinClass;
+def tc_57a4709c : InstrItinClass;
def tc_5bf8afbb : InstrItinClass;
def tc_5cdf8c84 : InstrItinClass;
def tc_61bf7c03 : InstrItinClass;
@@ -220,6 +222,11 @@ class DepHVXItinV55 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -356,6 +363,11 @@ class DepHVXItinV55 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -812,6 +824,11 @@ class DepHVXItinV60 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -948,6 +965,11 @@ class DepHVXItinV60 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -1404,6 +1426,11 @@ class DepHVXItinV62 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -1540,6 +1567,11 @@ class DepHVXItinV62 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -1996,6 +2028,11 @@ class DepHVXItinV65 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -2132,6 +2169,11 @@ class DepHVXItinV65 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -2588,6 +2630,11 @@ class DepHVXItinV66 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -2724,6 +2771,11 @@ class DepHVXItinV66 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -3180,6 +3232,11 @@ class DepHVXItinV67 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -3316,6 +3373,11 @@ class DepHVXItinV67 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -3772,6 +3834,11 @@ class DepHVXItinV68 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -3908,6 +3975,11 @@ class DepHVXItinV68 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -4364,6 +4436,11 @@ class DepHVXItinV69 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -4500,6 +4577,11 @@ class DepHVXItinV69 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -4956,6 +5038,11 @@ class DepHVXItinV71 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -5092,6 +5179,11 @@ class DepHVXItinV71 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -5548,6 +5640,11 @@ class DepHVXItinV73 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -5684,6 +5781,11 @@ class DepHVXItinV73 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -6140,6 +6242,11 @@ class DepHVXItinV75 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -6276,6 +6383,11 @@ class DepHVXItinV75 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -6732,6 +6844,11 @@ class DepHVXItinV79 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -6868,6 +6985,11 @@ class DepHVXItinV79 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -7324,6 +7446,11 @@ class DepHVXItinV81 {
InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
[HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+ InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
[InstrStage<1, [SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -7460,6 +7587,11 @@ class DepHVXItinV81 {
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+ InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
InstrStage<1, [CVI_XLANE]>], [9, 2],
diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
index f8f1c2a..b188134 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -29939,6 +29939,58 @@ let opNewValue = 0;
let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
+def V6_vabs_qf16_hf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf16 = vabs($Vu32.hf)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabs_qf16_qf16 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf16 = vabs($Vu32.qf16)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabs_qf32_qf32 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf32 = vabs($Vu32.qf32)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabs_qf32_sf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf32 = vabs($Vu32.sf)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
def V6_vabs_sf : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32),
@@ -31302,6 +31354,21 @@ let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
}
+def V6_valign4 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = valign4($Vu32,$Vv32,$Rt8)",
+tc_57a4709c, TypeCVI_VA>, Enc_a30110, Requires<[UseHVXV81]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
def V6_valignb : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
@@ -32583,6 +32650,32 @@ let isCVI = 1;
let hasHvxTmp = 1;
let DecoderNamespace = "EXT_mmvec";
}
+def V6_vconv_bf_qf32 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxWR:$Vuu32),
+"$Vd32.bf = $Vuu32.qf32",
+tc_2a698a03, TypeCVI_VS>, Enc_a33d04, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vconv_f8_qf16 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.f8 = $Vu32.qf16",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
def V6_vconv_h_hf : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32),
@@ -32596,6 +32689,19 @@ let opNewValue = 0;
let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
+def V6_vconv_h_hf_rnd : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.h = $Vu32.hf:rnd",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
def V6_vconv_hf_h : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32),
@@ -32635,6 +32741,71 @@ let opNewValue = 0;
let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
+def V6_vconv_qf16_f8 : HInst<
+(outs HvxWR:$Vdd32),
+(ins HvxVR:$Vu32),
+"$Vdd32.qf16 = $Vu32.f8",
+tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vconv_qf16_hf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf16 = $Vu32.hf",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vconv_qf16_qf16 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf16 = $Vu32.qf16",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vconv_qf32_qf32 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf32 = $Vu32.qf32",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vconv_qf32_sf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf32 = $Vu32.sf",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
def V6_vconv_sf_qf32 : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32),
@@ -33720,6 +33891,122 @@ let isHVXALU2SRC = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
+def V6_veqhf : HInst<
+(outs HvxQR:$Qd4),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.hf,$Vv32.hf)",
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b000111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_veqhf_and : HInst<
+(outs HvxQR:$Qx4),
+(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.hf,$Vv32.hf)",
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b000111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqhf_or : HInst<
+(outs HvxQR:$Qx4),
+(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.hf,$Vv32.hf)",
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b010111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let isAccumulator = 1;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqhf_xor : HInst<
+(outs HvxQR:$Qx4),
+(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.hf,$Vv32.hf)",
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b100111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqsf : HInst<
+(outs HvxQR:$Qd4),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.sf,$Vv32.sf)",
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b000011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_veqsf_and : HInst<
+(outs HvxQR:$Qx4),
+(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.sf,$Vv32.sf)",
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b000011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqsf_or : HInst<
+(outs HvxQR:$Qx4),
+(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.sf,$Vv32.sf)",
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b010011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let isAccumulator = 1;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqsf_xor : HInst<
+(outs HvxQR:$Qx4),
+(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.sf,$Vv32.sf)",
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b100011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
def V6_veqw : HInst<
(outs HvxQR:$Qd4),
(ins HvxVR:$Vu32, HvxVR:$Vv32),
@@ -34538,6 +34825,58 @@ let Inst{31-24} = 0b00011110;
let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
+def V6_vilog2_hf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.w = vilog2($Vu32.hf)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vilog2_qf16 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.w = vilog2($Vu32.qf16)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vilog2_qf32 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.w = vilog2($Vu32.qf32)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vilog2_sf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.w = vilog2($Vu32.sf)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
def V6_vinsertwr : HInst<
(outs HvxVR:$Vx32),
(ins HvxVR:$Vx32in, IntRegs:$Rt32),
@@ -37170,6 +37509,58 @@ let isCVI = 1;
let isHVXALU = 1;
let DecoderNamespace = "EXT_mmvec";
}
+def V6_vneg_qf16_hf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf16 = vneg($Vu32.hf)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vneg_qf16_qf16 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf16 = vneg($Vu32.qf16)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vneg_qf32_qf32 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf32 = vneg($Vu32.qf32)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vneg_qf32_sf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf32 = vneg($Vu32.sf)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
def V6_vnormamth : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32),
diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
index 23f4b3a..c11483b 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
@@ -3830,6 +3830,122 @@ def: Pat<(int_hexagon_V6_vsub_hf_f8_128B HvxVR:$src1, HvxVR:$src2),
// V81 HVX Instructions.
+def: Pat<(int_hexagon_V6_vabs_qf16_hf HvxVR:$src1),
+ (V6_vabs_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf16_hf_128B HvxVR:$src1),
+ (V6_vabs_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf16_qf16 HvxVR:$src1),
+ (V6_vabs_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf16_qf16_128B HvxVR:$src1),
+ (V6_vabs_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf32_qf32 HvxVR:$src1),
+ (V6_vabs_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf32_qf32_128B HvxVR:$src1),
+ (V6_vabs_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf32_sf HvxVR:$src1),
+ (V6_vabs_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf32_sf_128B HvxVR:$src1),
+ (V6_vabs_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[UseHVXV81, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_valign4_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[UseHVXV81, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vconv_bf_qf32 HvxWR:$src1),
+ (V6_vconv_bf_qf32 HvxWR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_bf_qf32_128B HvxWR:$src1),
+ (V6_vconv_bf_qf32 HvxWR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_f8_qf16 HvxVR:$src1),
+ (V6_vconv_f8_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_f8_qf16_128B HvxVR:$src1),
+ (V6_vconv_f8_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_h_hf_rnd HvxVR:$src1),
+ (V6_vconv_h_hf_rnd HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vconv_h_hf_rnd_128B HvxVR:$src1),
+ (V6_vconv_h_hf_rnd HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vconv_qf16_f8 HvxVR:$src1),
+ (V6_vconv_qf16_f8 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf16_f8_128B HvxVR:$src1),
+ (V6_vconv_qf16_f8 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf16_hf HvxVR:$src1),
+ (V6_vconv_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf16_hf_128B HvxVR:$src1),
+ (V6_vconv_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf16_qf16 HvxVR:$src1),
+ (V6_vconv_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf16_qf16_128B HvxVR:$src1),
+ (V6_vconv_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf32_qf32 HvxVR:$src1),
+ (V6_vconv_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf32_qf32_128B HvxVR:$src1),
+ (V6_vconv_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf32_sf HvxVR:$src1),
+ (V6_vconv_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf32_sf_128B HvxVR:$src1),
+ (V6_vconv_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf HvxVR:$src1, HvxVR:$src2),
+ (V6_veqhf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_veqhf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf HvxVR:$src1, HvxVR:$src2),
+ (V6_veqsf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_veqsf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_hf HvxVR:$src1),
+ (V6_vilog2_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_hf_128B HvxVR:$src1),
+ (V6_vilog2_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_qf16 HvxVR:$src1),
+ (V6_vilog2_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_qf16_128B HvxVR:$src1),
+ (V6_vilog2_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_qf32 HvxVR:$src1),
+ (V6_vilog2_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_qf32_128B HvxVR:$src1),
+ (V6_vilog2_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_sf HvxVR:$src1),
+ (V6_vilog2_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_sf_128B HvxVR:$src1),
+ (V6_vilog2_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf16_hf HvxVR:$src1),
+ (V6_vneg_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf16_hf_128B HvxVR:$src1),
+ (V6_vneg_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf16_qf16 HvxVR:$src1),
+ (V6_vneg_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf16_qf16_128B HvxVR:$src1),
+ (V6_vneg_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf32_qf32 HvxVR:$src1),
+ (V6_vneg_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf32_qf32_128B HvxVR:$src1),
+ (V6_vneg_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf32_sf HvxVR:$src1),
+ (V6_vneg_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf32_sf_128B HvxVR:$src1),
+ (V6_vneg_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
def: Pat<(int_hexagon_V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2),
(V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
def: Pat<(int_hexagon_V6_vsub_hf_mix_128B HvxVR:$src1, HvxVR:$src2),
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index 85ce944..e40dbd2 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -3434,6 +3434,19 @@ let AddedComplexity = 100 in {
(C2_not (S4_stored_locked I32:$Rs, I64:$Rt))>;
}
+multiclass FloatClass<SDPatternOperator IntOp, InstHexagon MI,
+ PatFrag RegPred> {
+ let AddedComplexity = 100 in {
+ def: Pat<(i1 (seteq (IntOp RegPred:$Rs, u5_0ImmPred_timm:$u5), 0)),
+ (C2_not (MI RegPred:$Rs, u5_0ImmPred_timm:$u5))>;
+ def: Pat<(i1 (setne (IntOp RegPred:$Rs, u5_0ImmPred_timm:$u5), 0)),
+ (MI RegPred:$Rs, u5_0ImmPred_timm:$u5)>;
+ }
+}
+
+defm : FloatClass<int_hexagon_F2_sfclass, F2_sfclass, F32>;
+defm : FloatClass<int_hexagon_F2_dfclass, F2_dfclass, F64>;
+
def: Pat<(int_hexagon_instrprof_custom (HexagonAtPcrel tglobaladdr:$addr), u32_0ImmPred:$I),
(PS_call_instrprof_custom tglobaladdr:$addr, imm:$I)>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index a6de839..904aabed 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -371,6 +371,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
ISD::SETUGE, ISD::SETUGT},
VT, Expand);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
}
setOperationAction(ISD::CTPOP, GRLenVT, Legal);
setOperationAction(ISD::FCEIL, {MVT::f32, MVT::f64}, Legal);
@@ -453,6 +457,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
ISD::SETUGE, ISD::SETUGT},
VT, Expand);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
}
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index ca4ee5f..610ba05 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -2424,6 +2424,12 @@ def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm),
def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm),
(XVPICKVE_D v4f64:$xj, (to_valid_timm timm:$imm))>;
+// Vector floating-point conversion
+defm : PatXrF<fceil, "XVFRINTRP">;
+defm : PatXrF<ffloor, "XVFRINTRM">;
+defm : PatXrF<ftrunc, "XVFRINTRZ">;
+defm : PatXrF<froundeven, "XVFRINTRNE">;
+
// load
def : Pat<(int_loongarch_lasx_xvld GPR:$rj, timm:$imm),
(XVLD GPR:$rj, (to_valid_timm timm:$imm))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 92402ba..6470842 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -2552,6 +2552,11 @@ def : Pat<(f64 (froundeven FPR64:$fj)),
(f64 (EXTRACT_SUBREG (VFRINTRNE_D (VREPLVEI_D
(SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), 0)), sub_64))>;
+defm : PatVrF<fceil, "VFRINTRP">;
+defm : PatVrF<ffloor, "VFRINTRM">;
+defm : PatVrF<ftrunc, "VFRINTRZ">;
+defm : PatVrF<froundeven, "VFRINTRNE">;
+
// load
def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm),
(VLD GPR:$rj, (to_valid_timm timm:$imm))>;
diff --git a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
index e37f3a66..fb5cd5c2 100644
--- a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
+++ b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
@@ -690,9 +690,9 @@ bool M68kAsmParser::parseRegisterName(MCRegister &RegNo, SMLoc Loc,
} else {
// Floating point control register.
RegNo = StringSwitch<unsigned>(RegisterNameLower)
- .Cases("fpc", "fpcr", M68k::FPC)
- .Cases("fps", "fpsr", M68k::FPS)
- .Cases("fpi", "fpiar", M68k::FPIAR)
+ .Cases({"fpc", "fpcr"}, M68k::FPC)
+ .Cases({"fps", "fpsr"}, M68k::FPS)
+ .Cases({"fpi", "fpiar"}, M68k::FPIAR)
.Default(M68k::NoRegister);
assert(RegNo != M68k::NoRegister &&
"Unrecognized FP control register name");
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
index fe83dc6..51bafe4 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
@@ -49,7 +49,7 @@ public:
M68kAsmBackend(const Target &T, const MCSubtargetInfo &STI)
: MCAsmBackend(llvm::endianness::big),
Allows32BitBranch(llvm::StringSwitch<bool>(STI.getCPU())
- .CasesLower("m68020", "m68030", "m68040", true)
+ .CasesLower({"m68020", "m68030", "m68040"}, true)
.Default(false)) {}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &,
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 97379d7..f588e56 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -6176,7 +6176,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) {
CC = StringSwitch<unsigned>(Name)
.Case("zero", 0)
- .Cases("at", "AT", 1)
+ .Cases({"at", "AT"}, 1)
.Case("a0", 4)
.Case("a1", 5)
.Case("a2", 6)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 7e7ee75..c667a09 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1871,17 +1871,6 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
(is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
: (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
-#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
- [&]() -> auto { \
- if (is_mc && is_ch) \
- return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \
- if (is_ch) \
- return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \
- if (is_mc) \
- return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \
- return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
- }()
-
static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
bool IsShared32,
bool IsCacheHint,
@@ -1925,112 +1914,6 @@ static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
}
}
-static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
- bool IsMultiCast,
- bool IsCacheHint, bool IsIm2Col) {
- if (IsIm2Col) {
- switch (Dim) {
- case 3:
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast,
- IsCacheHint, IsShared32);
- case 4:
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast,
- IsCacheHint, IsShared32);
- case 5:
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast,
- IsCacheHint, IsShared32);
- default:
- llvm_unreachable("Invalid Dimension in im2col mode for "
- "GetCpAsyncBulkTensorG2SOpcode.");
- }
- } else {
- switch (Dim) {
- case 1:
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast,
- IsCacheHint, IsShared32);
- case 2:
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast,
- IsCacheHint, IsShared32);
- case 3:
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast,
- IsCacheHint, IsShared32);
- case 4:
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast,
- IsCacheHint, IsShared32);
- case 5:
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast,
- IsCacheHint, IsShared32);
- default:
- llvm_unreachable(
- "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
- }
- }
-}
-
-static size_t GetDimsFromIntrinsic(unsigned IID) {
- switch (IID) {
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
- return 3;
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
- return 4;
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
- return 5;
- default:
- llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
- }
-}
-
-void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
- bool IsIm2Col) {
- // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
- // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2}
- // multicast, cache_hint,
- // multicast_flag, cache_hint_flag, cta_group_flag}
- // NumOperands = {Chain, IID} + {Actual intrinsic args}
- // = {2} + {8 + dims + im2col_offsets}
- size_t NumOps = N->getNumOperands();
- size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
- : (NumOps - 10);
- // Offsets is always 'NumDims - 2' and only for im2col mode
- size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
- bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
- bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
- size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
- size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID
-
- unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
- if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
- report_fatal_error(
- formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",
- Subtarget->getSmVersion()));
-
- SDLoc DL(N);
- SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
-
- // Push MultiCast operand, if available
- if (IsMultiCast)
- Ops.push_back(N->getOperand(MultiCastIdx));
-
- // Push CacheHint operand, if available
- if (IsCacheHint)
- Ops.push_back(N->getOperand(MultiCastIdx + 1));
-
- // Flag for CTA Group
- Ops.push_back(getI32Imm(CTAGroupVal, DL));
-
- // Finally, the chain operand
- Ops.push_back(N->getOperand(0));
-
- bool IsShared32 =
- CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
- unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
- NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
- ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
-}
-
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
unsigned RedOp,
bool IsIm2Col) {
@@ -2175,18 +2058,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
switch (IID) {
default:
return false;
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
- SelectCpAsyncBulkTensorG2SCommon(N);
- return true;
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
- SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
- return true;
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index c912e70..1cb579b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -86,7 +86,6 @@ private:
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
- void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp,
bool IsIm2Col = false);
void SelectTcgen05Ld(SDNode *N, bool hasOffset = false);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index dfde0cc..b260221 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -139,7 +139,6 @@ def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
def hasTcgen05MMAScaleInputDImm : Predicate<"Subtarget->hasTcgen05MMAScaleInputDImm()">;
-def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">;
def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">;
class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index c923f0e..50827bd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -599,75 +599,15 @@ class TMA_IM2COL_UTIL<int dim, string mode> {
string base_str = !interleave(!foreach(i, !range(offsets), "$im2col" # i), ", ");
}
-// From Global to Shared memory (G2S)
-class G2S_STRINGS<int dim, string mode, bit mc, bit ch, bit is_shared32 = 0> {
- string prefix = "cp.async.bulk.tensor";
- string dir = "shared::cluster.global";
- string completion = "mbarrier::complete_tx::bytes";
- string inst_name = prefix
- # "." # dim # "d"
- # "." # dir
- # "." # mode
- # "." # completion
- # !if(mc, ".multicast::cluster", "")
- # !if(ch, ".L2::cache_hint", "");
- string intr_name = "CP_ASYNC_BULK_TENSOR_G2S_"
- # dim # "D"
- # !if(is_shared32, "_SHARED32", "")
- # !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
-}
-
def CTAGroupFlags : Operand<i32> {
let PrintMethod = "printCTAGroup";
}
-multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> {
- defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
- defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
- defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
- defvar rc = !if(is_shared32, B32, B64);
-
- defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0);
- defvar im2col_dag = !if(!eq(mode, "im2col"),
- !dag(ins, !listsplat(B16, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)),
- (ins));
- defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", ");
- defvar im2col_asm_str = ", {{" # im2col_str # "}}";
-
- defvar asm_str = !if(!eq(mode, "im2col"),
- !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
+def tma_cta_group_imm0 : TImmLeaf<i32, [{return Imm == 0;}]>;
+def tma_cta_group_imm_any : TImmLeaf<i32, [{return Imm >= 0;}]>;
- def "" : NVPTXInst<(outs),
- !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)),
- !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";")>,
- Requires<[hasPTX<80>, hasSM<90>]>;
- def _MC : NVPTXInst<(outs),
- !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
- (ins B16:$mc, CTAGroupFlags:$cg)),
- !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;")>,
- Requires<[hasPTX<80>, hasSM<90>]>;
- def _CH : NVPTXInst<(outs),
- !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
- (ins B64:$ch, CTAGroupFlags:$cg)),
- !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;")>,
- Requires<[hasPTX<80>, hasSM<90>]>;
- def _MC_CH : NVPTXInst<(outs),
- !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
- (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)),
- !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;")>,
- Requires<[hasPTX<80>, hasSM<90>]>;
-}
-
-foreach dim = [1, 2, 3, 4, 5] in {
- foreach shared32 = [true, false] in {
- foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
- defm G2S_STRINGS<dim, mode, 0, 0, shared32>.intr_name :
- CP_ASYNC_BULK_TENSOR_G2S_INTR<dim, shared32, mode>;
- }
- }
-}
-
-multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> {
+multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred,
+ TImmLeaf cta_group_type = tma_cta_group_imm_any> {
defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
@@ -697,10 +637,10 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []>
!setdagop(dims_dag, intr),
!setdagop(im2col_dag, intr),
(intr B16:$mc, B64:$ch));
- defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, timm:$cg));
- defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, timm:$cg));
- defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, timm:$cg));
- defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg));
+ defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, cta_group_type:$cg));
+ defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, cta_group_type:$cg));
+ defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, cta_group_type:$cg));
+ defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, cta_group_type:$cg));
def "" : NVPTXInst<(outs), ins_dag,
inst_name # asm_str # ";",
@@ -719,14 +659,30 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []>
[intr_dag_with_mc_ch]>,
Requires<pred>;
}
+
+foreach dim = 1...5 in {
+ defm TMA_G2S_TILE_CG0_ # dim # "D"
+ : TMA_TENSOR_G2S_INTR<dim, "tile", [hasPTX<80>, hasSM<90>],
+ tma_cta_group_imm0>;
+ defm TMA_G2S_TILE_ # dim # "D"
+ : TMA_TENSOR_G2S_INTR<dim, "tile",
+ [callSubtarget<"hasTMABlackwellSupport">]>;
+}
foreach dim = 3...5 in {
+ defm TMA_G2S_IM2COL_CG0_ # dim # "D"
+ : TMA_TENSOR_G2S_INTR<dim, "im2col", [hasPTX<80>, hasSM<90>],
+ tma_cta_group_imm0>;
+ defm TMA_G2S_IM2COL_ # dim # "D"
+ : TMA_TENSOR_G2S_INTR<dim, "im2col",
+ [callSubtarget<"hasTMABlackwellSupport">]>;
foreach mode = ["im2col_w", "im2col_w_128"] in {
defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D"
- : TMA_TENSOR_G2S_INTR<dim, mode, [hasTMACTAGroupSupport]>;
+ : TMA_TENSOR_G2S_INTR<dim, mode,
+ [callSubtarget<"hasTMABlackwellSupport">]>;
}
}
defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4",
- [hasTMACTAGroupSupport]>;
+ [callSubtarget<"hasTMABlackwellSupport">]>;
multiclass TMA_TENSOR_G2S_CTA_INTR<int dim, string mode, list<Predicate> pred = []> {
defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
@@ -784,7 +740,8 @@ foreach dim = 3...5 in {
: TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w", [hasPTX<86>, hasSM<100>]>;
defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D"
- : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", [hasTMACTAGroupSupport]>;
+ : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128",
+ [callSubtarget<"hasTMABlackwellSupport">]>;
}
defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4",
[hasPTX<86>, hasSM<100>]>;
@@ -835,7 +792,7 @@ foreach dim = 1...5 in {
}
}
defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4",
- [hasTMACTAGroupSupport]>;
+ [callSubtarget<"hasTMABlackwellSupport">]>;
def TMAReductionFlags : Operand<i32> {
let PrintMethod = "printTmaReductionMode";
@@ -930,11 +887,11 @@ foreach dim = 3...5 in {
foreach mode = ["im2col_w", "im2col_w_128"] in {
defvar suffix = !toupper(mode) # "_" # dim # "D";
defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode,
- [hasTMACTAGroupSupport]>;
+ [callSubtarget<"hasTMABlackwellSupport">]>;
}
}
defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4",
- [hasTMACTAGroupSupport]>;
+ [callSubtarget<"hasTMABlackwellSupport">]>;
//Prefetchu and Prefetch
@@ -1605,12 +1562,17 @@ def : Pat<(int_nvvm_saturate_d f64:$a), (CVT_f64_f64 $a, CvtSAT)>;
// Exp2 Log2
//
-def : Pat<(int_nvvm_ex2_approx_ftz_f f32:$a), (EX2_APPROX_f32 $a, FTZ)>;
-def : Pat<(int_nvvm_ex2_approx_f f32:$a), (EX2_APPROX_f32 $a, NoFTZ)>;
+def : Pat<(f32 (int_nvvm_ex2_approx_ftz f32:$a)), (EX2_APPROX_f32 $a, FTZ)>;
+def : Pat<(f32 (int_nvvm_ex2_approx f32:$a)), (EX2_APPROX_f32 $a, NoFTZ)>;
let Predicates = [hasPTX<70>, hasSM<75>] in {
- def : Pat<(int_nvvm_ex2_approx_f16 f16:$a), (EX2_APPROX_f16 $a)>;
- def : Pat<(int_nvvm_ex2_approx_f16x2 v2f16:$a), (EX2_APPROX_f16x2 $a)>;
+ def : Pat<(f16 (int_nvvm_ex2_approx f16:$a)), (EX2_APPROX_f16 $a)>;
+ def : Pat<(v2f16 (int_nvvm_ex2_approx v2f16:$a)), (EX2_APPROX_f16x2 $a)>;
+}
+
+let Predicates = [hasPTX<78>, hasSM<90>] in {
+ def : Pat<(bf16 (int_nvvm_ex2_approx_ftz bf16:$a)), (EX2_APPROX_bf16 $a)>;
+ def : Pat<(v2bf16 (int_nvvm_ex2_approx_ftz v2bf16:$a)), (EX2_APPROX_bf16x2 $a)>;
}
def LG2_APPROX_f32 :
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 194dbdc..021b1f6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -166,18 +166,15 @@ public:
// f32x2 instructions in Blackwell family
bool hasF32x2Instructions() const;
- // TMA G2S copy with cta_group::1/2 support
- bool hasCpAsyncBulkTensorCTAGroupSupport() const {
- // TODO: Update/tidy-up after the family-conditional support arrives
- switch (FullSmVersion) {
- case 1003:
- case 1013:
- return PTXVersion >= 86;
- case 1033:
- return PTXVersion >= 88;
- default:
- return false;
- }
+ // Checks support for following in TMA:
+ // - cta_group::1/2 support
+ // - im2col_w/w_128 mode support
+ // - tile_gather4 mode support
+ // - tile_scatter4 mode support
+ bool hasTMABlackwellSupport() const {
+ return hasPTXWithFamilySMs(90, {100, 110}) ||
+ hasPTXWithFamilySMs(88, {100, 101}) ||
+ hasPTXWithAccelSMs(86, {100, 101});
}
// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 729c077..64593e6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -318,7 +318,7 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC,
// answer. These include:
//
// - nvvm_cos_approx_{f,ftz_f}
- // - nvvm_ex2_approx_{d,f,ftz_f}
+ // - nvvm_ex2_approx(_ftz)
// - nvvm_lg2_approx_{d,f,ftz_f}
// - nvvm_sin_approx_{f,ftz_f}
// - nvvm_sqrt_approx_{f,ftz_f}
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index bcb3f50..780e124 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2702,7 +2702,7 @@ static bool isSpecialLLVMGlobalArrayToSkip(const GlobalVariable *GV) {
static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) {
return StringSwitch<bool>(GV->getName())
- .Cases("llvm.global_ctors", "llvm.global_dtors", true)
+ .Cases({"llvm.global_ctors", "llvm.global_dtors"}, true)
.Default(false);
}
diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index 3640d25..70df59d 100644
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -1316,7 +1316,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
// useless and possible to break some original well-form addressing mode
// to make this pre-inc prep for it.
if (PointerElementType->isIntegerTy(64)) {
- const SCEV *LSCEV = SE->getSCEVAtScope(const_cast<Value *>(PtrValue), L);
+ const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L);
const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV);
if (!LARSCEV || LARSCEV->getLoop() != L)
return false;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 2fba090..b04e887 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -912,7 +912,7 @@ bool PPCTTIImpl::areInlineCompatible(const Function *Caller,
bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
const Function *Callee,
- const ArrayRef<Type *> &Types) const {
+ ArrayRef<Type *> Types) const {
// We need to ensure that argument promotion does not
// attempt to promote pointers to MMA types (__vector_pair
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 475472a..8d7f255 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -147,7 +147,7 @@ public:
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const override;
bool areTypesABICompatible(const Function *Caller, const Function *Callee,
- const ArrayRef<Type *> &Types) const override;
+ ArrayRef<Type *> Types) const override;
bool supportsTailCallFor(const CallBase *CB) const override;
private:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 56881f7..e0cf739 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -19794,7 +19794,9 @@ legalizeScatterGatherIndexType(SDLoc DL, SDValue &Index,
// LLVM's legalization take care of the splitting.
// FIXME: LLVM can't split VP_GATHER or VP_SCATTER yet.
Index = DAG.getNode(ISD::SIGN_EXTEND, DL,
- IndexVT.changeVectorElementType(XLenVT), Index);
+ EVT::getVectorVT(*DAG.getContext(), XLenVT,
+ IndexVT.getVectorElementCount()),
+ Index);
}
IndexType = ISD::UNSIGNED_SCALED;
return true;
@@ -23944,7 +23946,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
.Case("{t0}", RISCV::X5)
.Case("{t1}", RISCV::X6)
.Case("{t2}", RISCV::X7)
- .Cases("{s0}", "{fp}", RISCV::X8)
+ .Cases({"{s0}", "{fp}"}, RISCV::X8)
.Case("{s1}", RISCV::X9)
.Case("{a0}", RISCV::X10)
.Case("{a1}", RISCV::X11)
@@ -23981,38 +23983,38 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// use the ABI names in register constraint lists.
if (Subtarget.hasStdExtF()) {
unsigned FReg = StringSwitch<unsigned>(Constraint.lower())
- .Cases("{f0}", "{ft0}", RISCV::F0_F)
- .Cases("{f1}", "{ft1}", RISCV::F1_F)
- .Cases("{f2}", "{ft2}", RISCV::F2_F)
- .Cases("{f3}", "{ft3}", RISCV::F3_F)
- .Cases("{f4}", "{ft4}", RISCV::F4_F)
- .Cases("{f5}", "{ft5}", RISCV::F5_F)
- .Cases("{f6}", "{ft6}", RISCV::F6_F)
- .Cases("{f7}", "{ft7}", RISCV::F7_F)
- .Cases("{f8}", "{fs0}", RISCV::F8_F)
- .Cases("{f9}", "{fs1}", RISCV::F9_F)
- .Cases("{f10}", "{fa0}", RISCV::F10_F)
- .Cases("{f11}", "{fa1}", RISCV::F11_F)
- .Cases("{f12}", "{fa2}", RISCV::F12_F)
- .Cases("{f13}", "{fa3}", RISCV::F13_F)
- .Cases("{f14}", "{fa4}", RISCV::F14_F)
- .Cases("{f15}", "{fa5}", RISCV::F15_F)
- .Cases("{f16}", "{fa6}", RISCV::F16_F)
- .Cases("{f17}", "{fa7}", RISCV::F17_F)
- .Cases("{f18}", "{fs2}", RISCV::F18_F)
- .Cases("{f19}", "{fs3}", RISCV::F19_F)
- .Cases("{f20}", "{fs4}", RISCV::F20_F)
- .Cases("{f21}", "{fs5}", RISCV::F21_F)
- .Cases("{f22}", "{fs6}", RISCV::F22_F)
- .Cases("{f23}", "{fs7}", RISCV::F23_F)
- .Cases("{f24}", "{fs8}", RISCV::F24_F)
- .Cases("{f25}", "{fs9}", RISCV::F25_F)
- .Cases("{f26}", "{fs10}", RISCV::F26_F)
- .Cases("{f27}", "{fs11}", RISCV::F27_F)
- .Cases("{f28}", "{ft8}", RISCV::F28_F)
- .Cases("{f29}", "{ft9}", RISCV::F29_F)
- .Cases("{f30}", "{ft10}", RISCV::F30_F)
- .Cases("{f31}", "{ft11}", RISCV::F31_F)
+ .Cases({"{f0}", "{ft0}"}, RISCV::F0_F)
+ .Cases({"{f1}", "{ft1}"}, RISCV::F1_F)
+ .Cases({"{f2}", "{ft2}"}, RISCV::F2_F)
+ .Cases({"{f3}", "{ft3}"}, RISCV::F3_F)
+ .Cases({"{f4}", "{ft4}"}, RISCV::F4_F)
+ .Cases({"{f5}", "{ft5}"}, RISCV::F5_F)
+ .Cases({"{f6}", "{ft6}"}, RISCV::F6_F)
+ .Cases({"{f7}", "{ft7}"}, RISCV::F7_F)
+ .Cases({"{f8}", "{fs0}"}, RISCV::F8_F)
+ .Cases({"{f9}", "{fs1}"}, RISCV::F9_F)
+ .Cases({"{f10}", "{fa0}"}, RISCV::F10_F)
+ .Cases({"{f11}", "{fa1}"}, RISCV::F11_F)
+ .Cases({"{f12}", "{fa2}"}, RISCV::F12_F)
+ .Cases({"{f13}", "{fa3}"}, RISCV::F13_F)
+ .Cases({"{f14}", "{fa4}"}, RISCV::F14_F)
+ .Cases({"{f15}", "{fa5}"}, RISCV::F15_F)
+ .Cases({"{f16}", "{fa6}"}, RISCV::F16_F)
+ .Cases({"{f17}", "{fa7}"}, RISCV::F17_F)
+ .Cases({"{f18}", "{fs2}"}, RISCV::F18_F)
+ .Cases({"{f19}", "{fs3}"}, RISCV::F19_F)
+ .Cases({"{f20}", "{fs4}"}, RISCV::F20_F)
+ .Cases({"{f21}", "{fs5}"}, RISCV::F21_F)
+ .Cases({"{f22}", "{fs6}"}, RISCV::F22_F)
+ .Cases({"{f23}", "{fs7}"}, RISCV::F23_F)
+ .Cases({"{f24}", "{fs8}"}, RISCV::F24_F)
+ .Cases({"{f25}", "{fs9}"}, RISCV::F25_F)
+ .Cases({"{f26}", "{fs10}"}, RISCV::F26_F)
+ .Cases({"{f27}", "{fs11}"}, RISCV::F27_F)
+ .Cases({"{f28}", "{ft8}"}, RISCV::F28_F)
+ .Cases({"{f29}", "{ft9}"}, RISCV::F29_F)
+ .Cases({"{f30}", "{ft10}"}, RISCV::F30_F)
+ .Cases({"{f31}", "{ft11}"}, RISCV::F31_F)
.Default(RISCV::NoRegister);
if (FReg != RISCV::NoRegister) {
assert(RISCV::F0_F <= FReg && FReg <= RISCV::F31_F && "Unknown fp-reg");
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 3a7013d..c9df787 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -869,7 +869,7 @@ std::optional<unsigned> getFoldedOpcode(MachineFunction &MF, MachineInstr &MI,
}
}
-// This is the version used during inline spilling
+// This is the version used during InlineSpiller::spillAroundUses
MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 7c89686..9cb53fb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -768,7 +768,7 @@ def BGE : BranchCC_rri<0b101, "bge">;
def BLTU : BranchCC_rri<0b110, "bltu">;
def BGEU : BranchCC_rri<0b111, "bgeu">;
-let IsSignExtendingOpW = 1 in {
+let IsSignExtendingOpW = 1, canFoldAsLoad = 1 in {
def LB : Load_ri<0b000, "lb">, Sched<[WriteLDB, ReadMemBase]>;
def LH : Load_ri<0b001, "lh">, Sched<[WriteLDH, ReadMemBase]>;
def LW : Load_ri<0b010, "lw">, Sched<[WriteLDW, ReadMemBase]>;
@@ -889,8 +889,10 @@ def CSRRCI : CSR_ii<0b111, "csrrci">;
/// RV64I instructions
let Predicates = [IsRV64] in {
+let canFoldAsLoad = 1 in {
def LWU : Load_ri<0b110, "lwu">, Sched<[WriteLDW, ReadMemBase]>;
def LD : Load_ri<0b011, "ld">, Sched<[WriteLDD, ReadMemBase]>;
+}
def SD : Store_rri<0b011, "sd">, Sched<[WriteSTD, ReadStoreData, ReadMemBase]>;
let IsSignExtendingOpW = 1 in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index afac37d..4ffe3e6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -71,6 +71,7 @@ defvar DExtsRV64 = [DExt, ZdinxExt];
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtD] in {
+let canFoldAsLoad = 1 in
def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>;
// Operands for stores are in the order srcreg, base, offset rather than
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
index 6571d99..b30f8ec 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -330,6 +330,7 @@ class PseudoFROUND<DAGOperand Ty, ValueType vt, ValueType intvt = XLenVT>
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtF] in {
+let canFoldAsLoad = 1 in
def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>;
// Operands for stores are in the order srcreg, base, offset rather than
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index e9f43b9..84bb294 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -438,18 +438,19 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II,
TypeSize VRegSize = OldLoc.getValue().divideCoefficientBy(NumRegs);
Register VLENB = 0;
- unsigned PreHandledNum = 0;
+ unsigned VLENBShift = 0;
+ unsigned PrevHandledNum = 0;
unsigned I = 0;
while (I != NumRegs) {
auto [LMulHandled, RegClass, Opcode] =
getSpillReloadInfo(NumRegs - I, RegEncoding, IsSpill);
auto [RegNumHandled, _] = RISCVVType::decodeVLMUL(LMulHandled);
bool IsLast = I + RegNumHandled == NumRegs;
- if (PreHandledNum) {
+ if (PrevHandledNum) {
Register Step;
// Optimize for constant VLEN.
if (auto VLEN = STI.getRealVLen()) {
- int64_t Offset = *VLEN / 8 * PreHandledNum;
+ int64_t Offset = *VLEN / 8 * PrevHandledNum;
Step = MRI.createVirtualRegister(&RISCV::GPRRegClass);
STI.getInstrInfo()->movImm(MBB, II, DL, Step, Offset);
} else {
@@ -457,15 +458,21 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II,
VLENB = MRI.createVirtualRegister(&RISCV::GPRRegClass);
BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VLENB);
}
- uint32_t ShiftAmount = Log2_32(PreHandledNum);
- if (ShiftAmount == 0)
- Step = VLENB;
- else {
- Step = MRI.createVirtualRegister(&RISCV::GPRRegClass);
- BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), Step)
- .addReg(VLENB, getKillRegState(IsLast))
- .addImm(ShiftAmount);
+ uint32_t ShiftAmount = Log2_32(PrevHandledNum);
+ // To avoid using an extra register, we shift the VLENB register and
+ // remember how much it has been shifted. We can then use relative
+ // shifts to adjust to the desired shift amount.
+ if (VLENBShift > ShiftAmount) {
+ BuildMI(MBB, II, DL, TII->get(RISCV::SRLI), VLENB)
+ .addReg(VLENB, RegState::Kill)
+ .addImm(VLENBShift - ShiftAmount);
+ } else if (VLENBShift < ShiftAmount) {
+ BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VLENB)
+ .addReg(VLENB, RegState::Kill)
+ .addImm(ShiftAmount - VLENBShift);
}
+ VLENBShift = ShiftAmount;
+ Step = VLENB;
}
BuildMI(MBB, II, DL, TII->get(RISCV::ADD), NewBase)
@@ -489,7 +496,7 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II,
if (IsSpill)
MIB.addReg(Reg, RegState::Implicit);
- PreHandledNum = RegNumHandled;
+ PrevHandledNum = RegNumHandled;
RegEncoding += RegNumHandled;
I += RegNumHandled;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 3fea21e..3f0424f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3151,6 +3151,14 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
return selectInsertElt(ResVReg, ResType, I);
case Intrinsic::spv_gep:
return selectGEP(ResVReg, ResType, I);
+ case Intrinsic::spv_bitcast: {
+ Register OpReg = I.getOperand(2).getReg();
+ SPIRVType *OpType =
+ OpReg.isValid() ? GR.getSPIRVTypeForVReg(OpReg) : nullptr;
+ if (!GR.isBitcastCompatible(ResType, OpType))
+ report_fatal_error("incompatible result and operand types in a bitcast");
+ return selectOpWithSrcs(ResVReg, ResType, I, {OpReg}, SPIRV::OpBitcast);
+ }
case Intrinsic::spv_unref_global:
case Intrinsic::spv_init_global: {
MachineInstr *MI = MRI->getVRegDef(I.getOperand(1).getReg());
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
index 6e444c9..65dffc7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
@@ -73,16 +73,23 @@ class SPIRVLegalizePointerCast : public FunctionPass {
// Returns the loaded value.
Value *loadVectorFromVector(IRBuilder<> &B, FixedVectorType *SourceType,
FixedVectorType *TargetType, Value *Source) {
- assert(TargetType->getNumElements() <= SourceType->getNumElements());
LoadInst *NewLoad = B.CreateLoad(SourceType, Source);
buildAssignType(B, SourceType, NewLoad);
Value *AssignValue = NewLoad;
if (TargetType->getElementType() != SourceType->getElementType()) {
+ const DataLayout &DL = B.GetInsertBlock()->getModule()->getDataLayout();
+ [[maybe_unused]] TypeSize TargetTypeSize =
+ DL.getTypeSizeInBits(TargetType);
+ [[maybe_unused]] TypeSize SourceTypeSize =
+ DL.getTypeSizeInBits(SourceType);
+ assert(TargetTypeSize == SourceTypeSize);
AssignValue = B.CreateIntrinsic(Intrinsic::spv_bitcast,
{TargetType, SourceType}, {NewLoad});
buildAssignType(B, TargetType, AssignValue);
+ return AssignValue;
}
+ assert(TargetType->getNumElements() < SourceType->getNumElements());
SmallVector<int> Mask(/* Size= */ TargetType->getNumElements());
for (unsigned I = 0; I < TargetType->getNumElements(); ++I)
Mask[I] = I;
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index f7cdfcb..db036a5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -613,8 +613,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI,
<< FinalFlags << "\n";
MachineInstr *OrigMINonConst = const_cast<MachineInstr *>(OrigMI);
MachineOperand &OrigFlagsOp = OrigMINonConst->getOperand(2);
- OrigFlagsOp =
- MachineOperand::CreateImm(static_cast<unsigned>(FinalFlags));
+ OrigFlagsOp = MachineOperand::CreateImm(FinalFlags);
return; // Merge done, so we found a duplicate; don't add it to MAI.MS
}
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index db6f2d6..d538009 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -192,31 +192,43 @@ static void buildOpBitcast(SPIRVGlobalRegistry *GR, MachineIRBuilder &MIB,
.addUse(OpReg);
}
-// We do instruction selections early instead of calling MIB.buildBitcast()
-// generating the general op code G_BITCAST. When MachineVerifier validates
-// G_BITCAST we see a check of a kind: if Source Type is equal to Destination
-// Type then report error "bitcast must change the type". This doesn't take into
-// account the notion of a typed pointer that is important for SPIR-V where a
-// user may and should use bitcast between pointers with different pointee types
-// (https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpBitcast).
-// It's important for correct lowering in SPIR-V, because interpretation of the
-// data type is not left to instructions that utilize the pointer, but encoded
-// by the pointer declaration, and the SPIRV target can and must handle the
-// declaration and use of pointers that specify the type of data they point to.
-// It's not feasible to improve validation of G_BITCAST using just information
-// provided by low level types of source and destination. Therefore we don't
-// produce G_BITCAST as the general op code with semantics different from
-// OpBitcast, but rather lower to OpBitcast immediately. As for now, the only
-// difference would be that CombinerHelper couldn't transform known patterns
-// around G_BUILD_VECTOR. See discussion
-// in https://github.com/llvm/llvm-project/pull/110270 for even more context.
-static void selectOpBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
- MachineIRBuilder MIB) {
+// We lower G_BITCAST to OpBitcast here to avoid a MachineVerifier error.
+// The verifier checks if the source and destination LLTs of a G_BITCAST are
+// different, but this check is too strict for SPIR-V's typed pointers, which
+// may have the same LLT but different SPIRVType (e.g. pointers to different
+// pointee types). By lowering to OpBitcast here, we bypass the verifier's
+// check. See discussion in https://github.com/llvm/llvm-project/pull/110270
+// for more context.
+//
+// We also handle the llvm.spv.bitcast intrinsic here. If the source and
+// destination SPIR-V types are the same, we lower it to a COPY to enable
+// further optimizations like copy propagation.
+static void lowerBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+ MachineIRBuilder MIB) {
SmallVector<MachineInstr *, 16> ToErase;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
+ if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
+ SPIRVType *DstType = GR->getSPIRVTypeForVReg(DstReg);
+ assert(
+ DstType &&
+ "Expected destination SPIR-V type to have been assigned already.");
+ SPIRVType *SrcType = GR->getSPIRVTypeForVReg(SrcReg);
+ assert(SrcType &&
+ "Expected source SPIR-V type to have been assigned already.");
+ if (DstType == SrcType) {
+ MIB.setInsertPt(*MI.getParent(), MI);
+ MIB.buildCopy(DstReg, SrcReg);
+ ToErase.push_back(&MI);
+ continue;
+ }
+ }
+
if (MI.getOpcode() != TargetOpcode::G_BITCAST)
continue;
+
MIB.setInsertPt(*MI.getParent(), MI);
buildOpBitcast(GR, MIB, MI.getOperand(0).getReg(),
MI.getOperand(1).getReg());
@@ -237,16 +249,11 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
SmallVector<MachineInstr *, 10> ToErase;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast) &&
- !isSpvIntrinsic(MI, Intrinsic::spv_ptrcast))
+ if (!isSpvIntrinsic(MI, Intrinsic::spv_ptrcast))
continue;
assert(MI.getOperand(2).isReg());
MIB.setInsertPt(*MI.getParent(), MI);
ToErase.push_back(&MI);
- if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) {
- MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg());
- continue;
- }
Register Def = MI.getOperand(0).getReg();
Register Source = MI.getOperand(2).getReg();
Type *ElemTy = getMDOperandAsType(MI.getOperand(3).getMetadata(), 0);
@@ -1089,7 +1096,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
removeImplicitFallthroughs(MF, MIB);
insertSpirvDecorations(MF, GR, MIB);
insertInlineAsm(MF, GR, ST, MIB);
- selectOpBitcasts(MF, GR, MIB);
+ lowerBitcasts(MF, GR, MIB);
return true;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index ff4d6469..ee575e3 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -207,8 +207,7 @@ template <> struct MappingTraits<WebAssemblyFunctionInfo> {
template <> struct CustomMappingTraits<BBNumberMap> {
static void inputOne(IO &YamlIO, StringRef Key,
BBNumberMap &SrcToUnwindDest) {
- YamlIO.mapRequired(Key.str().c_str(),
- SrcToUnwindDest[std::atoi(Key.str().c_str())]);
+ YamlIO.mapRequired(Key, SrcToUnwindDest[std::atoi(Key.str().c_str())]);
}
static void output(IO &YamlIO, BBNumberMap &SrcToUnwindDest) {
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 127ee67..bac3692 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1121,7 +1121,7 @@ private:
void setTypeInfo(AsmTypeInfo Type) { CurType = Type; }
};
- bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt,
+ bool Error(SMLoc L, const Twine &Msg, SMRange Range = {},
bool MatchingInlineAsm = false) {
MCAsmParser &Parser = getParser();
if (MatchingInlineAsm) {
@@ -2470,10 +2470,10 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
// Report back its kind, or IOK_INVALID if does not evaluated as a known one
unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) {
return StringSwitch<unsigned>(Name)
- .Cases("TYPE","type",IOK_TYPE)
- .Cases("SIZE","size",IOK_SIZE)
- .Cases("LENGTH","length",IOK_LENGTH)
- .Default(IOK_INVALID);
+ .Cases({"TYPE", "type"}, IOK_TYPE)
+ .Cases({"SIZE", "size"}, IOK_SIZE)
+ .Cases({"LENGTH", "length"}, IOK_LENGTH)
+ .Default(IOK_INVALID);
}
/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator
@@ -2516,8 +2516,8 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) {
unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) {
return StringSwitch<unsigned>(Name.lower())
.Case("type", MOK_TYPE)
- .Cases("size", "sizeof", MOK_SIZEOF)
- .Cases("length", "lengthof", MOK_LENGTHOF)
+ .Cases({"size", "sizeof"}, MOK_SIZEOF)
+ .Cases({"length", "lengthof"}, MOK_LENGTHOF)
.Default(MOK_INVALID);
}
@@ -2581,21 +2581,21 @@ bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) {
bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size,
StringRef *SizeStr) {
Size = StringSwitch<unsigned>(getTok().getString())
- .Cases("BYTE", "byte", 8)
- .Cases("WORD", "word", 16)
- .Cases("DWORD", "dword", 32)
- .Cases("FLOAT", "float", 32)
- .Cases("LONG", "long", 32)
- .Cases("FWORD", "fword", 48)
- .Cases("DOUBLE", "double", 64)
- .Cases("QWORD", "qword", 64)
- .Cases("MMWORD","mmword", 64)
- .Cases("XWORD", "xword", 80)
- .Cases("TBYTE", "tbyte", 80)
- .Cases("XMMWORD", "xmmword", 128)
- .Cases("YMMWORD", "ymmword", 256)
- .Cases("ZMMWORD", "zmmword", 512)
- .Default(0);
+ .Cases({"BYTE", "byte"}, 8)
+ .Cases({"WORD", "word"}, 16)
+ .Cases({"DWORD", "dword"}, 32)
+ .Cases({"FLOAT", "float"}, 32)
+ .Cases({"LONG", "long"}, 32)
+ .Cases({"FWORD", "fword"}, 48)
+ .Cases({"DOUBLE", "double"}, 64)
+ .Cases({"QWORD", "qword"}, 64)
+ .Cases({"MMWORD", "mmword"}, 64)
+ .Cases({"XWORD", "xword"}, 80)
+ .Cases({"TBYTE", "tbyte"}, 80)
+ .Cases({"XMMWORD", "xmmword"}, 128)
+ .Cases({"YMMWORD", "ymmword"}, 256)
+ .Cases({"ZMMWORD", "zmmword"}, 512)
+ .Default(0);
if (Size) {
if (SizeStr)
*SizeStr = getTok().getString();
@@ -2886,22 +2886,22 @@ bool X86AsmParser::parseATTOperand(OperandVector &Operands) {
// otherwise the EFLAGS Condition Code enumerator.
X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) {
return StringSwitch<X86::CondCode>(CC)
- .Case("o", X86::COND_O) // Overflow
- .Case("no", X86::COND_NO) // No Overflow
- .Cases("b", "nae", X86::COND_B) // Below/Neither Above nor Equal
- .Cases("ae", "nb", X86::COND_AE) // Above or Equal/Not Below
- .Cases("e", "z", X86::COND_E) // Equal/Zero
- .Cases("ne", "nz", X86::COND_NE) // Not Equal/Not Zero
- .Cases("be", "na", X86::COND_BE) // Below or Equal/Not Above
- .Cases("a", "nbe", X86::COND_A) // Above/Neither Below nor Equal
- .Case("s", X86::COND_S) // Sign
- .Case("ns", X86::COND_NS) // No Sign
- .Cases("p", "pe", X86::COND_P) // Parity/Parity Even
- .Cases("np", "po", X86::COND_NP) // No Parity/Parity Odd
- .Cases("l", "nge", X86::COND_L) // Less/Neither Greater nor Equal
- .Cases("ge", "nl", X86::COND_GE) // Greater or Equal/Not Less
- .Cases("le", "ng", X86::COND_LE) // Less or Equal/Not Greater
- .Cases("g", "nle", X86::COND_G) // Greater/Neither Less nor Equal
+ .Case("o", X86::COND_O) // Overflow
+ .Case("no", X86::COND_NO) // No Overflow
+ .Cases({"b", "nae"}, X86::COND_B) // Below/Neither Above nor Equal
+ .Cases({"ae", "nb"}, X86::COND_AE) // Above or Equal/Not Below
+ .Cases({"e", "z"}, X86::COND_E) // Equal/Zero
+ .Cases({"ne", "nz"}, X86::COND_NE) // Not Equal/Not Zero
+ .Cases({"be", "na"}, X86::COND_BE) // Below or Equal/Not Above
+ .Cases({"a", "nbe"}, X86::COND_A) // Above/Neither Below nor Equal
+ .Case("s", X86::COND_S) // Sign
+ .Case("ns", X86::COND_NS) // No Sign
+ .Cases({"p", "pe"}, X86::COND_P) // Parity/Parity Even
+ .Cases({"np", "po"}, X86::COND_NP) // No Parity/Parity Odd
+ .Cases({"l", "nge"}, X86::COND_L) // Less/Neither Greater nor Equal
+ .Cases({"ge", "nl"}, X86::COND_GE) // Greater or Equal/Not Less
+ .Cases({"le", "ng"}, X86::COND_LE) // Less or Equal/Not Greater
+ .Cases({"g", "nle"}, X86::COND_G) // Greater/Neither Less nor Equal
.Default(X86::COND_INVALID);
}
@@ -4322,7 +4322,7 @@ bool X86AsmParser::matchAndEmitATTInstruction(
SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands,
MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) {
X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
- SMRange EmptyRange = std::nullopt;
+ SMRange EmptyRange;
// In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode
// when matching the instruction.
if (ForcedDataPrefix == X86::Is32Bit)
@@ -4548,7 +4548,7 @@ bool X86AsmParser::matchAndEmitIntelInstruction(
SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands,
MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) {
X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
- SMRange EmptyRange = std::nullopt;
+ SMRange EmptyRange;
// Find one unsized memory operand, if present.
X86Operand *UnsizedMemOp = nullptr;
for (const auto &Op : Operands) {
diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h
index 89ac53e..a922725 100644
--- a/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -620,37 +620,6 @@ struct X86Operand final : public MCParsedAsmOperand {
Inst.addOperand(MCOperand::createReg(Reg));
}
- bool isTILEPair() const {
- return Kind == Register &&
- X86MCRegisterClasses[X86::TILERegClassID].contains(getReg());
- }
-
- void addTILEPairOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- MCRegister Reg = getReg();
- switch (Reg.id()) {
- default:
- llvm_unreachable("Invalid tile register!");
- case X86::TMM0:
- case X86::TMM1:
- Reg = X86::TMM0_TMM1;
- break;
- case X86::TMM2:
- case X86::TMM3:
- Reg = X86::TMM2_TMM3;
- break;
- case X86::TMM4:
- case X86::TMM5:
- Reg = X86::TMM4_TMM5;
- break;
- case X86::TMM6:
- case X86::TMM7:
- Reg = X86::TMM6_TMM7;
- break;
- }
- Inst.addOperand(MCOperand::createReg(Reg));
- }
-
void addMemOperands(MCInst &Inst, unsigned N) const {
assert((N == 5) && "Invalid number of operands!");
if (getMemBaseReg())
diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 4927b45..7d2b5eb 100644
--- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -810,10 +810,6 @@ static int readModRM(struct InternalInstruction *insn) {
if (index > 7) \
*valid = 0; \
return prefix##_TMM0 + index; \
- case TYPE_TMM_PAIR: \
- if (index > 7) \
- *valid = 0; \
- return prefix##_TMM0_TMM1 + (index / 2); \
case TYPE_VK: \
index &= 0xf; \
if (index > 7) \
@@ -2323,7 +2319,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_YMM:
case TYPE_ZMM:
case TYPE_TMM:
- case TYPE_TMM_PAIR:
case TYPE_VK_PAIR:
case TYPE_VK:
case TYPE_DEBUGREG:
diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index dc9af2c..b0aa70b 100644
--- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -535,12 +535,6 @@ namespace X86Disassembler {
ENTRY(TMM6) \
ENTRY(TMM7)
-#define REGS_TMM_PAIRS \
- ENTRY(TMM0_TMM1) \
- ENTRY(TMM2_TMM3) \
- ENTRY(TMM4_TMM5) \
- ENTRY(TMM6_TMM7)
-
#define ALL_EA_BASES \
EA_BASES_16BIT \
EA_BASES_32BIT \
@@ -565,7 +559,6 @@ namespace X86Disassembler {
REGS_DEBUG \
REGS_CONTROL \
REGS_TMM \
- REGS_TMM_PAIRS \
ENTRY(RIP)
/// All possible values of the base field for effective-address
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index 1c5f166..759d95e 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -467,22 +467,3 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo,
}
llvm_unreachable("Unknown mask pair register name");
}
-
-void X86InstPrinterCommon::printTILEPair(const MCInst *MI, unsigned OpNo,
- raw_ostream &OS) {
- switch (MI->getOperand(OpNo).getReg()) {
- case X86::TMM0_TMM1:
- printRegName(OS, X86::TMM0);
- return;
- case X86::TMM2_TMM3:
- printRegName(OS, X86::TMM2);
- return;
- case X86::TMM4_TMM5:
- printRegName(OS, X86::TMM4);
- return;
- case X86::TMM6_TMM7:
- printRegName(OS, X86::TMM6);
- return;
- }
- llvm_unreachable("Unknown mask pair register name");
-}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
index 2c9467c..cb55f2f 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
@@ -40,7 +40,6 @@ protected:
const MCSubtargetInfo &STI);
void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
- void printTILEPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
};
} // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index a1fd366..9e291a6 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -274,9 +274,6 @@ def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true",
def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true",
"Support AMX-MOVRS instructions",
[FeatureAMXTILE]>;
-def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true",
- "Support AMX amx-transpose instructions",
- [FeatureAMXTILE]>;
def FeatureAMXAVX512 : SubtargetFeature<"amx-avx512",
"HasAMXAVX512", "true",
"Support AMX-AVX512 instructions",
@@ -1177,8 +1174,7 @@ def ProcessorFeatures {
FeatureAMXMOVRS,
FeatureAMXAVX512,
FeatureAMXFP8,
- FeatureAMXTF32,
- FeatureAMXTRANSPOSE];
+ FeatureAMXTF32];
list<SubtargetFeature> DMRFeatures =
!listconcat(GNRDFeatures, DMRAdditionalFeatures);
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 4a9b824..e3c44c0 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -649,149 +649,6 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.setDesc(TII->get(Opc));
return true;
}
- // TILEPAIRLOAD is just for TILEPair spill, we don't have corresponding
- // AMX instruction to support it. So, split it to 2 load instructions:
- // "TILEPAIRLOAD TMM0:TMM1, Base, Scale, Index, Offset, Segment" -->
- // "TILELOAD TMM0, Base, Scale, Index, Offset, Segment" +
- // "TILELOAD TMM1, Base, Scale, Index, Offset + TMM_SIZE, Segment"
- case X86::PTILEPAIRLOAD: {
- int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm();
- Register TReg = MBBI->getOperand(0).getReg();
- bool DstIsDead = MBBI->getOperand(0).isDead();
- Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0);
- Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1);
- unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8;
-
- MachineInstrBuilder MIBLo =
- BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD))
- .addReg(TReg0, RegState::Define | getDeadRegState(DstIsDead));
- MachineInstrBuilder MIBHi =
- BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD))
- .addReg(TReg1, RegState::Define | getDeadRegState(DstIsDead));
-
- for (int i = 0; i < X86::AddrNumOperands; ++i) {
- MIBLo.add(MBBI->getOperand(1 + i));
- if (i == X86::AddrDisp)
- MIBHi.addImm(Disp + TmmSize);
- else
- MIBHi.add(MBBI->getOperand(1 + i));
- }
-
- // Make sure the first stride reg used in first tileload is alive.
- MachineOperand &Stride =
- MIBLo.getInstr()->getOperand(1 + X86::AddrIndexReg);
- Stride.setIsKill(false);
-
- // Split the memory operand, adjusting the offset and size for the halves.
- MachineMemOperand *OldMMO = MBBI->memoperands().front();
- MachineFunction *MF = MBB.getParent();
- MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize);
- MachineMemOperand *MMOHi =
- MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize);
-
- MIBLo.setMemRefs(MMOLo);
- MIBHi.setMemRefs(MMOHi);
-
- // Delete the pseudo.
- MBB.erase(MBBI);
- return true;
- }
- // Similar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no
- // corresponding AMX instruction to support it. So, split it too:
- // "TILEPAIRSTORE Base, Scale, Index, Offset, Segment, TMM0:TMM1" -->
- // "TILESTORE Base, Scale, Index, Offset, Segment, TMM0" +
- // "TILESTORE Base, Scale, Index, Offset + TMM_SIZE, Segment, TMM1"
- case X86::PTILEPAIRSTORE: {
- int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm();
- Register TReg = MBBI->getOperand(X86::AddrNumOperands).getReg();
- bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill();
- Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0);
- Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1);
- unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8;
-
- MachineInstrBuilder MIBLo =
- BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED));
- MachineInstrBuilder MIBHi =
- BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED));
-
- for (int i = 0; i < X86::AddrNumOperands; ++i) {
- MIBLo.add(MBBI->getOperand(i));
- if (i == X86::AddrDisp)
- MIBHi.addImm(Disp + TmmSize);
- else
- MIBHi.add(MBBI->getOperand(i));
- }
- MIBLo.addReg(TReg0, getKillRegState(SrcIsKill));
- MIBHi.addReg(TReg1, getKillRegState(SrcIsKill));
-
- // Make sure the first stride reg used in first tilestore is alive.
- MachineOperand &Stride = MIBLo.getInstr()->getOperand(X86::AddrIndexReg);
- Stride.setIsKill(false);
-
- // Split the memory operand, adjusting the offset and size for the halves.
- MachineMemOperand *OldMMO = MBBI->memoperands().front();
- MachineFunction *MF = MBB.getParent();
- MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize);
- MachineMemOperand *MMOHi =
- MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize);
-
- MIBLo.setMemRefs(MMOLo);
- MIBHi.setMemRefs(MMOHi);
-
- // Delete the pseudo.
- MBB.erase(MBBI);
- return true;
- }
- case X86::PT2RPNTLVWZ0V:
- case X86::PT2RPNTLVWZ0T1V:
- case X86::PT2RPNTLVWZ1V:
- case X86::PT2RPNTLVWZ1T1V:
- case X86::PT2RPNTLVWZ0RSV:
- case X86::PT2RPNTLVWZ0RST1V:
- case X86::PT2RPNTLVWZ1RSV:
- case X86::PT2RPNTLVWZ1RST1V: {
- for (unsigned i = 3; i > 0; --i)
- MI.removeOperand(i);
- unsigned Opc;
- switch (Opcode) {
- case X86::PT2RPNTLVWZ0V:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
- break;
- case X86::PT2RPNTLVWZ0T1V:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
- break;
- case X86::PT2RPNTLVWZ1V:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
- break;
- case X86::PT2RPNTLVWZ1T1V:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
- break;
- case X86::PT2RPNTLVWZ0RSV:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
- break;
- case X86::PT2RPNTLVWZ0RST1V:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
- break;
- case X86::PT2RPNTLVWZ1RSV:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
- break;
- case X86::PT2RPNTLVWZ1RST1V:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
- break;
- default:
- llvm_unreachable("Impossible Opcode!");
- }
- MI.setDesc(TII->get(Opc));
- return true;
- }
- case X86::PTTRANSPOSEDV:
- case X86::PTCONJTFP16V: {
- for (int i = 2; i > 0; --i)
- MI.removeOperand(i);
- MI.setDesc(TII->get(Opcode == X86::PTTRANSPOSEDV ? X86::TTRANSPOSED
- : X86::TCONJTFP16));
- return true;
- }
case X86::PTCMMIMFP16PSV:
case X86::PTCMMRLFP16PSV:
case X86::PTDPBSSDV:
@@ -800,13 +657,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::PTDPBUUDV:
case X86::PTDPBF16PSV:
case X86::PTDPFP16PSV:
- case X86::PTTDPBF16PSV:
- case X86::PTTDPFP16PSV:
- case X86::PTTCMMIMFP16PSV:
- case X86::PTTCMMRLFP16PSV:
- case X86::PTCONJTCMMIMFP16PSV:
case X86::PTMMULTF32PSV:
- case X86::PTTMMULTF32PSV:
case X86::PTDPBF8PSV:
case X86::PTDPBHF8PSV:
case X86::PTDPHBF8PSV:
@@ -816,6 +667,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.removeOperand(i);
unsigned Opc;
switch (Opcode) {
+ // clang-format off
case X86::PTCMMIMFP16PSV: Opc = X86::TCMMIMFP16PS; break;
case X86::PTCMMRLFP16PSV: Opc = X86::TCMMRLFP16PS; break;
case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break;
@@ -824,40 +676,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::PTDPBUUDV: Opc = X86::TDPBUUD; break;
case X86::PTDPBF16PSV: Opc = X86::TDPBF16PS; break;
case X86::PTDPFP16PSV: Opc = X86::TDPFP16PS; break;
- case X86::PTTDPBF16PSV:
- Opc = X86::TTDPBF16PS;
- break;
- case X86::PTTDPFP16PSV:
- Opc = X86::TTDPFP16PS;
- break;
- case X86::PTTCMMIMFP16PSV:
- Opc = X86::TTCMMIMFP16PS;
- break;
- case X86::PTTCMMRLFP16PSV:
- Opc = X86::TTCMMRLFP16PS;
- break;
- case X86::PTCONJTCMMIMFP16PSV:
- Opc = X86::TCONJTCMMIMFP16PS;
- break;
- case X86::PTMMULTF32PSV:
- Opc = X86::TMMULTF32PS;
- break;
- case X86::PTTMMULTF32PSV:
- Opc = X86::TTMMULTF32PS;
- break;
- case X86::PTDPBF8PSV:
- Opc = X86::TDPBF8PS;
- break;
- case X86::PTDPBHF8PSV:
- Opc = X86::TDPBHF8PS;
- break;
- case X86::PTDPHBF8PSV:
- Opc = X86::TDPHBF8PS;
- break;
- case X86::PTDPHF8PSV:
- Opc = X86::TDPHF8PS;
- break;
-
+ case X86::PTMMULTF32PSV: Opc = X86::TMMULTF32PS; break;
+ case X86::PTDPBF8PSV: Opc = X86::TDPBF8PS; break;
+ case X86::PTDPBHF8PSV: Opc = X86::TDPBHF8PS; break;
+ case X86::PTDPHBF8PSV: Opc = X86::TDPHBF8PS; break;
+ case X86::PTDPHF8PSV: Opc = X86::TDPHF8PS; break;
+ // clang-format on
default:
llvm_unreachable("Unexpected Opcode");
}
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index 787b71d..06f729a 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -267,24 +267,16 @@ void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI,
<< printReg(TileReg, TRI) << '\n');
}
-static unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) {
- if (Reg.isVirtual()) {
- unsigned RegClassID = MRI->getRegClass(Reg)->getID();
- if (RegClassID == X86::TILERegClassID)
- return 1;
- if (RegClassID == X86::TILEPAIRRegClassID)
- return 2;
- } else {
- if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
- return 1;
- if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
- return 2;
+static bool isTileRegister(MachineRegisterInfo *MRI, Register Reg) {
+ if (Reg.isVirtual() &&
+ (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)) {
+ return true;
}
- return 0;
-}
-static bool isTileRegister(MachineRegisterInfo *MRI, Register VirtReg) {
- return getTileDefNum(MRI, VirtReg) > 0;
+ if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
+ return true;
+
+ return false;
}
static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
@@ -296,7 +288,7 @@ static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
if (!MO.isReg())
return false;
- return getTileDefNum(MRI, MO.getReg()) > 0;
+ return isTileRegister(MRI, MO.getReg());
}
static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) {
@@ -636,19 +628,7 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
else if (dominates(MBB, LastShapeMI, ColMI))
LastShapeMI = ColMI;
}
- unsigned TileDefNum = getTileDefNum(MRI, MI.getOperand(0).getReg());
- if (TileDefNum > 1) {
- for (unsigned I = 1; I < TileDefNum; I++) {
- MachineOperand *ColxMO = &MI.getOperand(2 + I);
- MachineInstr *ColxMI = MRI->getVRegDef(ColxMO->getReg());
- if (ColxMI->getParent() == &MBB) {
- if (!LastShapeMI)
- LastShapeMI = ColxMI;
- else if (dominates(MBB, LastShapeMI, ColxMI))
- LastShapeMI = ColxMI;
- }
- }
- }
+
// If there is user live out of the tilecfg, spill it and reload in
// before the user.
Register TileReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp
index 11d331b..d86ae36 100644
--- a/llvm/lib/Target/X86/X86FastTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp
@@ -77,14 +77,14 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
"Fast Tile Register Configure", false, false)
-static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) {
+static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
// There is no phi instruction after register allocation.
assert(MI.isPHI() == false);
// The instruction must have 3 operands: tile def, row, col.
// It should be AMX pseudo instruction that have shape operand.
if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 ||
!MI.isPseudo())
- return 0;
+ return false;
MachineOperand &MO = MI.getOperand(0);
if (MO.isReg()) {
@@ -93,24 +93,18 @@ static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) {
// register is not rewritten yet.
if (Reg.isVirtual()) {
if (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
- return 1;
- if (MRI->getRegClass(Reg)->getID() == X86::TILEPAIRRegClassID)
- return 2;
+ return true;
}
if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
- return 1;
- if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
- return 2;
+ return true;
}
- return 0;
+ return false;
}
static unsigned getTMMIndex(Register Reg) {
if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
return Reg - X86::TMM0;
- if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
- return (Reg - X86::TMM0_TMM1) * 2;
llvm_unreachable("Invalid Tmm Reg!");
}
@@ -120,17 +114,14 @@ bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
bool Change = false;
SmallVector<std::pair<unsigned, ShapeT>, 6> ShapeInfos;
for (MachineInstr &MI : reverse(MBB)) {
- unsigned DefNum = getNumDefTiles(MRI, MI);
- if (DefNum == 0 && MI.getOpcode() != X86::PLDTILECFGV)
+ if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV)
continue;
// AMX instructions that define tile register.
if (MI.getOpcode() != X86::PLDTILECFGV) {
MachineOperand &Row = MI.getOperand(1);
unsigned TMMIdx = getTMMIndex(MI.getOperand(0).getReg());
- for (unsigned I = 0; I < DefNum; I++) {
- MachineOperand &Col = MI.getOperand(2 + I);
- ShapeInfos.push_back({TMMIdx + I, ShapeT(&Row, &Col)});
- }
+ MachineOperand &Col = MI.getOperand(2);
+ ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)});
} else { // PLDTILECFGV
// Rewrite the shape information to memory. Stack slot should have
// been initialized to zero in pre config.
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 4393f6e..d4418c8 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -337,23 +337,8 @@ namespace {
// lowering but before ISEL.
bool isAMXSDNode(SDNode *N) const {
// Check if N is AMX SDNode:
- // 1. check specific opcode since these carry MVT::Untyped instead of
- // x86amx_type;
- // 2. check result type;
- // 3. check operand type;
- switch (N->getOpcode()) {
- default:
- break;
- case X86::PT2RPNTLVWZ0V:
- case X86::PT2RPNTLVWZ0T1V:
- case X86::PT2RPNTLVWZ1V:
- case X86::PT2RPNTLVWZ1T1V:
- case X86::PT2RPNTLVWZ0RSV:
- case X86::PT2RPNTLVWZ0RST1V:
- case X86::PT2RPNTLVWZ1RSV:
- case X86::PT2RPNTLVWZ1RST1V:
- return true;
- }
+ // 1. check result type;
+ // 2. check operand type;
for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
if (N->getValueType(Idx) == MVT::x86amx)
return true;
@@ -5398,65 +5383,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, CNode);
return;
}
- case Intrinsic::x86_t2rpntlvwz0rs:
- case Intrinsic::x86_t2rpntlvwz0rst1:
- case Intrinsic::x86_t2rpntlvwz1rs:
- case Intrinsic::x86_t2rpntlvwz1rst1:
- if (!Subtarget->hasAMXMOVRS())
- break;
- [[fallthrough]];
- case Intrinsic::x86_t2rpntlvwz0:
- case Intrinsic::x86_t2rpntlvwz0t1:
- case Intrinsic::x86_t2rpntlvwz1:
- case Intrinsic::x86_t2rpntlvwz1t1: {
- if (!Subtarget->hasAMXTRANSPOSE())
- break;
- auto *MFI =
- CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
- MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
- unsigned Opc;
- switch (IntNo) {
- default:
- llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::x86_t2rpntlvwz0:
- Opc = X86::PT2RPNTLVWZ0;
- break;
- case Intrinsic::x86_t2rpntlvwz0t1:
- Opc = X86::PT2RPNTLVWZ0T1;
- break;
- case Intrinsic::x86_t2rpntlvwz1:
- Opc = X86::PT2RPNTLVWZ1;
- break;
- case Intrinsic::x86_t2rpntlvwz1t1:
- Opc = X86::PT2RPNTLVWZ1T1;
- break;
- case Intrinsic::x86_t2rpntlvwz0rs:
- Opc = X86::PT2RPNTLVWZ0RS;
- break;
- case Intrinsic::x86_t2rpntlvwz0rst1:
- Opc = X86::PT2RPNTLVWZ0RST1;
- break;
- case Intrinsic::x86_t2rpntlvwz1rs:
- Opc = X86::PT2RPNTLVWZ1RS;
- break;
- case Intrinsic::x86_t2rpntlvwz1rst1:
- Opc = X86::PT2RPNTLVWZ1RST1;
- break;
- }
- // FIXME: Match displacement and scale.
- unsigned TIndex = Node->getConstantOperandVal(2);
- SDValue TReg = getI8Imm(TIndex, dl);
- SDValue Base = Node->getOperand(3);
- SDValue Scale = getI8Imm(1, dl);
- SDValue Index = Node->getOperand(4);
- SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = CurDAG->getRegister(0, MVT::i16);
- SDValue Chain = Node->getOperand(0);
- SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
- MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
- ReplaceNode(Node, CNode);
- return;
- }
}
break;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 49beada..133406b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22861,6 +22861,13 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
if (!OpVT.isScalarInteger() || OpSize < 128)
return SDValue();
+ // Don't do this if we're not supposed to use the FPU.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (Subtarget.useSoftFloat() || NoImplicitFloatOps)
+ return SDValue();
+
// Ignore a comparison with zero because that gets special treatment in
// EmitTest(). But make an exception for the special case of a pair of
// logically-combined vector-sized operands compared to zero. This pattern may
@@ -22883,13 +22890,9 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
// Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
// Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
- bool NoImplicitFloatOps =
- DAG.getMachineFunction().getFunction().hasFnAttribute(
- Attribute::NoImplicitFloat);
- if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- ((OpSize == 128 && Subtarget.hasSSE2()) ||
- (OpSize == 256 && Subtarget.hasAVX()) ||
- (OpSize == 512 && Subtarget.useAVX512Regs()))) {
+ if ((OpSize == 128 && Subtarget.hasSSE2()) ||
+ (OpSize == 256 && Subtarget.hasAVX()) ||
+ (OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
// PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
@@ -27946,67 +27949,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
- case Intrinsic::x86_t2rpntlvwz0rs_internal:
- case Intrinsic::x86_t2rpntlvwz0rst1_internal:
- case Intrinsic::x86_t2rpntlvwz1rs_internal:
- case Intrinsic::x86_t2rpntlvwz1rst1_internal:
- case Intrinsic::x86_t2rpntlvwz0_internal:
- case Intrinsic::x86_t2rpntlvwz0t1_internal:
- case Intrinsic::x86_t2rpntlvwz1_internal:
- case Intrinsic::x86_t2rpntlvwz1t1_internal: {
- auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
- X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
- unsigned IntNo = Op.getConstantOperandVal(1);
- unsigned Opc = 0;
- switch (IntNo) {
- default:
- llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::x86_t2rpntlvwz0_internal:
- Opc = X86::PT2RPNTLVWZ0V;
- break;
- case Intrinsic::x86_t2rpntlvwz0t1_internal:
- Opc = X86::PT2RPNTLVWZ0T1V;
- break;
- case Intrinsic::x86_t2rpntlvwz1_internal:
- Opc = X86::PT2RPNTLVWZ1V;
- break;
- case Intrinsic::x86_t2rpntlvwz1t1_internal:
- Opc = X86::PT2RPNTLVWZ1T1V;
- break;
- case Intrinsic::x86_t2rpntlvwz0rs_internal:
- Opc = X86::PT2RPNTLVWZ0RSV;
- break;
- case Intrinsic::x86_t2rpntlvwz0rst1_internal:
- Opc = X86::PT2RPNTLVWZ0RST1V;
- break;
- case Intrinsic::x86_t2rpntlvwz1rs_internal:
- Opc = X86::PT2RPNTLVWZ1RSV;
- break;
- case Intrinsic::x86_t2rpntlvwz1rst1_internal:
- Opc = X86::PT2RPNTLVWZ1RST1V;
- break;
- }
-
- SDLoc DL(Op);
- SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
-
- SDValue Ops[] = {Op.getOperand(2), // Row
- Op.getOperand(3), // Col0
- Op.getOperand(4), // Col1
- Op.getOperand(5), // Base
- DAG.getTargetConstant(1, DL, MVT::i8), // Scale
- Op.getOperand(6), // Index
- DAG.getTargetConstant(0, DL, MVT::i32), // Disp
- DAG.getRegister(0, MVT::i16), // Segment
- Op.getOperand(0)}; // Chain
-
- MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
- SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
- SDValue(Res, 0));
- SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
- SDValue(Res, 0));
- return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
- }
case Intrinsic::x86_atomic_bts_rm:
case Intrinsic::x86_atomic_btc_rm:
case Intrinsic::x86_atomic_btr_rm: {
@@ -37745,10 +37687,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
assert (Imm < 8 && "Illegal tmm index");
return X86::TMM0 + Imm;
};
- auto TMMImmToTMMPair = [](unsigned Imm) {
- assert(Imm < 8 && "Illegal tmm pair index.");
- return X86::TMM0_TMM1 + Imm / 2;
- };
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected instr type to insert");
@@ -38129,53 +38067,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::PTDPBHF8PS:
case X86::PTDPHBF8PS:
case X86::PTDPHF8PS:
- case X86::PTTDPBF16PS:
- case X86::PTTDPFP16PS:
- case X86::PTTCMMIMFP16PS:
- case X86::PTTCMMRLFP16PS:
- case X86::PTCONJTCMMIMFP16PS:
- case X86::PTMMULTF32PS:
- case X86::PTTMMULTF32PS: {
+ case X86::PTMMULTF32PS: {
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
+ // clang-format off
case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
- case X86::PTCMMIMFP16PS:
- Opc = X86::TCMMIMFP16PS;
- break;
- case X86::PTCMMRLFP16PS:
- Opc = X86::TCMMRLFP16PS;
- break;
+ case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
+ case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
- case X86::PTTDPBF16PS:
- Opc = X86::TTDPBF16PS;
- break;
- case X86::PTTDPFP16PS:
- Opc = X86::TTDPFP16PS;
- break;
- case X86::PTTCMMIMFP16PS:
- Opc = X86::TTCMMIMFP16PS;
- break;
- case X86::PTTCMMRLFP16PS:
- Opc = X86::TTCMMRLFP16PS;
- break;
- case X86::PTCONJTCMMIMFP16PS:
- Opc = X86::TCONJTCMMIMFP16PS;
- break;
- case X86::PTMMULTF32PS:
- Opc = X86::TMMULTF32PS;
- break;
- case X86::PTTMMULTF32PS:
- Opc = X86::TTMMULTF32PS;
- break;
+ case X86::PTMMULTF32PS: Opc = X86::TMMULTF32PS; break;
+ // clang-format on
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
@@ -38246,70 +38156,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
- case X86::PT2RPNTLVWZ0:
- case X86::PT2RPNTLVWZ0T1:
- case X86::PT2RPNTLVWZ1:
- case X86::PT2RPNTLVWZ1T1:
- case X86::PT2RPNTLVWZ0RS:
- case X86::PT2RPNTLVWZ0RST1:
- case X86::PT2RPNTLVWZ1RS:
- case X86::PT2RPNTLVWZ1RST1: {
- const DebugLoc &DL = MI.getDebugLoc();
- unsigned Opc;
-#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
- switch (MI.getOpcode()) {
- default:
- llvm_unreachable("Unexpected instruction!");
- case X86::PT2RPNTLVWZ0:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
- break;
- case X86::PT2RPNTLVWZ0T1:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
- break;
- case X86::PT2RPNTLVWZ1:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
- break;
- case X86::PT2RPNTLVWZ1T1:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
- break;
- case X86::PT2RPNTLVWZ0RS:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
- break;
- case X86::PT2RPNTLVWZ0RST1:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
- break;
- case X86::PT2RPNTLVWZ1RS:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
- break;
- case X86::PT2RPNTLVWZ1RST1:
- Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
- break;
- }
-#undef GET_EGPR_IF_ENABLED
- MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
- MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
-
- MIB.add(MI.getOperand(1)); // base
- MIB.add(MI.getOperand(2)); // scale
- MIB.add(MI.getOperand(3)); // index
- MIB.add(MI.getOperand(4)); // displacement
- MIB.add(MI.getOperand(5)); // segment
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
- }
- case X86::PTTRANSPOSED:
- case X86::PTCONJTFP16: {
- const DebugLoc &DL = MI.getDebugLoc();
- unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
- : X86::TCONJTFP16;
-
- MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
- MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
- MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
-
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
- }
case X86::PTCVTROWPS2BF16Hrri:
case X86::PTCVTROWPS2BF16Lrri:
case X86::PTCVTROWPS2PHHrri:
@@ -53501,80 +53347,6 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Look for a RMW operation that only touches one bit of a larger than legal
-// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value.
-static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- using namespace SDPatternMatch;
-
- // Only handle normal stores and its chain was a matching normal load.
- auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
- if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
- !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
- Ld->getBasePtr() != St->getBasePtr() ||
- Ld->getOffset() != St->getOffset())
- return SDValue();
-
- SDValue LoadVal(Ld, 0);
- SDValue StoredVal = St->getValue();
- EVT VT = StoredVal.getValueType();
-
- // Only narrow larger than legal scalar integers.
- if (!VT.isScalarInteger() ||
- VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
- return SDValue();
-
- // BTR: X & ~(1 << ShAmt)
- // BTS: X | (1 << ShAmt)
- // BTC: X ^ (1 << ShAmt)
- SDValue ShAmt;
- if (!StoredVal.hasOneUse() ||
- !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
- m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
- sd_match(StoredVal,
- m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
- sd_match(StoredVal,
- m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt))))))
- return SDValue();
-
- // Ensure the shift amount is in bounds.
- KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
- if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
- return SDValue();
-
- // Split the shift into an alignment shift that moves the active i32 block to
- // the bottom bits for truncation and a modulo shift that can act on the i32.
- EVT AmtVT = ShAmt.getValueType();
- SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
- DAG.getSignedConstant(-32LL, DL, AmtVT));
- SDValue ModuloAmt =
- DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
-
- // Compute the byte offset for the i32 block that is changed by the RMW.
- // combineTruncate will adjust the load for us in a similar way.
- EVT PtrVT = St->getBasePtr().getValueType();
- SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT);
- SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs,
- DAG.getShiftAmountConstant(3, PtrVT, DL));
- SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL,
- SDNodeFlags::NoUnsignedWrap);
-
- // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
- X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
-
- SDValue Mask =
- DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
- DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
- if (StoredVal.getOpcode() == ISD::AND)
- Mask = DAG.getNOT(DL, Mask, MVT::i32);
-
- SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
- return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
- Align(), St->getMemOperand()->getFlags());
-}
-
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -53801,9 +53573,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
}
}
- if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget))
- return R;
-
// Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
// store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
@@ -54591,6 +54360,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
+ using namespace SDPatternMatch;
if (!VT.isVector() || !Subtarget.hasSSSE3())
return SDValue();
@@ -54600,42 +54370,19 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
return SDValue();
SDValue SSatVal = detectSSatPattern(In, VT);
- if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
- return SDValue();
-
- // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
- // of multiplies from even/odd elements.
- SDValue N0 = SSatVal.getOperand(0);
- SDValue N1 = SSatVal.getOperand(1);
-
- if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
+ if (!SSatVal)
return SDValue();
- SDValue N00 = N0.getOperand(0);
- SDValue N01 = N0.getOperand(1);
- SDValue N10 = N1.getOperand(0);
- SDValue N11 = N1.getOperand(1);
-
+ // See if this is a signed saturation of an ADD, adding pairs of multiplies
+ // from even/odd elements, from zero_extend/sign_extend operands.
+ //
// TODO: Handle constant vectors and use knownbits/computenumsignbits?
- // Canonicalize zero_extend to LHS.
- if (N01.getOpcode() == ISD::ZERO_EXTEND)
- std::swap(N00, N01);
- if (N11.getOpcode() == ISD::ZERO_EXTEND)
- std::swap(N10, N11);
-
- // Ensure we have a zero_extend and a sign_extend.
- if (N00.getOpcode() != ISD::ZERO_EXTEND ||
- N01.getOpcode() != ISD::SIGN_EXTEND ||
- N10.getOpcode() != ISD::ZERO_EXTEND ||
- N11.getOpcode() != ISD::SIGN_EXTEND)
+ SDValue N00, N01, N10, N11;
+ if (!sd_match(SSatVal,
+ m_Add(m_Mul(m_ZExt(m_Value(N00)), m_SExt(m_Value(N01))),
+ m_Mul(m_ZExt(m_Value(N10)), m_SExt(m_Value(N11))))))
return SDValue();
- // Peek through the extends.
- N00 = N00.getOperand(0);
- N01 = N01.getOperand(0);
- N10 = N10.getOperand(0);
- N11 = N11.getOperand(0);
-
// Ensure the extend is from vXi8.
if (N00.getValueType().getVectorElementType() != MVT::i8 ||
N01.getValueType().getVectorElementType() != MVT::i8 ||
@@ -54758,9 +54505,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
// truncation, see if we can convert the shift into a pointer offset instead.
// Limit this to normal (non-ext) scalar integer loads.
if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL &&
- Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) &&
- (Src.getOperand(0).hasOneUse() ||
- !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) {
+ Src.hasOneUse() && Src.getOperand(0).hasOneUse() &&
+ ISD::isNormalLoad(Src.getOperand(0).getNode())) {
auto *Ld = cast<LoadSDNode>(Src.getOperand(0));
if (Ld->isSimple() && VT.isByteSized() &&
isPowerOf2_64(VT.getSizeInBits())) {
@@ -54768,9 +54514,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
// Check the shift amount is byte aligned.
// Check the truncation doesn't use any shifted in (zero) top bits.
+ // Check the shift amount doesn't depend on the original load.
if (KnownAmt.countMinTrailingZeros() >= 3 &&
KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() -
- VT.getSizeInBits())) {
+ VT.getSizeInBits()) &&
+ !Ld->isPredecessorOf(ShAmt.getNode())) {
EVT PtrVT = Ld->getBasePtr().getValueType();
SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT);
SDValue PtrByteOfs =
@@ -56558,7 +56306,6 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- using namespace SDPatternMatch;
const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
const SDValue LHS = N->getOperand(0);
const SDValue RHS = N->getOperand(1);
@@ -56617,37 +56364,6 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
- // If we're performing a bit test on a larger than legal type, attempt
- // to (aligned) shift down the value to the bottom 32-bits and then
- // perform the bittest on the i32 value.
- // ICMP_ZERO(AND(X,SHL(1,IDX)))
- // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31))))
- if (isNullConstant(RHS) &&
- OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 64 : 32)) {
- SDValue X, ShAmt;
- if (sd_match(LHS, m_OneUse(m_And(m_Value(X),
- m_Shl(m_One(), m_Value(ShAmt)))))) {
- // Only attempt this if the shift amount is known to be in bounds.
- KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
- if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) {
- EVT AmtVT = ShAmt.getValueType();
- SDValue AlignAmt =
- DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
- DAG.getSignedConstant(-32LL, DL, AmtVT));
- SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
- DAG.getConstant(31, DL, AmtVT));
- SDValue Mask = DAG.getNode(
- ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
- DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
- X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt);
- X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
- X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask);
- return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32),
- CC);
- }
- }
- }
-
// cmpeq(trunc(x),C) --> cmpeq(x,C)
// cmpne(trunc(x),C) --> cmpne(x,C)
// iff x upper bits are zero.
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 69a5115..522782a 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -338,188 +338,6 @@ let Predicates = [HasAMXFP8, In64BitMode] in {
}
}
-let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSystem] in {
- let mayStore = 1 in
- def PTILEPAIRSTORE : PseudoI<(outs), (ins opaquemem:$src1, TILEPair:$src2), []>;
- let mayLoad = 1 in
- def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>;
-}
-
-multiclass T2RPNTLVW_Base<bits<8> op1, bits<8> op2, string rs, string suffix> {
- def Z0#rs#suffix : I<op1, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src),
- "t2rpntlvwz0" #!tolower(rs)# "\t{$src, $dst|$dst, $src}", []>, PS;
- def Z0#rs#T1#suffix : I<op2, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src),
- "t2rpntlvwz0" #!tolower(rs)# "t1\t{$src, $dst|$dst, $src}", []>, PS;
- def Z1#rs#suffix : I<op1, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src),
- "t2rpntlvwz1" #!tolower(rs)# "\t{$src, $dst|$dst, $src}", []>, PD;
- def Z1#rs#T1#suffix : I<op2, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src),
- "t2rpntlvwz1" #!tolower(rs)# "t1\t{$src, $dst|$dst, $src}", []>, PD;
-}
-
-let Predicates = [HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in
- defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "">, T8, VEX;
-
-let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in
- defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "_EVEX">, T8, EVEX, NoCD8;
-
-let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in
- defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "">, T_MAP5, VEX;
-
-let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in
- defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "_EVEX">, T_MAP5, EVEX, NoCD8;
-
-let Predicates = [HasAMXTRANSPOSE, In64BitMode] in {
- let SchedRW = [WriteSystem] in {
- def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src),
- "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8, XS;
- let isPseudo = true in {
- def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst),
- (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
- []>;
- def PT2RPNTLVWZ0T1V : PseudoI<(outs TILEPair:$dst),
- (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
- []>;
- def PT2RPNTLVWZ1V : PseudoI<(outs TILEPair:$dst),
- (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
- []>;
- def PT2RPNTLVWZ1T1V : PseudoI<(outs TILEPair:$dst),
- (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
- []>;
- }
-
- def PTTRANSPOSEDV : PseudoI<(outs TILE:$dst),
- (ins GR16:$src1, GR16:$src2, TILE:$src),
- [(set TILE: $dst,
- (int_x86_ttransposed_internal GR16:$src1, GR16:$src2,
- TILE:$src))]>;
-
- let usesCustomInserter = 1 in {
- def PT2RPNTLVWZ0 : PseudoI<(outs), (ins u8imm:$dst,
- sibmem:$src1), []>;
- def PT2RPNTLVWZ0T1 : PseudoI<(outs), (ins u8imm:$dst,
- sibmem:$src1), []>;
- def PT2RPNTLVWZ1 : PseudoI<(outs), (ins u8imm:$dst,
- sibmem:$src1), []>;
- def PT2RPNTLVWZ1T1 : PseudoI<(outs), (ins u8imm:$dst,
- sibmem:$src1), []>;
- def PTTRANSPOSED : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src),
- [(int_x86_ttransposed timm:$dst, timm:$src)]>;
- }
- }
-} // HasAMXTILE, HasAMXTRANSPOSE
-
-let Predicates = [HasAMXBF16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
- let Constraints = "$src1 = $dst" in
- def TTDPBF16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst),
- (ins TILE:$src1, TILE:$src2, TILE:$src3),
- "ttdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- []>, VEX, VVVV, T8,XS;
- let Constraints = "$src4 = $dst" in
- def PTTDPBF16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
- GR16:$src2, GR16:$src3, TILE:$src4,
- TILE:$src5, TILE:$src6),
- [(set TILE: $dst,
- (int_x86_ttdpbf16ps_internal GR16:$src1, GR16:$src2,
- GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
- let usesCustomInserter = 1 in
- def PTTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
- [(int_x86_ttdpbf16ps timm:$src1, timm:$src2, timm:$src3)]>;
-}
-
-let Predicates = [HasAMXFP16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
- let Constraints = "$src1 = $dst" in
- def TTDPFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst),
- (ins TILE:$src1, TILE:$src2, TILE:$src3),
- "ttdpfp16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- []>, VEX, VVVV, T8,XD;
- let Constraints = "$src4 = $dst" in
- def PTTDPFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
- GR16:$src2, GR16:$src3, TILE:$src4,
- TILE:$src5, TILE:$src6),
- [(set TILE: $dst,
- (int_x86_ttdpfp16ps_internal GR16:$src1, GR16:$src2,
- GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
- let usesCustomInserter = 1 in
- def PTTDPFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
- [(int_x86_ttdpfp16ps timm:$src1, timm:$src2, timm:$src3)]>;
-}
-
-let Predicates = [HasAMXCOMPLEX, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
- let Constraints = "$src1 = $dst" in {
- def TTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst),
- (ins TILE:$src1, TILE:$src2, TILE:$src3),
- "ttcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
- []>, VEX, VVVV, T8,XD;
- def TTCMMRLFP16PS: I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst),
- (ins TILE:$src1, TILE:$src2, TILE:$src3),
- "ttcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
- []>, VEX, VVVV, T8,XS;
- def TCONJTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst),
- (ins TILE:$src1, TILE:$src2, TILE:$src3),
- "tconjtcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
- []>, VEX, VVVV, WIG, T8,PS;
- }
- def TCONJTFP16 : I<0x6b, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src),
- "tconjtfp16\t{$src, $dst|$dst, $src}", []>, VEX, T8,PD;
-
- let Constraints = "$src4 = $dst" in {
- def PTTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
- GR16:$src2, GR16:$src3, TILE:$src4,
- TILE:$src5, TILE:$src6),
- [(set TILE: $dst,
- (int_x86_ttcmmimfp16ps_internal GR16:$src1, GR16:$src2,
- GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
- def PTTCMMRLFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
- GR16:$src2, GR16:$src3, TILE:$src4,
- TILE:$src5, TILE:$src6),
- [(set TILE: $dst,
- (int_x86_ttcmmrlfp16ps_internal GR16:$src1, GR16:$src2,
- GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
- def PTCONJTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
- GR16:$src2, GR16:$src3, TILE:$src4,
- TILE:$src5, TILE:$src6),
- [(set TILE: $dst,
- (int_x86_tconjtcmmimfp16ps_internal GR16:$src1, GR16:$src2,
- GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
- }
- def PTCONJTFP16V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3),
- [(set TILE: $dst, (int_x86_tconjtfp16_internal GR16:$src1, GR16:$src2, TILE:$src3))]>;
-
- let usesCustomInserter = 1 in {
- def PTTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
- [(int_x86_ttcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>;
- def PTTCMMRLFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
- [(int_x86_ttcmmrlfp16ps timm:$src1, timm:$src2, timm:$src3)]>;
- def PTCONJTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
- [(int_x86_tconjtcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>;
- def PTCONJTFP16 : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src),
- [(int_x86_tconjtfp16 timm:$dst, timm:$src)]>;
- }
-}
-
-let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
- let isPseudo = true in {
- def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst),
- (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
- []>;
- def PT2RPNTLVWZ0RST1V : PseudoI<(outs TILEPair:$dst),
- (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
- []>;
- def PT2RPNTLVWZ1RSV : PseudoI<(outs TILEPair:$dst),
- (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
- []>;
- def PT2RPNTLVWZ1RST1V : PseudoI<(outs TILEPair:$dst),
- (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
- []>;
- }
- let usesCustomInserter = 1 in {
- def PT2RPNTLVWZ0RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
- def PT2RPNTLVWZ0RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
- def PT2RPNTLVWZ1RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
- def PT2RPNTLVWZ1RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
- }
-} // HasAMXMOVRS, HasAMXTRANSPOSE
-
multiclass TILELOADDRS_Base<string suffix> {
def suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1),
"tileloaddrs\t{$src1, $dst|$dst, $src1}", []>, T8, XD;
@@ -721,29 +539,3 @@ let Predicates = [HasAMXTF32, In64BitMode] in {
}
} // SchedRW = [WriteSystem]
} // HasAMXTF32
-
-let Predicates = [HasAMXTF32, HasAMXTRANSPOSE, In64BitMode] in {
- let SchedRW = [WriteSystem] in {
- let Constraints = "$src1 = $dst" in {
- def TTMMULTF32PS: I<0x48, MRMSrcReg4VOp3, (outs TILE:$dst),
- (ins TILE:$src1, TILE:$src2, TILE:$src3),
- "ttmmultf32ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- []>, VEX, VVVV, T8, PS;
- }
- let Constraints = "$src4 = $dst" in {
- def PTTMMULTF32PSV : PseudoI<(outs TILE:$dst),
- (ins GR16:$src1, GR16:$src2, GR16:$src3,
- TILE:$src4, TILE:$src5, TILE:$src6),
- [(set TILE:$dst,
- (int_x86_ttmmultf32ps_internal GR16:$src1,
- GR16:$src2, GR16:$src3, TILE:$src4,
- TILE:$src5, TILE:$src6))]>;
- }
- let usesCustomInserter = 1 in {
- def PTTMMULTF32PS : PseudoI<(outs),
- (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
- [(int_x86_ttmmultf32ps timm:$src1, timm:$src2,
- timm:$src3)]>;
- }
- } // SchedRW = [WriteSystem]
-} // HasAMXTF32, HasAMXTRANSPOSE
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 5c23f91..6b2a7a4 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4544,11 +4544,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
: GET_EGPR_IF_ENABLED(X86::TILESTORED);
#undef GET_EGPR_IF_ENABLED
- case 2048:
- assert(X86::TILEPAIRRegClass.hasSubClassEq(RC) &&
- "Unknown 2048-byte regclass");
- assert(STI.hasAMXTILE() && "Using 2048-bit register requires AMX-TILE");
- return Load ? X86::PTILEPAIRLOAD : X86::PTILEPAIRSTORE;
}
}
@@ -4743,8 +4738,6 @@ static bool isAMXOpcode(unsigned Opc) {
case X86::TILESTORED:
case X86::TILELOADD_EVEX:
case X86::TILESTORED_EVEX:
- case X86::PTILEPAIRLOAD:
- case X86::PTILEPAIRSTORE:
return true;
}
}
@@ -4757,8 +4750,7 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
default:
llvm_unreachable("Unexpected special opcode!");
case X86::TILESTORED:
- case X86::TILESTORED_EVEX:
- case X86::PTILEPAIRSTORE: {
+ case X86::TILESTORED_EVEX: {
// tilestored %tmm, (%sp, %idx)
MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
@@ -4772,8 +4764,7 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
break;
}
case X86::TILELOADD:
- case X86::TILELOADD_EVEX:
- case X86::PTILEPAIRLOAD: {
+ case X86::TILELOADD_EVEX: {
// tileloadd (%sp, %idx), %tmm
MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
diff --git a/llvm/lib/Target/X86/X86InstrOperands.td b/llvm/lib/Target/X86/X86InstrOperands.td
index 5207eca..6ba07f7 100644
--- a/llvm/lib/Target/X86/X86InstrOperands.td
+++ b/llvm/lib/Target/X86/X86InstrOperands.td
@@ -536,10 +536,3 @@ def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> {
def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> {
let ParserMatchClass = VK16PairAsmOperand;
}
-
-let RenderMethod = "addTILEPairOperands" in
- def TILEPairAsmOperand : AsmOperandClass { let Name = "TILEPair"; }
-
-def TILEPair : RegisterOperand<TILEPAIR, "printTILEPair"> {
- let ParserMatchClass = TILEPairAsmOperand;
-}
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index c20bb05..98104a6f 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -183,7 +183,6 @@ def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">;
def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">;
def HasAMXMOVRS : Predicate<"Subtarget->hasAMXMOVRS()">;
-def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">;
def HasAMXAVX512 : Predicate<"Subtarget->hasAMXAVX512()">;
def HasAMXTF32 : Predicate<"Subtarget->hasAMXTF32()">;
def HasUINTR : Predicate<"Subtarget->hasUINTR()">;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 8ffd454..2fc5d38 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -74,22 +74,6 @@ static bool isAMXCast(Instruction *II) {
match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value()));
}
-// Some instructions may return more than one tiles.
-// e.g: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal
-static unsigned getNumDefTiles(IntrinsicInst *II) {
- Type *Ty = II->getType();
- if (Ty->isX86_AMXTy())
- return 1;
-
- unsigned Num = 0;
- for (unsigned i = 0; i < Ty->getNumContainedTypes(); i++) {
- Type *STy = Ty->getContainedType(i);
- if (STy->isX86_AMXTy())
- Num++;
- }
- return Num;
-}
-
static bool isAMXIntrinsic(Value *I) {
auto *II = dyn_cast<IntrinsicInst>(I);
if (!II)
@@ -98,7 +82,7 @@ static bool isAMXIntrinsic(Value *I) {
return false;
// Check if return type or parameter is x86_amx. If it is x86_amx
// the intrinsic must be x86 amx intrinsics.
- if (getNumDefTiles(II) > 0)
+ if (II->getType()->isX86_AMXTy())
return true;
for (Value *V : II->args()) {
if (V->getType()->isX86_AMXTy())
@@ -137,27 +121,7 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) {
llvm_unreachable("No terminator in the entry block!");
}
-class ShapeCalculator {
-private:
- const TargetMachine *TM = nullptr;
-
- // In AMX intrinsics we let Shape = {Row, Col}, but the
- // RealCol = Col / ElementSize. We may use the RealCol
- // as a new Row for other new created AMX intrinsics.
- std::map<Value *, Value *> Col2Row, Row2Col;
-
-public:
- ShapeCalculator(const TargetMachine *TargetM) : TM(TargetM) {}
- std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
- std::pair<Value *, Value *> getShape(PHINode *Phi);
- Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity);
- Value *getColFromRow(Instruction *II, Value *V, unsigned Granularity);
-};
-
-Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V,
- unsigned Granularity) {
- if (auto It = Col2Row.find(V); It != Col2Row.end())
- return It->second;
+static Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity) {
IRBuilder<> Builder(II);
Value *RealRow = nullptr;
if (isa<ConstantInt>(V))
@@ -186,47 +150,16 @@ Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V,
getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
RealRow = NewBuilder.CreateUDiv(V, NewBuilder.getInt16(Granularity));
}
- Col2Row[V] = RealRow;
return RealRow;
}
-Value *ShapeCalculator::getColFromRow(Instruction *II, Value *V,
- unsigned Granularity) {
- if (auto It = Row2Col.find(V); It != Row2Col.end())
- return It->second;
- IRBuilder<> Builder(II);
- Value *RealCol = nullptr;
- if (isa<ConstantInt>(V))
- RealCol =
- Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) * Granularity);
- else if (isa<Instruction>(V)) {
- Builder.SetInsertPoint(cast<Instruction>(V));
- RealCol = Builder.CreateNUWMul(V, Builder.getInt16(Granularity));
- cast<Instruction>(RealCol)->moveAfter(cast<Instruction>(V));
- } else {
- // When it is not a const value and it is a function argument, we create
- // Row at the entry bb.
- IRBuilder<> NewBuilder(
- getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
- RealCol = NewBuilder.CreateNUWMul(V, NewBuilder.getInt16(Granularity));
- }
- Row2Col[V] = RealCol;
- return RealCol;
-}
-
// TODO: Refine the row and col-in-bytes of tile to row and col of matrix.
-std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
- unsigned OpNo) {
- (void)TM;
+std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
IRBuilder<> Builder(II);
Value *Row = nullptr, *Col = nullptr;
switch (II->getIntrinsicID()) {
default:
llvm_unreachable("Expect amx intrinsics");
- case Intrinsic::x86_t2rpntlvwz0_internal:
- case Intrinsic::x86_t2rpntlvwz0t1_internal:
- case Intrinsic::x86_t2rpntlvwz1_internal:
- case Intrinsic::x86_t2rpntlvwz1t1_internal:
case Intrinsic::x86_tileloadd64_internal:
case Intrinsic::x86_tileloaddt164_internal:
case Intrinsic::x86_tilestored64_internal:
@@ -271,13 +204,6 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
}
break;
}
- case Intrinsic::x86_ttransposed_internal:
- case Intrinsic::x86_tconjtfp16_internal: {
- assert((OpNo == 2) && "Illegal Operand Number.");
- Row = getRowFromCol(II, II->getArgOperand(1), 4);
- Col = getColFromRow(II, II->getArgOperand(0), 4);
- break;
- }
case Intrinsic::x86_tcvtrowd2ps_internal:
case Intrinsic::x86_tcvtrowps2bf16h_internal:
case Intrinsic::x86_tcvtrowps2bf16l_internal:
@@ -289,34 +215,12 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
Col = II->getArgOperand(1);
break;
}
- case Intrinsic::x86_ttdpbf16ps_internal:
- case Intrinsic::x86_ttdpfp16ps_internal:
- case Intrinsic::x86_ttcmmimfp16ps_internal:
- case Intrinsic::x86_ttcmmrlfp16ps_internal:
- case Intrinsic::x86_tconjtcmmimfp16ps_internal:
- case Intrinsic::x86_ttmmultf32ps_internal: {
- switch (OpNo) {
- case 3:
- Row = II->getArgOperand(0);
- Col = II->getArgOperand(1);
- break;
- case 4:
- Row = getRowFromCol(II, II->getArgOperand(2), 4);
- Col = getColFromRow(II, II->getArgOperand(0), 4);
- break;
- case 5:
- Row = getRowFromCol(II, II->getArgOperand(2), 4);
- Col = II->getArgOperand(1);
- break;
- }
- break;
- }
}
return std::make_pair(Row, Col);
}
-std::pair<Value *, Value *> ShapeCalculator::getShape(PHINode *Phi) {
+static std::pair<Value *, Value *> getShape(PHINode *Phi) {
Use &U = *(Phi->use_begin());
unsigned OpNo = U.getOperandNo();
User *V = U.getUser();
@@ -349,15 +253,14 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(PHINode *Phi) {
namespace {
class X86LowerAMXType {
Function &Func;
- ShapeCalculator *SC;
// In AMX intrinsics we let Shape = {Row, Col}, but the
// RealCol = Col / ElementSize. We may use the RealCol
// as a new Row for other new created AMX intrinsics.
- std::map<Value *, Value *> Col2Row, Row2Col;
+ std::map<Value *, Value *> Col2Row;
public:
- X86LowerAMXType(Function &F, ShapeCalculator *ShapeC) : Func(F), SC(ShapeC) {}
+ X86LowerAMXType(Function &F) : Func(F) {}
bool visit();
void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
@@ -374,7 +277,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
Use &U = *(Bitcast->use_begin());
unsigned OpNo = U.getOperandNo();
auto *II = cast<IntrinsicInst>(U.getUser());
- std::tie(Row, Col) = SC->getShape(II, OpNo);
+ std::tie(Row, Col) = getShape(II, OpNo);
IRBuilder<> Builder(Bitcast);
// Use the maximun column as stride.
Value *Stride = Builder.getInt64(64);
@@ -454,7 +357,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
Builder.CreateStore(Src, AllocaAddr);
// TODO we can pick an constant operand for the shape.
Value *Row = nullptr, *Col = nullptr;
- std::tie(Row, Col) = SC->getShape(II, OpNo);
+ std::tie(Row, Col) = getShape(II, OpNo);
std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
Value *NewInst =
Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, Args);
@@ -594,18 +497,11 @@ static Value *getAllocaPos(BasicBlock *BB) {
static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!");
- auto *II = dyn_cast<IntrinsicInst>(TileDef);
- unsigned Idx = 0;
- // Extract tile from multiple tiles' def.
- if (auto *Extr = dyn_cast<ExtractValueInst>(TileDef)) {
- assert(Extr->hasIndices() && "Tile extract miss index!");
- Idx = Extr->getIndices()[0];
- II = cast<IntrinsicInst>(Extr->getOperand(0));
- }
+ auto *II = cast<IntrinsicInst>(TileDef);
assert(II && "Not tile intrinsic!");
- Value *Row = II->getOperand(Idx);
- Value *Col = II->getOperand(Idx + 1);
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
BasicBlock *BB = TileDef->getParent();
BasicBlock::iterator Iter = TileDef->getIterator();
@@ -624,20 +520,14 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
// Get tile shape.
IntrinsicInst *II = nullptr;
- unsigned Idx = 0;
if (IsPHI) {
Value *PhiOp = cast<PHINode>(V)->getIncomingValue(0);
II = cast<IntrinsicInst>(PhiOp);
- } else if (auto *Extr = dyn_cast<ExtractValueInst>(V)) {
- // Extract tile from multiple tiles' def.
- assert(Extr->hasIndices() && "Tile extract miss index!");
- Idx = Extr->getIndices()[0];
- II = cast<IntrinsicInst>(Extr->getOperand(0));
} else {
II = cast<IntrinsicInst>(V);
}
- Value *Row = II->getOperand(Idx);
- Value *Col = II->getOperand(Idx + 1);
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
Instruction *UserI = cast<Instruction>(U.getUser());
IRBuilder<> Builder(UserI);
@@ -848,12 +738,10 @@ namespace {
class X86LowerAMXCast {
Function &Func;
- ShapeCalculator *SC;
std::unique_ptr<DominatorTree> DT;
public:
- X86LowerAMXCast(Function &F, ShapeCalculator *ShapeC)
- : Func(F), SC(ShapeC), DT(nullptr) {}
+ X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {}
bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST);
bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD);
bool combineTilezero(IntrinsicInst *Cast);
@@ -932,7 +820,7 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
if (!isa<UndefValue>(IncValue) && !IncConst->isZeroValue())
return false;
Value *Row = nullptr, *Col = nullptr;
- std::tie(Row, Col) = SC->getShape(OldPN);
+ std::tie(Row, Col) = getShape(OldPN);
// TODO: If it is not constant the Row and Col must domoniate tilezero
// that we are going to create.
if (!Row || !Col || !isa<Constant>(Row) || !isa<Constant>(Col))
@@ -1063,19 +951,6 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
return true;
}
-static Value *getShapeFromAMXIntrinsic(Value *Inst, unsigned ShapeIdx,
- bool IsRow) {
- if (!isAMXIntrinsic(Inst))
- return nullptr;
-
- auto *II = cast<IntrinsicInst>(Inst);
- if (IsRow)
- return II->getOperand(0);
-
- assert(ShapeIdx < 2 && "Currently 2 shapes in 1 instruction at most!");
- return II->getOperand(ShapeIdx + 1);
-}
-
// %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42)
// store <256 x i32> %43, <256 x i32>* %p, align 64
// -->
@@ -1090,38 +965,13 @@ bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
if (!Tile->hasOneUse())
return false;
- // We don't fetch shape from tilestore, we only get shape from tiledef,
- // so we can set the max tile shape to tilestore for special cases.
+ auto *II = cast<IntrinsicInst>(Tile);
+ // Tile is output from AMX intrinsic. The first operand of the
+ // intrinsic is row, the second operand of the intrinsic is column.
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
+
IRBuilder<> Builder(ST);
- Value *Row = nullptr;
- Value *Col = nullptr;
-
- if (isAMXIntrinsic(Tile)) {
- auto *II = cast<IntrinsicInst>(Tile);
- // Tile is output from AMX intrinsic. The first operand of the
- // intrinsic is row, the second operand of the intrinsic is column.
- Row = II->getOperand(0);
- Col = II->getOperand(1);
- } else {
- // Now we supported multi-tiles value in structure, so we may get tile
- // from extracting multi-tiles structure.
- // For example:
- // %6 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %1,
- // i16 %2, i16 %3, i8* %4, i64 %5)
- // %7 = extractvalue { x86_amx, x86_amx } %6, 0
- // %8 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %7)
- // store <256 x i32> %8, <256 x i32>* %0, align 1024
- //
- // TODO: Currently we only handle extractvalue case, enhance me for other
- // cases if possible.
- auto *II = cast<ExtractValueInst>(Tile);
- assert(II && "We meet unhandle source in fetching tile value!");
- unsigned ShapeIdx = II->getIndices()[0];
- Value *Tiles = II->getOperand(0);
- Row = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, true);
- Col = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, false);
- }
- assert(Row && Col && "Shape got failed!");
// Stride should be equal to col(measured by bytes)
Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
@@ -1146,7 +996,7 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
// shape information through def-use chain.
if (!isAMXIntrinsic(II))
return false;
- std::tie(Row, Col) = SC->getShape(II, OpNo);
+ std::tie(Row, Col) = getShape(II, OpNo);
IRBuilder<> Builder(LD);
// Stride should be equal to col(measured by bytes)
Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
@@ -1189,7 +1039,7 @@ bool X86LowerAMXCast::combineTilezero(IntrinsicInst *Cast) {
if (!isAMXIntrinsic(II))
return false;
- std::tie(Row, Col) = SC->getShape(II, OpNo);
+ std::tie(Row, Col) = getShape(II, OpNo);
IRBuilder<> Builder(Cast);
Value *NewInst =
@@ -1384,7 +1234,7 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) {
Builder.CreateStore(Src, AllocaAddr);
// TODO we can pick an constant operand for the shape.
Value *Row = nullptr, *Col = nullptr;
- std::tie(Row, Col) = SC->getShape(II, OpNo);
+ std::tie(Row, Col) = getShape(II, OpNo);
std::array<Value *, 4> Args = {
Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())};
Value *NewInst =
@@ -1445,14 +1295,13 @@ bool lowerAmxType(Function &F, const TargetMachine *TM,
return false;
bool C = false;
- ShapeCalculator SC(TM);
- X86LowerAMXCast LAC(F, &SC);
+ X86LowerAMXCast LAC(F);
C |= LAC.combineAMXcast(TLI);
// There might be remaining AMXcast after combineAMXcast and they should be
// handled elegantly.
C |= LAC.transformAllAMXCast();
- X86LowerAMXType LAT(F, &SC);
+ X86LowerAMXType LAT(F);
C |= LAT.visit();
// Prepare for fast register allocation at O0.
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index 2a1c499..8a1d00d 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -141,15 +141,10 @@ class X86PreTileConfig : public MachineFunctionPass {
if (!MO.isReg() || !MO.getReg().isVirtual())
return false;
- unsigned Shapes = 0;
- if (MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID)
- Shapes = 1;
- if (MRI->getRegClass(MO.getReg())->getID() == X86::TILEPAIRRegClassID)
- Shapes = 2;
- if (!Shapes)
+ if (MRI->getRegClass(MO.getReg())->getID() != X86::TILERegClassID)
return false;
- collectShapeInfo(MI, Shapes);
+ collectShapeInfo(MI);
return true;
}
@@ -165,7 +160,7 @@ class X86PreTileConfig : public MachineFunctionPass {
}
/// Collect the shape def information for later use.
- void collectShapeInfo(MachineInstr &MI, unsigned Shapes);
+ void collectShapeInfo(MachineInstr &MI);
/// Try to hoist shapes definded below AMX instructions.
bool hoistShapesInBB(MachineBasicBlock *MBB, SmallVectorImpl<MIRef> &Shapes) {
@@ -231,7 +226,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
"Tile Register Pre-configure", false, false)
-void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
+void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) {
MIRef MIR(MI, MBB);
auto &Refs = ShapeBBs[MBB];
@@ -240,10 +235,8 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
Refs.insert(I, MIR);
};
- // All shapes have same row in multi-tile operand.
- SmallVector<Register, 8> WorkList;
- for (unsigned I = 1; I < Shapes + 2; ++I)
- WorkList.push_back(MI.getOperand(I).getReg());
+ SmallVector<Register, 8> WorkList(
+ {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()});
while (!WorkList.empty()) {
Register R = WorkList.pop_back_val();
MachineInstr *DefMI = MRI->getVRegDef(R);
@@ -252,13 +245,6 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
if (DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second)
continue;
- // This happens when column = 0 in multi-tile operand.
- if (DefMI->getOpcode() == X86::COPY) {
- MachineInstr *MI = MRI->getVRegDef(DefMI->getOperand(1).getReg());
- if (MI && MI->isMoveImmediate())
- continue;
- }
-
if (DefMI->isPHI()) {
for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2)
if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB()))
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 76979e3..72f3813 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -597,10 +597,6 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(*AI);
}
- // Reserve low half pair registers in case they are used by RA aggressively.
- Reserved.set(X86::TMM0_TMM1);
- Reserved.set(X86::TMM2_TMM3);
-
assert(checkAllSuperRegsMarked(Reserved,
{X86::SIL, X86::DIL, X86::BPL, X86::SPL,
X86::SIH, X86::DIH, X86::BPH, X86::SPH}));
@@ -621,7 +617,7 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const {
// and try to return the minimum number of registers supported by the target.
static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) &&
(X86::K6_K7 + 1 == X86::TMMCFG) &&
- (X86::TMM6_TMM7 + 1 == X86::R16) &&
+ (X86::TMM7 + 1 == X86::R16) &&
(X86::R31WH + 1 == X86::NUM_TARGET_REGS),
"Register number may be incorrect");
@@ -694,8 +690,7 @@ bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF,
}
bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const {
- return RC->getID() == X86::TILERegClassID ||
- RC->getID() == X86::TILEPAIRRegClassID;
+ return RC->getID() == X86::TILERegClassID;
}
void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
@@ -1062,17 +1057,9 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
case X86::PTDPFP16PSV:
case X86::PTCMMIMFP16PSV:
case X86::PTCMMRLFP16PSV:
- case X86::PTTRANSPOSEDV:
- case X86::PTTDPBF16PSV:
- case X86::PTTDPFP16PSV:
- case X86::PTTCMMIMFP16PSV:
- case X86::PTTCMMRLFP16PSV:
- case X86::PTCONJTCMMIMFP16PSV:
- case X86::PTCONJTFP16V:
case X86::PTILELOADDRSV:
case X86::PTILELOADDRST1V:
case X86::PTMMULTF32PSV:
- case X86::PTTMMULTF32PSV:
case X86::PTDPBF8PSV:
case X86::PTDPBHF8PSV:
case X86::PTDPHBF8PSV:
@@ -1083,56 +1070,7 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
VRM->assignVirt2Shape(VirtReg, Shape);
return Shape;
}
- case X86::PT2RPNTLVWZ0V:
- case X86::PT2RPNTLVWZ0T1V:
- case X86::PT2RPNTLVWZ1V:
- case X86::PT2RPNTLVWZ1T1V:
- case X86::PT2RPNTLVWZ0RSV:
- case X86::PT2RPNTLVWZ0RST1V:
- case X86::PT2RPNTLVWZ1RSV:
- case X86::PT2RPNTLVWZ1RST1V: {
- MachineOperand &MO1 = MI->getOperand(1);
- MachineOperand &MO2 = MI->getOperand(2);
- MachineOperand &MO3 = MI->getOperand(3);
- ShapeT Shape({&MO1, &MO2, &MO1, &MO3}, MRI);
- VRM->assignVirt2Shape(VirtReg, Shape);
- return Shape;
- }
- }
-}
-
-static bool canHintShape(ShapeT &PhysShape, ShapeT &VirtShape) {
- unsigned PhysShapeNum = PhysShape.getShapeNum();
- unsigned VirtShapeNum = VirtShape.getShapeNum();
-
- if (PhysShapeNum < VirtShapeNum)
- return false;
-
- if (PhysShapeNum == VirtShapeNum) {
- if (PhysShapeNum == 1)
- return PhysShape == VirtShape;
-
- for (unsigned I = 0; I < PhysShapeNum; I++) {
- ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I));
- ShapeT VShape(VirtShape.getRow(I), VirtShape.getCol(I));
- if (VShape != PShape)
- return false;
- }
- return true;
- }
-
- // Hint subreg of mult-tile reg to single tile reg.
- if (VirtShapeNum == 1) {
- for (unsigned I = 0; I < PhysShapeNum; I++) {
- ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I));
- if (VirtShape == PShape)
- return true;
- }
}
-
- // Note: Currently we have no requirement for case of
- // (VirtShapeNum > 1 and PhysShapeNum > VirtShapeNum)
- return false;
}
bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
@@ -1153,7 +1091,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
if (!VRM)
return BaseImplRetVal;
- if (ID != X86::TILERegClassID && ID != X86::TILEPAIRRegClassID) {
+ if (ID != X86::TILERegClassID) {
if (DisableRegAllocNDDHints || !ST.hasNDD() ||
!TRI.isGeneralPurposeRegisterClass(&RC))
return BaseImplRetVal;
@@ -1204,7 +1142,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
return;
}
ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
- if (canHintShape(PhysShape, VirtShape))
+ if (PhysShape == VirtShape)
Hints.push_back(PhysReg);
};
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 99b7910..692e42a 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -30,8 +30,6 @@ let Namespace = "X86" in {
def sub_ymm : SubRegIndex<256>;
def sub_mask_0 : SubRegIndex<-1>;
def sub_mask_1 : SubRegIndex<-1, -1>;
- def sub_t0 : SubRegIndex<8192>;
- def sub_t1 : SubRegIndex<8192, 8192>;
}
//===----------------------------------------------------------------------===//
@@ -432,10 +430,6 @@ def TMM4: X86Reg<"tmm4", 4>;
def TMM5: X86Reg<"tmm5", 5>;
def TMM6: X86Reg<"tmm6", 6>;
def TMM7: X86Reg<"tmm7", 7>;
-// TMM register pairs
-def TPAIRS : RegisterTuples<[sub_t0, sub_t1],
- [(add TMM0, TMM2, TMM4, TMM6),
- (add TMM1, TMM3, TMM5, TMM7)]>;
}
// Floating point stack registers. These don't map one-to-one to the FP
@@ -862,9 +856,6 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
let CopyCost = -1 in // Don't allow copying of tile registers
def TILE : RegisterClass<"X86", [x86amx], 8192,
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
-// Need check alignment 3rd operand size=1024*2*8
-let isAllocatable = 1 in
-def TILEPAIR : RegisterClass<"X86", [untyped], 512, (add TPAIRS)> {let Size = 16384;}
//===----------------------------------------------------------------------===//
// Register categories.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 3d8d0a23..0b1430e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6562,7 +6562,7 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
const Function *Callee,
- const ArrayRef<Type *> &Types) const {
+ ArrayRef<Type *> Types) const {
if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
return false;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 133b366..de5e1c2 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -296,7 +296,7 @@ public:
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const override;
bool areTypesABICompatible(const Function *Caller, const Function *Callee,
- const ArrayRef<Type *> &Type) const override;
+ ArrayRef<Type *> Type) const override;
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override {
return ST->getMaxInlineSizeThreshold();
diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp
index 17a44dd..09ef8fb 100644
--- a/llvm/lib/Target/X86/X86TileConfig.cpp
+++ b/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -74,63 +74,6 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false,
false)
-unsigned getAMXRegNum(MachineRegisterInfo *MRI, Register Reg) {
- if (Reg.isVirtual()) {
- unsigned RegClassID = MRI->getRegClass(Reg)->getID();
- if (RegClassID == X86::TILERegClassID)
- return 1;
- if (RegClassID == X86::TILEPAIRRegClassID)
- return 2;
- } else {
- if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
- return 1;
- if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
- return 2;
- }
- return 0;
-}
-
-static void collectVirtRegShapes(MachineRegisterInfo *MRI, VirtRegMap &VRM,
- Register VirtReg,
- SmallVector<ShapeT, 8> &Phys2Shapes) {
- unsigned Num = getAMXRegNum(MRI, VirtReg);
- MCRegister PhysReg = VRM.getPhys(VirtReg);
- if (!PhysReg)
- return;
-
- if (Num == 1) {
- unsigned Index = PhysReg - X86::TMM0;
- if (!Phys2Shapes[Index].isValid()) {
- ShapeT Shape = VRM.getShape(VirtReg);
- Phys2Shapes[Index] = std::move(Shape);
- return;
- }
- }
- // Split tile pair shape info to 2 single tile shape info. e.g:
- // Put TMM0_TMM1's Shape to TMM0's shape + TMM1's Shape in Phys2Shapes.
- if (Num == 2) {
- unsigned Index0 = (PhysReg - X86::TMM0_TMM1) * 2;
- unsigned Index1 = (PhysReg - X86::TMM0_TMM1) * 2 + 1;
-
- ShapeT Shape = VRM.getShape(VirtReg);
- assert(Shape.getShapeNum() == 2 && "Unexpected shape number!");
-
- if (!Phys2Shapes[Index0].isValid()) {
- ShapeT Shape0(Shape.getRow(0), Shape.getCol(0), MRI);
- Phys2Shapes[Index0] = std::move(Shape0);
- }
-
- if (!Phys2Shapes[Index1].isValid()) {
- ShapeT Shape1(Shape.getRow(1), Shape.getCol(1), MRI);
- Phys2Shapes[Index1] = std::move(Shape1);
- }
- }
-}
-
-static bool isAMXRegClass(MachineRegisterInfo *MRI, Register Reg) {
- return getAMXRegNum(MRI, Reg) > 0;
-}
-
bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
// Early exit in the common case of non-AMX code.
@@ -138,7 +81,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
return false;
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ const X86RegisterInfo *TRI = ST.getRegisterInfo();
const TargetInstrInfo *TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
@@ -176,24 +119,29 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
assert(ConstMI && "Cannot find an insertion point");
unsigned AMXRegNum = TRI->getRegClass(X86::TILERegClassID)->getNumRegs();
- SmallVector<ShapeT, 8> Phys2Shapes(AMXRegNum, ShapeT());
+ SmallVector<Register, 8> Phys2Virt(AMXRegNum, 0);
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
Register VirtReg = Register::index2VirtReg(I);
if (MRI.reg_nodbg_empty(VirtReg))
continue;
- if (!isAMXRegClass(&MRI, VirtReg))
+ if (!TRI->isTileRegisterClass(MRI.getRegClass(VirtReg)))
+ continue;
+ MCRegister PhysReg = VRM.getPhys(VirtReg);
+ if (!PhysReg)
continue;
- collectVirtRegShapes(&MRI, VRM, VirtReg, Phys2Shapes);
+ unsigned Index = PhysReg - X86::TMM0;
+ if (!Phys2Virt[Index])
+ Phys2Virt[Index] = VirtReg;
}
// Fill in the shape of each tile physical register.
for (unsigned I = 0; I < AMXRegNum; ++I) {
- ShapeT Shape = Phys2Shapes[I];
- if (!Shape.isValid())
+ if (!Phys2Virt[I])
continue;
DebugLoc DL;
bool IsRow = true;
MachineInstr *NewMI = nullptr;
+ ShapeT Shape = VRM.getShape(Phys2Virt[I]);
for (auto &R : {Shape.getRow()->getReg(), Shape.getCol()->getReg()}) {
// Here is the data format for the tile config.
// 0 palette
@@ -222,14 +170,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
"Cannot initialize with different shapes");
continue;
}
- if (DefMI.getOperand(1).isImm()) {
- Imm = DefMI.getOperand(1).getImm();
- } else {
- assert(DefMI.getOpcode() == X86::MOV32r0 &&
- "The opcode is assumed to be MOV32r0 if the operand is not "
- "immediate.");
- Imm = 0;
- }
+ Imm = DefMI.getOperand(1).getImm();
NewMI = addFrameReference(
BuildMI(MF.front(), ++ConstMI->getIterator(), DL,
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 0849fc7..c164762 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -2192,7 +2192,6 @@ StringMap<bool> sys::getHostCPUFeatures() {
bool HasLeaf1E = MaxLevel >= 0x1e &&
!getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX);
Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave;
- Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave;
Features["amx-tf32"] = HasLeaf1E && ((EAX >> 6) & 1) && HasAMXSave;
Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave;
Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave;
diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
index d510445..f74d670 100644
--- a/llvm/lib/TargetParser/PPCTargetParser.cpp
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -48,9 +48,9 @@ StringRef normalizeCPUName(StringRef CPUName) {
// accepting it. Clang has always ignored it and passed the
// generic CPU ID to the back end.
return StringSwitch<StringRef>(CPUName)
- .Cases("common", "405", "generic")
- .Cases("ppc440", "440fp", "440")
- .Cases("630", "power3", "pwr3")
+ .Cases({"common", "405"}, "generic")
+ .Cases({"ppc440", "440fp"}, "440")
+ .Cases({"630", "power3"}, "pwr3")
.Case("G3", "g3")
.Case("G4", "g4")
.Case("G4+", "g4+")
@@ -69,7 +69,7 @@ StringRef normalizeCPUName(StringRef CPUName) {
.Case("power9", "pwr9")
.Case("power10", "pwr10")
.Case("power11", "pwr11")
- .Cases("powerpc", "powerpc32", "ppc")
+ .Cases({"powerpc", "powerpc32"}, "ppc")
.Case("powerpc64", "ppc64")
.Case("powerpc64le", "ppc64le")
.Default(CPUName);
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index b13c795..37e8ad9 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -143,7 +143,7 @@ constexpr FeatureBitset FeaturesDiamondRapids =
FeatureAVXVNNIINT8 | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 |
FeatureSM4 | FeatureEGPR | FeatureZU | FeatureCCMP | FeaturePush2Pop2 |
FeaturePPX | FeatureNDD | FeatureNF | FeatureMOVRS | FeatureAMX_MOVRS |
- FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32 | FeatureAMX_TRANSPOSE;
+ FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32;
// Intel Atom processors.
// Bonnell has feature parity with Core2 and adds MOVBE.
@@ -615,7 +615,6 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE;
-constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 =
FeatureAMX_TILE | FeatureAVX10_2;
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index 2f256df..b72d41a 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -127,15 +127,19 @@ static uint64_t computeStackId(const memprof::Frame &Frame) {
return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column);
}
+static AllocationType getAllocType(const AllocationInfo *AllocInfo) {
+ return getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
+ AllocInfo->Info.getAllocCount(),
+ AllocInfo->Info.getTotalLifetime());
+}
+
static AllocationType addCallStack(CallStackTrie &AllocTrie,
const AllocationInfo *AllocInfo,
uint64_t FullStackId) {
SmallVector<uint64_t> StackIds;
for (const auto &StackFrame : AllocInfo->CallStack)
StackIds.push_back(computeStackId(StackFrame));
- auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
- AllocInfo->Info.getAllocCount(),
- AllocInfo->Info.getTotalLifetime());
+ auto AllocType = getAllocType(AllocInfo);
std::vector<ContextTotalSize> ContextSizeInfo;
if (recordContextSizeInfoForAnalysis()) {
auto TotalSize = AllocInfo->Info.getTotalSize();
@@ -405,22 +409,39 @@ handleAllocSite(Instruction &I, CallBase *CI,
const std::set<const AllocationInfo *> &AllocInfoSet,
std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo>
&FullStackIdToAllocMatchInfo) {
+ // TODO: Remove this once the profile creation logic deduplicates contexts
+ // that are the same other than the IsInlineFrame bool. Until then, keep the
+ // largest.
+ DenseMap<uint64_t, const AllocationInfo *> UniqueFullContextIdAllocInfo;
+ for (auto *AllocInfo : AllocInfoSet) {
+ auto FullStackId = computeFullStackId(AllocInfo->CallStack);
+ auto [It, Inserted] =
+ UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo});
+ // If inserted entry, done.
+ if (Inserted)
+ continue;
+ // Keep the larger one, or the noncold one if they are the same size.
+ auto CurSize = It->second->Info.getTotalSize();
+ auto NewSize = AllocInfo->Info.getTotalSize();
+ if ((CurSize > NewSize) ||
+ (CurSize == NewSize &&
+ getAllocType(AllocInfo) != AllocationType::NotCold))
+ continue;
+ It->second = AllocInfo;
+ }
// We may match this instruction's location list to multiple MIB
// contexts. Add them to a Trie specialized for trimming the contexts to
// the minimal needed to disambiguate contexts with unique behavior.
CallStackTrie AllocTrie(&ORE, MaxColdSize);
uint64_t TotalSize = 0;
uint64_t TotalColdSize = 0;
- for (auto *AllocInfo : AllocInfoSet) {
+ for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) {
// Check the full inlined call stack against this one.
// If we found and thus matched all frames on the call, include
// this MIB.
if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
InlinedCallStack)) {
NumOfMemProfMatchedAllocContexts++;
- uint64_t FullStackId = 0;
- if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis())
- FullStackId = computeFullStackId(AllocInfo->CallStack);
auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
TotalSize += AllocInfo->Info.getTotalSize();
if (AllocType == AllocationType::Cold)
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index 78d4a57e..87eba5f 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -58,6 +58,18 @@ static cl::opt<bool>
cl::desc("Writes always set the type"), cl::Hidden,
cl::init(false));
+static cl::opt<bool> ClOutlineInstrumentation(
+ "tysan-outline-instrumentation",
+ cl::desc("Uses function calls for all TySan instrumentation, reducing "
+ "ELF size"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClVerifyOutlinedInstrumentation(
+ "tysan-verify-outlined-instrumentation",
+ cl::desc("Check types twice with both inlined instrumentation and "
+ "function calls. This verifies that they behave the same."),
+ cl::Hidden, cl::init(false));
+
STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses");
namespace {
@@ -105,12 +117,16 @@ private:
Regex AnonNameRegex;
Type *IntptrTy;
uint64_t PtrShift;
- IntegerType *OrdTy;
+ IntegerType *OrdTy, *U64Ty;
/// Callbacks to run-time library are computed in initializeCallbacks.
FunctionCallee TysanCheck;
FunctionCallee TysanCtorFunction;
+ FunctionCallee TysanIntrumentMemInst;
+ FunctionCallee TysanInstrumentWithShadowUpdate;
+ FunctionCallee TysanSetShadowType;
+
/// Callback to set types for gloabls.
Function *TysanGlobalsSetTypeFunction;
};
@@ -130,6 +146,8 @@ TypeSanitizer::TypeSanitizer(Module &M)
void TypeSanitizer::initializeCallbacks(Module &M) {
IRBuilder<> IRB(M.getContext());
OrdTy = IRB.getInt32Ty();
+ U64Ty = IRB.getInt64Ty();
+ Type *BoolType = IRB.getInt1Ty();
AttributeList Attr;
Attr = Attr.addFnAttribute(M.getContext(), Attribute::NoUnwind);
@@ -144,6 +162,30 @@ void TypeSanitizer::initializeCallbacks(Module &M) {
TysanCtorFunction =
M.getOrInsertFunction(kTysanModuleCtorName, Attr, IRB.getVoidTy());
+
+ TysanIntrumentMemInst = M.getOrInsertFunction(
+ "__tysan_instrument_mem_inst", Attr, IRB.getVoidTy(),
+ IRB.getPtrTy(), // Pointer of data to be written to
+ IRB.getPtrTy(), // Pointer of data to write
+ U64Ty, // Size of the data in bytes
+ BoolType // Do we need to call memmove
+ );
+
+ TysanInstrumentWithShadowUpdate = M.getOrInsertFunction(
+ "__tysan_instrument_with_shadow_update", Attr, IRB.getVoidTy(),
+ IRB.getPtrTy(), // Pointer to data to be read
+ IRB.getPtrTy(), // Pointer to type descriptor
+ BoolType, // Do we need to type check this
+ U64Ty, // Size of data we access in bytes
+ OrdTy // Flags
+ );
+
+ TysanSetShadowType = M.getOrInsertFunction(
+ "__tysan_set_shadow_type", Attr, IRB.getVoidTy(),
+ IRB.getPtrTy(), // Pointer of data to be written to
+ IRB.getPtrTy(), // Pointer to the new type descriptor
+ U64Ty // Size of data we access in bytes
+ );
}
void TypeSanitizer::instrumentGlobals(Module &M) {
@@ -587,6 +629,29 @@ bool TypeSanitizer::instrumentWithShadowUpdate(
Value *TD = IRB.CreateBitCast(TDGV, IRB.getPtrTy());
+ if (ClOutlineInstrumentation) {
+ if (!ForceSetType && (!ClWritesAlwaysSetType || IsRead)) {
+ // We need to check the type here. If the type is unknown, then the read
+ // sets the type. If the type is known, then it is checked. If the type
+ // doesn't match, then we call the runtime type check (which may yet
+ // determine that the mismatch is okay).
+
+ Constant *Flags =
+ ConstantInt::get(OrdTy, (int)IsRead | (((int)IsWrite) << 1));
+
+ IRB.CreateCall(TysanInstrumentWithShadowUpdate,
+ {Ptr, TD,
+ SanitizeFunction ? IRB.getTrue() : IRB.getFalse(),
+ IRB.getInt64(AccessSize), Flags});
+ } else if (ForceSetType || IsWrite) {
+ // In the mode where writes always set the type, for a write (which does
+ // not also read), we just set the type.
+ IRB.CreateCall(TysanSetShadowType, {Ptr, TD, IRB.getInt64(AccessSize)});
+ }
+
+ return true;
+ }
+
Value *ShadowDataInt = convertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift,
ShadowBase, AppMemMask);
Type *Int8PtrPtrTy = PointerType::get(IRB.getContext(), 0);
@@ -838,37 +903,47 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase,
}
}
- if (!ShadowBase)
- ShadowBase = getShadowBase(*F);
- if (!AppMemMask)
- AppMemMask = getAppMemMask(*F);
+ if (ClOutlineInstrumentation) {
+ if (!Src)
+ Src = ConstantPointerNull::get(IRB.getPtrTy());
- Value *ShadowDataInt = IRB.CreateAdd(
- IRB.CreateShl(
- IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask),
- PtrShift),
- ShadowBase);
- Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy());
-
- if (!Src) {
- IRB.CreateMemSet(ShadowData, IRB.getInt8(0), IRB.CreateShl(Size, PtrShift),
- Align(1ull << PtrShift));
+ IRB.CreateCall(
+ TysanIntrumentMemInst,
+ {Dest, Src, Size, NeedsMemMove ? IRB.getTrue() : IRB.getFalse()});
return true;
- }
-
- Value *SrcShadowDataInt = IRB.CreateAdd(
- IRB.CreateShl(
- IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask),
- PtrShift),
- ShadowBase);
- Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy());
-
- if (NeedsMemMove) {
- IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData,
- Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
} else {
- IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData,
- Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
+ if (!ShadowBase)
+ ShadowBase = getShadowBase(*F);
+ if (!AppMemMask)
+ AppMemMask = getAppMemMask(*F);
+
+ Value *ShadowDataInt = IRB.CreateAdd(
+ IRB.CreateShl(
+ IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask),
+ PtrShift),
+ ShadowBase);
+ Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy());
+
+ if (!Src) {
+ IRB.CreateMemSet(ShadowData, IRB.getInt8(0),
+ IRB.CreateShl(Size, PtrShift), Align(1ull << PtrShift));
+ return true;
+ }
+
+ Value *SrcShadowDataInt = IRB.CreateAdd(
+ IRB.CreateShl(
+ IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask),
+ PtrShift),
+ ShadowBase);
+ Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy());
+
+ if (NeedsMemMove) {
+ IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData,
+ Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
+ } else {
+ IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData,
+ Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
+ }
}
return true;
@@ -890,6 +965,16 @@ PreservedAnalyses TypeSanitizerPass::run(Module &M,
for (Function &F : M) {
const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
TySan.sanitizeFunction(F, TLI);
+ if (ClVerifyOutlinedInstrumentation && ClOutlineInstrumentation) {
+ // Outlined instrumentation is a new option, and so this exists to
+ // verify there is no difference in behaviour between the options.
+ // If the outlined instrumentation triggers a verification failure
+ // when the original inlined instrumentation does not, or vice versa,
+ // then there is a discrepency which should be investigated.
+ ClOutlineInstrumentation = false;
+ TySan.sanitizeFunction(F, TLI);
+ ClOutlineInstrumentation = true;
+ }
}
return PreservedAnalyses::none();
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 3487e81..7e70ba2 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -245,11 +245,14 @@ raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI) {
} // namespace
-static bool isUniformShape(Value *V) {
+static bool isShapePreserving(Value *V) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I)
return true;
+ if (isa<SelectInst>(I))
+ return true;
+
if (I->isBinaryOp())
return true;
@@ -300,6 +303,16 @@ static bool isUniformShape(Value *V) {
}
}
+/// Return an iterator over the operands of \p I that should share shape
+/// information with \p I.
+static iterator_range<Use *> getShapedOperandsForInst(Instruction *I) {
+ assert(isShapePreserving(I) &&
+ "Can't retrieve shaped operands for an instruction that does not "
+ "preserve shape information");
+ auto Ops = I->operands();
+ return isa<SelectInst>(I) ? drop_begin(Ops) : Ops;
+}
+
/// Return the ShapeInfo for the result of \p I, it it can be determined.
static std::optional<ShapeInfo>
computeShapeInfoForInst(Instruction *I,
@@ -329,9 +342,8 @@ computeShapeInfoForInst(Instruction *I,
return OpShape->second;
}
- if (isUniformShape(I) || isa<SelectInst>(I)) {
- auto Ops = I->operands();
- auto ShapedOps = isa<SelectInst>(I) ? drop_begin(Ops) : Ops;
+ if (isShapePreserving(I)) {
+ auto ShapedOps = getShapedOperandsForInst(I);
// Find the first operand that has a known shape and use that.
for (auto &Op : ShapedOps) {
auto OpShape = ShapeMap.find(Op.get());
@@ -710,10 +722,9 @@ public:
case Intrinsic::matrix_column_major_store:
return true;
default:
- return isUniformShape(II);
+ break;
}
- return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V) ||
- isa<SelectInst>(V);
+ return isShapePreserving(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
}
/// Propagate the shape information of instructions to their users.
@@ -800,9 +811,8 @@ public:
} else if (isa<StoreInst>(V)) {
// Nothing to do. We forward-propagated to this so we would just
// backward propagate to an instruction with an already known shape.
- } else if (isUniformShape(V) || isa<SelectInst>(V)) {
- auto Ops = cast<Instruction>(V)->operands();
- auto ShapedOps = isa<SelectInst>(V) ? drop_begin(Ops) : Ops;
+ } else if (isShapePreserving(V)) {
+ auto ShapedOps = getShapedOperandsForInst(cast<Instruction>(V));
// Propagate to all operands.
ShapeInfo Shape = ShapeMap[V];
for (Use &U : ShapedOps) {
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 4fe736a..94dfd3a 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -499,9 +499,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L);
const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L);
- unsigned EstimatedLoopInvocationWeight = 0;
std::optional<unsigned> OriginalTripCount =
- llvm::getLoopEstimatedTripCount(L, &EstimatedLoopInvocationWeight);
+ llvm::getLoopEstimatedTripCount(L);
+ BranchProbability OriginalLoopProb = llvm::getLoopProbability(L);
// Effectively "DCE" unrolled iterations that are beyond the max tripcount
// and will never be executed.
@@ -592,11 +592,11 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
: isEpilogProfitable(L);
if (ULO.Runtime &&
- !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
- EpilogProfitability, ULO.UnrollRemainder,
- ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
- PreserveLCSSA, ULO.SCEVExpansionBudget,
- ULO.RuntimeUnrollMultiExit, RemainderLoop)) {
+ !UnrollRuntimeLoopRemainder(
+ L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability,
+ ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
+ PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit,
+ RemainderLoop, OriginalTripCount, OriginalLoopProb)) {
if (ULO.Force)
ULO.Runtime = false;
else {
@@ -1130,11 +1130,46 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
LI->erase(L);
// We shouldn't try to use `L` anymore.
L = nullptr;
- } else if (OriginalTripCount) {
- // Update the trip count. Note that the remainder has already logic
- // computing it in `UnrollRuntimeLoopRemainder`.
- setLoopEstimatedTripCount(L, *OriginalTripCount / ULO.Count,
- EstimatedLoopInvocationWeight);
+ } else {
+ // Update metadata for the loop's branch weights and estimated trip count:
+ // - If ULO.Runtime, UnrollRuntimeLoopRemainder sets the guard branch
+ // weights, latch branch weights, and estimated trip count of the
+ // remainder loop it creates. It also sets the branch weights for the
+ // unrolled loop guard it creates. The branch weights for the unrolled
+ // loop latch are adjusted below. FIXME: Handle prologue loops.
+ // - Otherwise, if unrolled loop iteration latches become unconditional,
+ // branch weights are adjusted above. FIXME: Actually handle such
+ // unconditional latches.
+ // - Otherwise, the original loop's branch weights are correct for the
+ // unrolled loop, so do not adjust them.
+ // - In all cases, the unrolled loop's estimated trip count is set below.
+ //
+ // As an example of the last case, consider what happens if the unroll count
+ // is 4 for a loop with an estimated trip count of 10 when we do not create
+ // a remainder loop and all iterations' latches remain conditional. Each
+ // unrolled iteration's latch still has the same probability of exiting the
+ // loop as it did when in the original loop, and thus it should still have
+ // the same branch weights. Each unrolled iteration's non-zero probability
+ // of exiting already appropriately reduces the probability of reaching the
+ // remaining iterations just as it did in the original loop. Trying to also
+ // adjust the branch weights of the final unrolled iteration's latch (i.e.,
+ // the backedge for the unrolled loop as a whole) to reflect its new trip
+ // count of 3 will erroneously further reduce its block frequencies.
+ // However, in case an analysis later needs to estimate the trip count of
+ // the unrolled loop as a whole without considering the branch weights for
+ // each unrolled iteration's latch within it, we store the new trip count as
+ // separate metadata.
+ if (!OriginalLoopProb.isUnknown() && ULO.Runtime && EpilogProfitability) {
+ // Where p is always the probability of executing at least 1 more
+ // iteration, the probability for at least n more iterations is p^n.
+ setLoopProbability(L, OriginalLoopProb.pow(ULO.Count));
+ }
+ if (OriginalTripCount) {
+ unsigned NewTripCount = *OriginalTripCount / ULO.Count;
+ if (!ULO.Runtime && *OriginalTripCount % ULO.Count)
+ ++NewTripCount;
+ setLoopEstimatedTripCount(L, NewTripCount);
+ }
}
// LoopInfo should not be valid, confirm that.
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 7a2b8da..1e8f6cc 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -40,6 +40,7 @@
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <cmath>
using namespace llvm;
@@ -195,6 +196,21 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
}
}
+/// Assume, due to our position in the remainder loop or its guard, anywhere
+/// from 0 to \p N more iterations can possibly execute. Among such cases in
+/// the original loop (with loop probability \p OriginalLoopProb), what is the
+/// probability of executing at least one more iteration?
+static BranchProbability
+probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) {
+ // Each of these variables holds the original loop's probability that the
+ // number of iterations it will execute is some m in the specified range.
+ BranchProbability ProbOne = OriginalLoopProb; // 1 <= m
+ BranchProbability ProbTooMany = ProbOne.pow(N + 1); // N + 1 <= m
+ BranchProbability ProbNotTooMany = ProbTooMany.getCompl(); // 0 <= m <= N
+ BranchProbability ProbOneNotTooMany = ProbOne - ProbTooMany; // 1 <= m <= N
+ return ProbOneNotTooMany / ProbNotTooMany;
+}
+
/// Connect the unrolling epilog code to the original loop.
/// The unrolling epilog code contains code to execute the
/// 'extra' iterations if the run-time trip count modulo the
@@ -221,7 +237,8 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
ValueToValueMapTy &VMap, DominatorTree *DT,
LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE,
- unsigned Count, AssumptionCache &AC) {
+ unsigned Count, AssumptionCache &AC,
+ BranchProbability OriginalLoopProb) {
BasicBlock *Latch = L->getLoopLatch();
assert(Latch && "Loop must have a latch");
BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
@@ -332,12 +349,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
PreserveLCSSA);
// Add the branch to the exit block (around the epilog loop)
MDNode *BranchWeights = nullptr;
- if (hasBranchWeightMD(*Latch->getTerminator())) {
+ if (OriginalLoopProb.isUnknown() &&
+ hasBranchWeightMD(*Latch->getTerminator())) {
// Assume equal distribution in interval [0, Count).
MDBuilder MDB(B.getContext());
BranchWeights = MDB.createBranchWeights(1, Count - 1);
}
- B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights);
+ BranchInst *RemainderLoopGuard =
+ B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights);
+ if (!OriginalLoopProb.isUnknown()) {
+ setBranchProbability(RemainderLoopGuard,
+ probOfNextInRemainder(OriginalLoopProb, Count - 1),
+ /*ForFirstTarget=*/true);
+ }
InsertPt->eraseFromParent();
if (DT) {
auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit);
@@ -357,14 +381,15 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
/// The cloned blocks should be inserted between InsertTop and InsertBot.
/// InsertTop should be new preheader, InsertBot new loop exit.
/// Returns the new cloned loop that is created.
-static Loop *
-CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
- const bool UnrollRemainder,
- BasicBlock *InsertTop,
- BasicBlock *InsertBot, BasicBlock *Preheader,
+static Loop *CloneLoopBlocks(Loop *L, Value *NewIter,
+ const bool UseEpilogRemainder,
+ const bool UnrollRemainder, BasicBlock *InsertTop,
+ BasicBlock *InsertBot, BasicBlock *Preheader,
std::vector<BasicBlock *> &NewBlocks,
LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
- DominatorTree *DT, LoopInfo *LI, unsigned Count) {
+ DominatorTree *DT, LoopInfo *LI, unsigned Count,
+ std::optional<unsigned> OriginalTripCount,
+ BranchProbability OriginalLoopProb) {
StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
@@ -419,7 +444,8 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next");
Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp");
MDNode *BranchWeights = nullptr;
- if (hasBranchWeightMD(*LatchBR)) {
+ if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) &&
+ hasBranchWeightMD(*LatchBR)) {
uint32_t ExitWeight;
uint32_t BackEdgeWeight;
if (Count >= 3) {
@@ -437,7 +463,29 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
MDBuilder MDB(Builder.getContext());
BranchWeights = MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
}
- Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights);
+ BranchInst *RemainderLoopLatch =
+ Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights);
+ if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
+ // Compute the total frequency of the original loop body from the
+ // remainder iterations. Once we've reached them, the first of them
+ // always executes, so its frequency and probability are 1.
+ double FreqRemIters = 1;
+ if (Count > 2) {
+ BranchProbability ProbReaching = BranchProbability::getOne();
+ for (unsigned N = Count - 2; N >= 1; --N) {
+ ProbReaching *= probOfNextInRemainder(OriginalLoopProb, N);
+ FreqRemIters += double(ProbReaching.getNumerator()) /
+ ProbReaching.getDenominator();
+ }
+ }
+ // Solve for the loop probability that would produce that frequency.
+ // Sum(i=0..inf)(Prob^i) = 1/(1-Prob) = FreqRemIters.
+ double ProbDouble = 1 - 1 / FreqRemIters;
+ BranchProbability Prob = BranchProbability::getBranchProbability(
+ std::round(ProbDouble * BranchProbability::getDenominator()),
+ BranchProbability::getDenominator());
+ setBranchProbability(RemainderLoopLatch, Prob, /*ForFirstTarget=*/true);
+ }
NewIdx->addIncoming(Zero, InsertTop);
NewIdx->addIncoming(IdxNext, NewBB);
LatchBR->eraseFromParent();
@@ -461,6 +509,9 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
Loop *NewLoop = NewLoops[L];
assert(NewLoop && "L should have been cloned");
+ if (OriginalTripCount && UseEpilogRemainder)
+ setLoopEstimatedTripCount(NewLoop, *OriginalTripCount % Count);
+
// Add unroll disable metadata to disable future unrolling for this loop.
if (!UnrollRemainder)
NewLoop->setLoopAlreadyUnrolled();
@@ -588,7 +639,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
const TargetTransformInfo *TTI, bool PreserveLCSSA,
unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit,
- Loop **ResultLoop) {
+ Loop **ResultLoop, std::optional<unsigned> OriginalTripCount,
+ BranchProbability OriginalLoopProb) {
LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
LLVM_DEBUG(L->dump());
LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
@@ -808,12 +860,23 @@ bool llvm::UnrollRuntimeLoopRemainder(
BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
// Branch to either remainder (extra iterations) loop or unrolling loop.
MDNode *BranchWeights = nullptr;
- if (hasBranchWeightMD(*Latch->getTerminator())) {
+ if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) &&
+ hasBranchWeightMD(*Latch->getTerminator())) {
// Assume loop is nearly always entered.
MDBuilder MDB(B.getContext());
BranchWeights = MDB.createBranchWeights(EpilogHeaderWeights);
}
- B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights);
+ BranchInst *UnrollingLoopGuard =
+ B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights);
+ if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
+ // The original loop's first iteration always happens. Compute the
+ // probability of the original loop executing Count-1 iterations after that
+ // to complete the first iteration of the unrolled loop.
+ BranchProbability ProbOne = OriginalLoopProb;
+ BranchProbability ProbRest = ProbOne.pow(Count - 1);
+ setBranchProbability(UnrollingLoopGuard, ProbRest,
+ /*ForFirstTarget=*/false);
+ }
PreHeaderBR->eraseFromParent();
if (DT) {
if (UseEpilogRemainder)
@@ -840,9 +903,10 @@ bool llvm::UnrollRuntimeLoopRemainder(
// iterations. This function adds the appropriate CFG connections.
BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
- Loop *remainderLoop = CloneLoopBlocks(
- L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot,
- NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count);
+ Loop *remainderLoop =
+ CloneLoopBlocks(L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop,
+ InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT,
+ LI, Count, OriginalTripCount, OriginalLoopProb);
// Insert the cloned blocks into the function.
F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end());
@@ -941,7 +1005,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
// Connect the epilog code to the original loop and update the
// PHI functions.
ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader,
- NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC);
+ NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC,
+ OriginalLoopProb);
// Update counter in loop for unrolling.
// Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index b6ba822..8be471b 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -962,13 +962,51 @@ bool llvm::setLoopEstimatedTripCount(
if (LatchBranch->getSuccessor(0) != L->getHeader())
std::swap(BackedgeTakenWeight, LatchExitWeight);
- MDBuilder MDB(LatchBranch->getContext());
-
// Set/Update profile metadata.
- LatchBranch->setMetadata(
- LLVMContext::MD_prof,
- MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
+ setBranchWeights(*LatchBranch, {BackedgeTakenWeight, LatchExitWeight},
+ /*IsExpected=*/false);
+
+ return true;
+}
+
+BranchProbability llvm::getLoopProbability(Loop *L) {
+ BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ if (!LatchBranch)
+ return BranchProbability::getUnknown();
+ bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader();
+ return getBranchProbability(LatchBranch, FirstTargetIsLoop);
+}
+bool llvm::setLoopProbability(Loop *L, BranchProbability P) {
+ BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ if (!LatchBranch)
+ return false;
+ bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader();
+ return setBranchProbability(LatchBranch, P, FirstTargetIsLoop);
+}
+
+BranchProbability llvm::getBranchProbability(BranchInst *B,
+ bool ForFirstTarget) {
+ if (B->getNumSuccessors() != 2)
+ return BranchProbability::getUnknown();
+ uint64_t Weight0, Weight1;
+ if (!extractBranchWeights(*B, Weight0, Weight1))
+ return BranchProbability::getUnknown();
+ if (!ForFirstTarget)
+ std::swap(Weight0, Weight1);
+ return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1);
+}
+
+bool llvm::setBranchProbability(BranchInst *B, BranchProbability P,
+ bool ForFirstTarget) {
+ if (B->getNumSuccessors() != 2)
+ return false;
+ BranchProbability Prob0 = P;
+ BranchProbability Prob1 = P.getCompl();
+ if (!ForFirstTarget)
+ std::swap(Prob0, Prob1);
+ setBranchWeights(*B, {Prob0.getNumerator(), Prob1.getNumerator()},
+ /*IsExpected=*/false);
return true;
}
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 7f6d779..cbc604e 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -80,6 +80,7 @@
#include <algorithm>
#include <cassert>
#include <climits>
+#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iterator>
@@ -5955,7 +5956,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
}
// Update weight for the newly-created conditional branch.
- if (hasBranchWeightMD(*SI)) {
+ if (hasBranchWeightMD(*SI) && NewBI->isConditional()) {
SmallVector<uint64_t, 8> Weights;
getBranchWeights(SI, Weights);
if (Weights.size() == 1 + SI->getNumCases()) {
@@ -7632,7 +7633,33 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
auto *DefaultCaseBB = SI->getDefaultDest();
BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU);
auto It = OrigBB->getTerminator()->getIterator();
+ SmallVector<uint32_t> Weights;
+ auto HasWeights =
+ !ProfcheckDisableMetadataFixes && extractBranchWeights(*SI, Weights);
auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+ if (HasWeights && any_of(Weights, [](const auto &V) { return V != 0; })) {
+ // IsPow2 covers a subset of the cases in which we'd go to the default
+ // label. The other is those powers of 2 that don't appear in the case
+ // statement. We don't know the distribution of the values coming in, so
+ // the safest is to split 50-50 the original probability to `default`.
+ uint64_t OrigDenominator = sum_of(map_range(
+ Weights, [](const auto &V) { return static_cast<uint64_t>(V); }));
+ SmallVector<uint64_t> NewWeights(2);
+ NewWeights[1] = Weights[0] / 2;
+ NewWeights[0] = OrigDenominator - NewWeights[1];
+ setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false);
+
+ // For the original switch, we reduce the weight of the default by the
+ // amount by which the previous branch contributes to getting to default,
+ // and then make sure the remaining weights have the same relative ratio
+ // wrt eachother.
+ uint64_t CasesDenominator = OrigDenominator - Weights[0];
+ Weights[0] /= 2;
+ for (auto &W : drop_begin(Weights))
+ W = NewWeights[0] * static_cast<double>(W) / CasesDenominator;
+
+ setBranchWeights(*SI, Weights, /*IsExpected=*/false);
+ }
// BI is handling the default case for SI, and so should share its DebugLoc.
BI->setDebugLoc(SI->getDebugLoc());
It->eraseFromParent();
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 3fed003..04b0562 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -167,7 +167,7 @@ public:
DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "") {
return tryInsertInstruction(
- new VPInstruction(Opcode, Operands, Flags, DL, Name));
+ new VPInstruction(Opcode, Operands, Flags, {}, DL, Name));
}
VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
@@ -184,7 +184,7 @@ public:
DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "") {
return tryInsertInstruction(
- new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
+ new VPInstruction(Opcode, Operands, WrapFlags, {}, DL, Name));
}
VPInstruction *createNot(VPValue *Operand,
@@ -205,7 +205,7 @@ public:
return tryInsertInstruction(new VPInstruction(
Instruction::BinaryOps::Or, {LHS, RHS},
- VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name));
+ VPRecipeWithIRFlags::DisjointFlagsTy(false), {}, DL, Name));
}
VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS,
@@ -221,7 +221,7 @@ public:
std::optional<FastMathFlags> FMFs = std::nullopt) {
auto *Select =
FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
- *FMFs, DL, Name)
+ *FMFs, {}, DL, Name)
: new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
DL, Name);
return tryInsertInstruction(Select);
@@ -235,7 +235,7 @@ public:
assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
return tryInsertInstruction(
- new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name));
+ new VPInstruction(Instruction::ICmp, {A, B}, Pred, {}, DL, Name));
}
/// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A
@@ -246,7 +246,7 @@ public:
assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE &&
Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate");
return tryInsertInstruction(
- new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name));
+ new VPInstruction(Instruction::FCmp, {A, B}, Pred, {}, DL, Name));
}
VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset,
@@ -254,7 +254,7 @@ public:
const Twine &Name = "") {
return tryInsertInstruction(
new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
- GEPNoWrapFlags::none(), DL, Name));
+ GEPNoWrapFlags::none(), {}, DL, Name));
}
VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset,
@@ -262,7 +262,7 @@ public:
DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "") {
return tryInsertInstruction(new VPInstruction(
- VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, DL, Name));
+ VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, {}, DL, Name));
}
VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset,
@@ -270,7 +270,7 @@ public:
const Twine &Name = "") {
return tryInsertInstruction(
new VPInstruction(VPInstruction::WidePtrAdd, {Ptr, Offset},
- GEPNoWrapFlags::none(), DL, Name));
+ GEPNoWrapFlags::none(), {}, DL, Name));
}
VPPhi *createScalarPhi(ArrayRef<VPValue *> IncomingValues, DebugLoc DL,
@@ -280,8 +280,7 @@ public:
VPValue *createElementCount(Type *Ty, ElementCount EC) {
VPlan &Plan = *getInsertBlock()->getPlan();
- VPValue *RuntimeEC =
- Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue()));
+ VPValue *RuntimeEC = Plan.getConstantInt(Ty, EC.getKnownMinValue());
if (EC.isScalable()) {
VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty);
RuntimeEC = EC.getKnownMinValue() == 1
@@ -304,9 +303,11 @@ public:
}
VPInstruction *createScalarCast(Instruction::CastOps Opcode, VPValue *Op,
- Type *ResultTy, DebugLoc DL) {
+ Type *ResultTy, DebugLoc DL,
+ const VPIRFlags &Flags = {},
+ const VPIRMetadata &Metadata = {}) {
return tryInsertInstruction(
- new VPInstructionWithType(Opcode, Op, ResultTy, {}, DL));
+ new VPInstructionWithType(Opcode, Op, ResultTy, DL, Flags, Metadata));
}
VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 25bf49d..e5c3f17 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7752,8 +7752,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands);
VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
- VPValue *One =
- Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
+ VPValue *One = Plan.getConstantInt(I->getType(), 1u);
auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
Ops[1] = SafeRHS;
return new VPWidenRecipe(*I, Ops);
@@ -7806,11 +7805,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
}
case Instruction::ExtractValue: {
SmallVector<VPValue *> NewOps(Operands);
- Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
auto *EVI = cast<ExtractValueInst>(I);
assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
unsigned Idx = EVI->getIndices()[0];
- NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
+ NewOps.push_back(Plan.getConstantInt(32, Idx));
return new VPWidenRecipe(*I, NewOps);
}
};
@@ -8179,8 +8177,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
"Expected an ADD or SUB operation for predicated partial "
"reductions (because the neutral element in the mask is zero)!");
Cond = getBlockInMask(Builder.getInsertBlock());
- VPValue *Zero =
- Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
+ VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0);
BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
}
return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
@@ -8643,7 +8640,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
} else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs &&
CurrentLinkI->getOpcode() == Instruction::Sub) {
Type *PhiTy = PhiR->getUnderlyingValue()->getType();
- auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0));
+ auto *Zero = Plan->getConstantInt(PhiTy, 0);
VPWidenRecipe *Sub = new VPWidenRecipe(
Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {},
VPIRMetadata(), CurrentLinkI->getDebugLoc());
@@ -8857,8 +8854,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
ToDelete.push_back(Select);
// Convert the reduction phi to operate on bools.
- PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
- OrigLoop->getHeader()->getContext())));
+ PhiR->setOperand(0, Plan->getFalse());
continue;
}
@@ -8880,9 +8876,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
unsigned ScaleFactor =
RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
.value_or(1);
- Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
- auto *ScaleFactorVPV =
- Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
+ auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor);
VPValue *StartV = PHBuilder.createNaryOp(
VPInstruction::ReductionStartVector,
{PhiR->getStartValue(), Iden, ScaleFactorVPV},
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1b55a3b..34b405c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22134,6 +22134,27 @@ bool BoUpSLP::collectValuesToDemote(
{VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
+ if (E.isAltShuffle()) {
+ // Combining these opcodes may lead to incorrect analysis, skip for now.
+ auto IsDangerousOpcode = [](unsigned Opcode) {
+ switch (Opcode) {
+ case Instruction::Shl:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ return true;
+ default:
+ break;
+ }
+ return false;
+ };
+ if (IsDangerousOpcode(E.getAltOpcode()))
+ return FinalAnalysis();
+ }
+
switch (E.getOpcode()) {
// We can always demote truncations and extensions. Since truncations can
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1f10058..9081ad7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1107,14 +1107,15 @@ public:
VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {}
VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
- const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(),
- const Twine &Name = "");
+ const VPIRFlags &Flags, const VPIRMetadata &MD = {},
+ DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "");
VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
VPInstruction *clone() override {
SmallVector<VPValue *, 2> Operands(operands());
- auto *New = new VPInstruction(Opcode, Operands, *this, getDebugLoc(), Name);
+ auto *New =
+ new VPInstruction(Opcode, Operands, *this, *this, getDebugLoc(), Name);
if (getUnderlyingValue())
New->setUnderlyingValue(getUnderlyingInstr());
return New;
@@ -1196,7 +1197,14 @@ public:
VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands,
Type *ResultTy, const VPIRFlags &Flags, DebugLoc DL,
const Twine &Name = "")
- : VPInstruction(Opcode, Operands, Flags, DL, Name), ResultTy(ResultTy) {}
+ : VPInstruction(Opcode, Operands, Flags, {}, DL, Name),
+ ResultTy(ResultTy) {}
+
+ VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags,
+ const VPIRMetadata &Metadata, const Twine &Name = "")
+ : VPInstruction(Opcode, Operands, Flags, Metadata, DL, Name),
+ ResultTy(ResultTy) {}
static inline bool classof(const VPRecipeBase *R) {
// VPInstructionWithType are VPInstructions with specific opcodes requiring
@@ -4109,6 +4117,12 @@ public:
const VPCanonicalIVPHIRecipe *getCanonicalIV() const {
return const_cast<VPRegionBlock *>(this)->getCanonicalIV();
}
+
+ /// Return the type of the canonical IV for loop regions.
+ Type *getCanonicalIVType() { return getCanonicalIV()->getScalarType(); }
+ const Type *getCanonicalIVType() const {
+ return getCanonicalIV()->getScalarType();
+ }
};
inline VPRegionBlock *VPRecipeBase::getRegion() {
@@ -4387,15 +4401,25 @@ public:
}
/// Return a VPValue wrapping i1 true.
- VPValue *getTrue() {
- LLVMContext &Ctx = getContext();
- return getOrAddLiveIn(ConstantInt::getTrue(Ctx));
- }
+ VPValue *getTrue() { return getConstantInt(1, 1); }
/// Return a VPValue wrapping i1 false.
- VPValue *getFalse() {
- LLVMContext &Ctx = getContext();
- return getOrAddLiveIn(ConstantInt::getFalse(Ctx));
+ VPValue *getFalse() { return getConstantInt(1, 0); }
+
+ /// Return a VPValue wrapping a ConstantInt with the given type and value.
+ VPValue *getConstantInt(Type *Ty, uint64_t Val, bool IsSigned = false) {
+ return getOrAddLiveIn(ConstantInt::get(Ty, Val, IsSigned));
+ }
+
+ /// Return a VPValue wrapping a ConstantInt with the given bitwidth and value.
+ VPValue *getConstantInt(unsigned BitWidth, uint64_t Val,
+ bool IsSigned = false) {
+ return getConstantInt(APInt(BitWidth, Val, IsSigned));
+ }
+
+ /// Return a VPValue wrapping a ConstantInt with the given APInt value.
+ VPValue *getConstantInt(const APInt &Val) {
+ return getOrAddLiveIn(ConstantInt::get(getContext(), Val));
}
/// Return the live-in VPValue for \p V, if there is one or nullptr otherwise.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 65688a3..1a66d20 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -612,8 +612,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan,
if (!RequiresScalarEpilogueCheck)
Cmp = Plan.getFalse();
else if (TailFolded)
- Cmp = Plan.getOrAddLiveIn(
- ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
+ Cmp = Plan.getTrue();
else
Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
&Plan.getVectorTripCount(), LatchDL, "cmp.n");
@@ -712,8 +711,8 @@ void VPlanTransforms::addMinimumIterationCheck(
// additional overflow check is required before entering the vector loop.
// Get the maximum unsigned value for the type.
- VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get(
- TripCountTy, cast<IntegerType>(TripCountTy)->getMask()));
+ VPValue *MaxUIntTripCount =
+ Plan.getConstantInt(cast<IntegerType>(TripCountTy)->getMask());
VPValue *DistanceToMax = Builder.createNaryOp(
Instruction::Sub, {MaxUIntTripCount, TripCountVPV},
DebugLoc::getUnknown());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bde62dd..1a02117 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -490,10 +490,10 @@ template class VPUnrollPartAccessor<3>;
}
VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
- const VPIRFlags &Flags, DebugLoc DL,
- const Twine &Name)
+ const VPIRFlags &Flags, const VPIRMetadata &MD,
+ DebugLoc DL, const Twine &Name)
: VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
- VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
+ VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
assert(flagsValidForOpcode(getOpcode()) &&
"Set flags not supported for the provided opcode");
assert((getNumOperandsForOpcode(Opcode) == -1u ||
@@ -2372,9 +2372,8 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
return false;
auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *CanIV = getRegion()->getCanonicalIV();
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
- getScalarType() == CanIV->getScalarType();
+ getScalarType() == getRegion()->getCanonicalIVType();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 986c801..f50bf29 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -699,8 +699,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
continue;
const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
- VPValue *StartV =
- Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
+ VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
VPValue *StepV = PtrIV->getOperand(1);
VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
@@ -820,7 +819,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
// Calculate the final index.
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
auto *CanonicalIV = LoopRegion->getCanonicalIV();
- Type *CanonicalIVType = CanonicalIV->getScalarType();
+ Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
VPBuilder B(cast<VPBasicBlock>(PredVPBB));
DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
@@ -836,7 +835,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
// changed it means the exit is using the incremented value, so we need to
// add the step.
if (Incoming != WideIV) {
- VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
+ VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
}
@@ -882,7 +881,7 @@ static VPValue *optimizeLatchExitInductionUser(
return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
if (ScalarTy->isPointerTy()) {
Type *StepTy = TypeInfo.inferScalarType(Step);
- auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
+ auto *Zero = Plan.getConstantInt(StepTy, 0);
return B.createPtrAdd(EndValue,
B.createNaryOp(Instruction::Sub, {Zero, Step}),
DebugLoc::getUnknown(), "ind.escape");
@@ -1282,6 +1281,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
+ if (match(Def, m_BuildVector()) && all_equal(R.operands())) {
+ Def->replaceAllUsesWith(
+ Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
+ return;
+ }
+
if (auto *Phi = dyn_cast<VPPhi>(Def)) {
if (Phi->getNumOperands() == 1)
Phi->replaceAllUsesWith(Phi->getOperand(0));
@@ -1574,9 +1579,9 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
continue;
// Update IV operands and comparison bound to use new narrower type.
- auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
+ auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
WideIV->setStartValue(NewStart);
- auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
+ auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
WideIV->setStepValue(NewStep);
auto *NewBTC = new VPWidenCastRecipe(
@@ -1695,8 +1700,7 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
// When using wide lane masks, the return type of the get.active.lane.mask
// intrinsic is VF x UF (last operand).
- VPValue *ALMMultiplier =
- Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+ VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
EntryALM->setOperand(2, ALMMultiplier);
LoopALM->setOperand(2, ALMMultiplier);
@@ -2402,8 +2406,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
"index.part.next");
// Create the active lane mask instruction in the VPlan preheader.
- VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
- ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1));
+ VPValue *ALMMultiplier =
+ Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
{EntryIncrement, TC, ALMMultiplier}, DL,
"active.lane.mask.entry");
@@ -2503,7 +2507,7 @@ void VPlanTransforms::addActiveLaneMask(
} else {
VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
- ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1));
+ ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
LaneMask =
B.createNaryOp(VPInstruction::ActiveLaneMask,
{WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
@@ -2775,7 +2779,7 @@ void VPlanTransforms::addExplicitVectorLength(
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
- auto *CanIVTy = CanonicalIVPHI->getScalarType();
+ auto *CanIVTy = LoopRegion->getCanonicalIVType();
VPValue *StartV = CanonicalIVPHI->getStartValue();
// Create the ExplicitVectorLengthPhi recipe in the main loop.
@@ -2790,8 +2794,7 @@ void VPlanTransforms::addExplicitVectorLength(
if (MaxSafeElements) {
// Support for MaxSafeDist for correct loop emission.
- VPValue *AVLSafe =
- Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
+ VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
"safe_avl");
@@ -2904,9 +2907,8 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
VPBuilder Builder(LatchExitingBr);
- VPValue *Cmp =
- Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
- Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy)));
+ VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
+ Plan.getConstantInt(AVLTy, 0));
Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
LatchExitingBr->eraseFromParent();
}
@@ -2930,8 +2932,7 @@ void VPlanTransforms::replaceSymbolicStrides(
// Only handle constant strides for now.
continue;
- auto *CI =
- Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
+ auto *CI = Plan.getConstantInt(*StrideConst);
if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
@@ -2946,7 +2947,7 @@ void VPlanTransforms::replaceSymbolicStrides(
unsigned BW = U->getType()->getScalarSizeInBits();
APInt C =
isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
- VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
+ VPValue *CI = Plan.getConstantInt(C);
StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
}
RewriteMap[StrideV] = PSE.getSCEV(StrideV);
@@ -3125,8 +3126,7 @@ void VPlanTransforms::createInterleaveGroups(
DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
IG->getIndex(IRInsertPos),
/*IsSigned=*/true);
- VPValue *OffsetVPV =
- Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
+ VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
VPBuilder B(InsertPos);
Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
}
@@ -3867,8 +3867,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
VPBuilder Builder(VectorPH, VectorPH->begin());
auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
auto *TCMO = Builder.createNaryOp(
- Instruction::Sub,
- {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
+ Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
BTC->replaceAllUsesWith(TCMO);
}
@@ -3993,9 +3992,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
if (TailByMasking) {
TC = Builder.createNaryOp(
Instruction::Add,
- {TC, Builder.createNaryOp(
- Instruction::Sub,
- {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
+ {TC, Builder.createNaryOp(Instruction::Sub,
+ {Step, Plan.getConstantInt(TCTy, 1)})},
DebugLoc::getCompilerGenerated(), "n.rnd.up");
}
@@ -4017,8 +4015,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
if (RequiresScalarEpilogue) {
assert(!TailByMasking &&
"requiring scalar epilogue is not supported with fail folding");
- VPValue *IsZero = Builder.createICmp(
- CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
+ VPValue *IsZero =
+ Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
R = Builder.createSelect(IsZero, Step, R);
}
@@ -4056,7 +4054,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
}
VF.replaceAllUsesWith(RuntimeVF);
- VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
+ VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
VFxUF.replaceAllUsesWith(MulByUF);
}
@@ -4336,17 +4334,17 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
VPBuilder PHBuilder(Plan.getVectorPreheader());
VPValue *UF = Plan.getOrAddLiveIn(
- ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
+ ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
if (VF.isScalable()) {
VPValue *VScale = PHBuilder.createElementCount(
- CanIV->getScalarType(), ElementCount::getScalable(1));
+ VectorLoop->getCanonicalIVType(), ElementCount::getScalable(1));
VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
Inc->setOperand(1, VScaleUF);
Plan.getVF().replaceAllUsesWith(VScale);
} else {
Inc->setOperand(1, UF);
Plan.getVF().replaceAllUsesWith(
- Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+ Plan.getConstantInt(CanIV->getScalarType(), 1));
}
removeDeadRecipes(Plan);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index cfd1a74..d6a0028 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -68,10 +68,9 @@ class UnrollState {
void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
VPBasicBlock::iterator InsertPtForPhi);
- VPValue *getConstantVPV(unsigned Part) {
- Type *CanIVIntTy =
- Plan.getVectorLoopRegion()->getCanonicalIV()->getScalarType();
- return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
+ VPValue *getConstantInt(unsigned Part) {
+ Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
+ return Plan.getConstantInt(CanIVIntTy, Part);
}
public:
@@ -138,7 +137,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
remapOperands(&PartIR, Part);
if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
- ScalarIVSteps->addOperand(getConstantVPV(Part));
+ ScalarIVSteps->addOperand(getConstantInt(Part));
}
addRecipeForPart(&Part0R, &PartIR, Part);
@@ -250,7 +249,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
for (unsigned Part = 1; Part != UF; ++Part)
VPV2Parts[VPI][Part - 1] = StartV;
}
- Copy->addOperand(getConstantVPV(Part));
+ Copy->addOperand(getConstantInt(Part));
} else {
assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
"unexpected header phi recipe not needing unrolled part");
@@ -319,7 +318,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) ||
match(Copy,
m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
- Copy->addOperand(getConstantVPV(Part));
+ Copy->addOperand(getConstantInt(Part));
if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R))
Copy->setOperand(0, R.getOperand(0));
@@ -475,8 +474,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
if (LaneDefs != Def2LaneDefs.end())
return LaneDefs->second[Lane.getKnownLane()];
- VPValue *Idx =
- Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+ VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
}
@@ -510,8 +508,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));
continue;
}
- VPValue *Idx =
- Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+ VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
NewOps.push_back(Ext);
}