Diffstat (limited to 'llvm/lib')
 llvm/lib/Analysis/TargetTransformInfo.cpp               |   9
 llvm/lib/Analysis/ValueTracking.cpp                     |  66
 llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp        |  70
 llvm/lib/CodeGen/CodeGen.cpp                            |   2
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp            |   4
 llvm/lib/CodeGen/MachineCopyPropagation.cpp             |  68
 llvm/lib/CodeGen/MachineUniformityAnalysis.cpp          |   4
 llvm/lib/CodeGen/ReachingDefAnalysis.cpp                |  19
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp           | 103
 llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp  |  59
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp        |  74
 llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp              |  11
 llvm/lib/Passes/PassBuilder.cpp                         |   1
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp         |  66
 llvm/lib/Target/AArch64/AArch64InstrInfo.td             |  14
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp  |  10
 llvm/lib/Target/AArch64/SMEInstrFormats.td              |  27
 llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp              |  82
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp           |  25
 llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp    |  59
 llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp  |   4
 llvm/lib/Target/AMDGPU/SIInstructions.td                |  27
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h           |   3
 llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp      |  54
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td                 |   1
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td                |  41
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h                  |  15
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp            |   3
 llvm/lib/Target/PowerPC/PPCRegisterInfo.h               |   5
 llvm/lib/Target/RISCV/RISCVCallingConv.td               |   2
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp             |  81
 llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp             |   5
 llvm/lib/Target/RISCV/RISCVRegisterInfo.h               |   2
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp      |  38
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h        |   6
 llvm/lib/Target/X86/X86ExpandPseudo.cpp                 |   4
 llvm/lib/Target/X86/X86FrameLowering.cpp                | 121
 llvm/lib/Target/X86/X86FrameLowering.h                  |  50
 llvm/lib/Target/X86/X86ISelLowering.cpp                 |  19
 llvm/lib/TargetParser/ARMTargetParser.cpp               |   9
 llvm/lib/Transforms/IPO/IROutliner.cpp                  |  12
 llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp |  28
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp         | 106
 43 files changed, 918 insertions(+), 491 deletions(-)
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 424bb7b..dc06609 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1153,6 +1153,15 @@ InstructionCost TargetTransformInfo::getGatherScatterOpCost(
return Cost;
}
+InstructionCost TargetTransformInfo::getExpandCompressMemoryOpCost(
+ unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
+ TTI::TargetCostKind CostKind, const Instruction *I) const {
+ InstructionCost Cost = TTIImpl->getExpandCompressMemoryOpCost(
+ Opcode, DataTy, VariableMask, Alignment, CostKind, I);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
InstructionCost TargetTransformInfo::getStridedMemoryOpCost(
unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
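
For illustration only (not part of the patch): a minimal sketch of how a cost-model client might query the new expand/compress hook above, assuming a TargetTransformInfo reference `TTI`, a vector type `DataTy`, and a budget `Threshold` are in scope. The expandload case uses Instruction::Load as the opcode; compressstore would use Instruction::Store.

    // Hypothetical client-side query of the new cost hook.
    InstructionCost Cost = TTI.getExpandCompressMemoryOpCost(
        Instruction::Load, DataTy, /*VariableMask=*/true, Align(16),
        TargetTransformInfo::TCK_RecipThroughput, /*I=*/nullptr);
    if (!Cost.isValid() || Cost > Threshold)
      return false; // assumed bail-out policy of the client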
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 6b61a35..55feb15 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1426,7 +1426,22 @@ static void computeKnownBitsFromOperator(const Operator *I,
computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
// Accumulate the constant indices in a separate variable
// to minimize the number of calls to computeForAddSub.
- APInt AccConstIndices(BitWidth, 0, /*IsSigned*/ true);
+ unsigned IndexWidth = Q.DL.getIndexTypeSizeInBits(I->getType());
+ APInt AccConstIndices(IndexWidth, 0);
+
+ auto AddIndexToKnown = [&](KnownBits IndexBits) {
+ if (IndexWidth == BitWidth) {
+ // Note that inbounds does *not* guarantee nsw for the addition, as only
+ // the offset is signed, while the base address is unsigned.
+ Known = KnownBits::add(Known, IndexBits);
+ } else {
+ // If the index width is smaller than the pointer width, only add the
+ // value to the low bits.
+ assert(IndexWidth < BitWidth &&
+ "Index width can't be larger than pointer width");
+ Known.insertBits(KnownBits::add(Known.trunc(IndexWidth), IndexBits), 0);
+ }
+ };
gep_type_iterator GTI = gep_type_begin(I);
for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
@@ -1464,43 +1479,34 @@ static void computeKnownBitsFromOperator(const Operator *I,
break;
}
- unsigned IndexBitWidth = Index->getType()->getScalarSizeInBits();
- KnownBits IndexBits(IndexBitWidth);
- computeKnownBits(Index, IndexBits, Depth + 1, Q);
- TypeSize IndexTypeSize = GTI.getSequentialElementStride(Q.DL);
- uint64_t TypeSizeInBytes = IndexTypeSize.getKnownMinValue();
- KnownBits ScalingFactor(IndexBitWidth);
+ TypeSize Stride = GTI.getSequentialElementStride(Q.DL);
+ uint64_t StrideInBytes = Stride.getKnownMinValue();
+ if (!Stride.isScalable()) {
+ // Fast path for constant offset.
+ if (auto *CI = dyn_cast<ConstantInt>(Index)) {
+ AccConstIndices +=
+ CI->getValue().sextOrTrunc(IndexWidth) * StrideInBytes;
+ continue;
+ }
+ }
+
+ KnownBits IndexBits =
+ computeKnownBits(Index, Depth + 1, Q).sextOrTrunc(IndexWidth);
+ KnownBits ScalingFactor(IndexWidth);
// Multiply by current sizeof type.
// &A[i] == A + i * sizeof(*A[i]).
- if (IndexTypeSize.isScalable()) {
+ if (Stride.isScalable()) {
// For scalable types the only thing we know about sizeof is
// that this is a multiple of the minimum size.
- ScalingFactor.Zero.setLowBits(llvm::countr_zero(TypeSizeInBytes));
- } else if (IndexBits.isConstant()) {
- APInt IndexConst = IndexBits.getConstant();
- APInt ScalingFactor(IndexBitWidth, TypeSizeInBytes);
- IndexConst *= ScalingFactor;
- AccConstIndices += IndexConst.sextOrTrunc(BitWidth);
- continue;
+ ScalingFactor.Zero.setLowBits(llvm::countr_zero(StrideInBytes));
} else {
ScalingFactor =
- KnownBits::makeConstant(APInt(IndexBitWidth, TypeSizeInBytes));
+ KnownBits::makeConstant(APInt(IndexWidth, StrideInBytes));
}
- IndexBits = KnownBits::mul(IndexBits, ScalingFactor);
-
- // If the offsets have a different width from the pointer, according
- // to the language reference we need to sign-extend or truncate them
- // to the width of the pointer.
- IndexBits = IndexBits.sextOrTrunc(BitWidth);
-
- // Note that inbounds does *not* guarantee nsw for the addition, as only
- // the offset is signed, while the base address is unsigned.
- Known = KnownBits::add(Known, IndexBits);
- }
- if (!Known.isUnknown() && !AccConstIndices.isZero()) {
- KnownBits Index = KnownBits::makeConstant(AccConstIndices);
- Known = KnownBits::add(Known, Index);
+ AddIndexToKnown(KnownBits::mul(IndexBits, ScalingFactor));
}
+ if (!Known.isUnknown() && !AccConstIndices.isZero())
+ AddIndexToKnown(KnownBits::makeConstant(AccConstIndices));
break;
}
case Instruction::PHI: {
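
A standalone sketch of what the new AddIndexToKnown lambda does when the GEP index type is narrower than the pointer, using only the KnownBits ADT from llvm/Support/KnownBits.h; the widths and values below are made up for illustration, mirroring the patch's assumption that index arithmetic only affects the low index-width bits.

    // Sketch: fold 32-bit index bits into the low 32 bits of a 64-bit pointer.
    unsigned BitWidth = 64, IndexWidth = 32;
    KnownBits Ptr(BitWidth);   // known bits of the base pointer (hypothetical)
    KnownBits Idx(IndexWidth); // known bits of the scaled GEP index
    if (IndexWidth == BitWidth) {
      Ptr = KnownBits::add(Ptr, Idx);
    } else {
      // Only the low IndexWidth bits participate in the addition; the high
      // pointer bits are left untouched.
      Ptr.insertBits(KnownBits::add(Ptr.trunc(IndexWidth), Idx), 0);
    }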
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 2f96366..6cf05fd 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -33,6 +33,7 @@
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -75,6 +76,26 @@ static dwarf::Tag GetCompileUnitType(UnitKind Kind, DwarfDebug *DW) {
return dwarf::DW_TAG_compile_unit;
}
+/// Translate an NVVM IR address space code to the corresponding DWARF value.
+static unsigned translateToNVVMDWARFAddrSpace(unsigned AddrSpace) {
+ switch (AddrSpace) {
+ case NVPTXAS::ADDRESS_SPACE_GENERIC:
+ return NVPTXAS::DWARF_ADDR_generic_space;
+ case NVPTXAS::ADDRESS_SPACE_GLOBAL:
+ return NVPTXAS::DWARF_ADDR_global_space;
+ case NVPTXAS::ADDRESS_SPACE_SHARED:
+ return NVPTXAS::DWARF_ADDR_shared_space;
+ case NVPTXAS::ADDRESS_SPACE_CONST:
+ return NVPTXAS::DWARF_ADDR_const_space;
+ case NVPTXAS::ADDRESS_SPACE_LOCAL:
+ return NVPTXAS::DWARF_ADDR_local_space;
+ default:
+ llvm_unreachable(
+ "Cannot translate unknown address space to DWARF address space");
+ return AddrSpace;
+ }
+}
+
DwarfCompileUnit::DwarfCompileUnit(unsigned UID, const DICompileUnit *Node,
AsmPrinter *A, DwarfDebug *DW,
DwarfFile *DWU, UnitKind Kind)
@@ -264,14 +285,11 @@ void DwarfCompileUnit::addLocationAttribute(
}
if (Expr) {
- // According to
- // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf
- // cuda-gdb requires DW_AT_address_class for all variables to be able to
- // correctly interpret address space of the variable address.
+ // cuda-gdb special requirement. See NVPTXAS::DWARF_AddressSpace
// Decode DW_OP_constu <DWARF Address Space> DW_OP_swap DW_OP_xderef
- // sequence for the NVPTX + gdb target.
- unsigned LocalNVPTXAddressSpace;
+ // sequence to specify the corresponding address space.
if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) {
+ unsigned LocalNVPTXAddressSpace;
const DIExpression *NewExpr =
DIExpression::extractAddressClass(Expr, LocalNVPTXAddressSpace);
if (NewExpr != Expr) {
@@ -363,6 +381,10 @@ void DwarfCompileUnit::addLocationAttribute(
DD->addArangeLabel(SymbolCU(this, Sym));
addOpAddress(*Loc, Sym);
}
+ if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB() &&
+ !NVPTXAddressSpace)
+ NVPTXAddressSpace =
+ translateToNVVMDWARFAddrSpace(Global->getType()->getAddressSpace());
}
// Global variables attached to symbols are memory locations.
// It would be better if this were unconditional, but malformed input that
@@ -373,13 +395,9 @@ void DwarfCompileUnit::addLocationAttribute(
DwarfExpr->addExpression(Expr);
}
if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) {
- // According to
- // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf
- // cuda-gdb requires DW_AT_address_class for all variables to be able to
- // correctly interpret address space of the variable address.
- const unsigned NVPTX_ADDR_global_space = 5;
+ // cuda-gdb special requirement. See NVPTXAS::DWARF_AddressSpace
addUInt(*VariableDIE, dwarf::DW_AT_address_class, dwarf::DW_FORM_data1,
- NVPTXAddressSpace.value_or(NVPTX_ADDR_global_space));
+ NVPTXAddressSpace.value_or(NVPTXAS::DWARF_ADDR_global_space));
}
if (Loc)
addBlock(*VariableDIE, dwarf::DW_AT_location, DwarfExpr->finalize());
@@ -793,10 +811,10 @@ void DwarfCompileUnit::applyConcreteDbgVariableAttributes(
const DbgValueLoc *DVal = &Single.getValueLoc();
if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB() &&
!Single.getExpr()) {
- // Lack of expression means it is a register. Registers for PTX need to
- // be marked with DW_AT_address_class = 2. See
- // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf
- addUInt(VariableDie, dwarf::DW_AT_address_class, dwarf::DW_FORM_data1, 2);
+ // cuda-gdb special requirement. See NVPTXAS::DWARF_AddressSpace
+ // Lack of expression means it is a register.
+ addUInt(VariableDie, dwarf::DW_AT_address_class, dwarf::DW_FORM_data1,
+ NVPTXAS::DWARF_ADDR_reg_space);
}
if (!DVal->isVariadic()) {
const DbgValueLocEntry *Entry = DVal->getLocEntries().begin();
@@ -922,14 +940,11 @@ void DwarfCompileUnit::applyConcreteDbgVariableAttributes(const Loc::MMI &MMI,
SmallVector<uint64_t, 8> Ops;
TRI->getOffsetOpcodes(Offset, Ops);
- // According to
- // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf
- // cuda-gdb requires DW_AT_address_class for all variables to be
- // able to correctly interpret address space of the variable
- // address. Decode DW_OP_constu <DWARF Address Space> DW_OP_swap
- // DW_OP_xderef sequence for the NVPTX + gdb target.
- unsigned LocalNVPTXAddressSpace;
+ // cuda-gdb special requirement. See NVPTXAS::DWARF_AddressSpace.
+ // Decode DW_OP_constu <DWARF Address Space> DW_OP_swap
+ // DW_OP_xderef sequence to specify the address space.
if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) {
+ unsigned LocalNVPTXAddressSpace;
const DIExpression *NewExpr =
DIExpression::extractAddressClass(Expr, LocalNVPTXAddressSpace);
if (NewExpr != Expr) {
@@ -949,14 +964,9 @@ void DwarfCompileUnit::applyConcreteDbgVariableAttributes(const Loc::MMI &MMI,
DwarfExpr.addExpression(std::move(Cursor));
}
if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) {
- // According to
- // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf
- // cuda-gdb requires DW_AT_address_class for all variables to be
- // able to correctly interpret address space of the variable
- // address.
- const unsigned NVPTX_ADDR_local_space = 6;
+ // cuda-gdb special requirement. See NVPTXAS::DWARF_AddressSpace.
addUInt(VariableDie, dwarf::DW_AT_address_class, dwarf::DW_FORM_data1,
- NVPTXAddressSpace.value_or(NVPTX_ADDR_local_space));
+ NVPTXAddressSpace.value_or(NVPTXAS::DWARF_ADDR_local_space));
}
addBlock(VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());
if (DwarfExpr.TagOffset)
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 5f0c7ec9c..0a7937e 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -77,7 +77,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeMachineCFGPrinterPass(Registry);
initializeMachineCSELegacyPass(Registry);
initializeMachineCombinerPass(Registry);
- initializeMachineCopyPropagationPass(Registry);
+ initializeMachineCopyPropagationLegacyPass(Registry);
initializeMachineCycleInfoPrinterPassPass(Registry);
initializeMachineCycleInfoWrapperPassPass(Registry);
initializeMachineDominatorTreeWrapperPassPass(Registry);
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 3e43299..362d856 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2441,9 +2441,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
return true;
}
case Intrinsic::invariant_start: {
- LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
- Register Undef = MRI->createGenericVirtualRegister(PtrTy);
- MIRBuilder.buildUndef(Undef);
+ MIRBuilder.buildUndef(getOrCreateVReg(CI));
return true;
}
case Intrinsic::invariant_end:
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index d44b064..460749a 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -48,6 +48,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/MachineCopyPropagation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
@@ -449,7 +450,7 @@ public:
}
};
-class MachineCopyPropagation : public MachineFunctionPass {
+class MachineCopyPropagation {
const TargetRegisterInfo *TRI = nullptr;
const TargetInstrInfo *TII = nullptr;
const MachineRegisterInfo *MRI = nullptr;
@@ -458,24 +459,10 @@ class MachineCopyPropagation : public MachineFunctionPass {
bool UseCopyInstr;
public:
- static char ID; // Pass identification, replacement for typeid
-
MachineCopyPropagation(bool CopyInstr = false)
- : MachineFunctionPass(ID), UseCopyInstr(CopyInstr || MCPUseCopyInstr) {
- initializeMachineCopyPropagationPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
+ : UseCopyInstr(CopyInstr || MCPUseCopyInstr) {}
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
- }
+ bool run(MachineFunction &MF);
private:
typedef enum { DebugUse = false, RegularUse = true } DebugType;
@@ -510,13 +497,35 @@ private:
bool Changed = false;
};
+class MachineCopyPropagationLegacy : public MachineFunctionPass {
+ bool UseCopyInstr;
+
+public:
+ static char ID; // pass identification
+
+ MachineCopyPropagationLegacy(bool UseCopyInstr = false)
+ : MachineFunctionPass(ID), UseCopyInstr(UseCopyInstr) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+};
+
} // end anonymous namespace
-char MachineCopyPropagation::ID = 0;
+char MachineCopyPropagationLegacy::ID = 0;
-char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID;
+char &llvm::MachineCopyPropagationID = MachineCopyPropagationLegacy::ID;
-INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE,
+INITIALIZE_PASS(MachineCopyPropagationLegacy, DEBUG_TYPE,
"Machine Copy Propagation Pass", false, false)
void MachineCopyPropagation::ReadRegister(MCRegister Reg, MachineInstr &Reader,
@@ -1563,10 +1572,25 @@ void MachineCopyPropagation::EliminateSpillageCopies(MachineBasicBlock &MBB) {
Tracker.clear();
}
-bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
+bool MachineCopyPropagationLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
+ return MachineCopyPropagation(UseCopyInstr).run(MF);
+}
+
+PreservedAnalyses
+MachineCopyPropagationPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &) {
+ MFPropsModifier _(*this, MF);
+ if (!MachineCopyPropagation(UseCopyInstr).run(MF))
+ return PreservedAnalyses::all();
+ auto PA = getMachineFunctionPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+bool MachineCopyPropagation::run(MachineFunction &MF) {
bool isSpillageCopyElimEnabled = false;
switch (EnableSpillageCopyElimination) {
case cl::BOU_UNSET:
@@ -1599,5 +1623,5 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
MachineFunctionPass *
llvm::createMachineCopyPropagationPass(bool UseCopyInstr = false) {
- return new MachineCopyPropagation(UseCopyInstr);
+ return new MachineCopyPropagationLegacy(UseCopyInstr);
}
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index a4b78c1..b5dc487 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -185,11 +185,11 @@ MachineUniformityAnalysisPass::MachineUniformityAnalysisPass()
}
INITIALIZE_PASS_BEGIN(MachineUniformityAnalysisPass, "machine-uniformity",
- "Machine Uniformity Info Analysis", true, true)
+ "Machine Uniformity Info Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(MachineUniformityAnalysisPass, "machine-uniformity",
- "Machine Uniformity Info Analysis", true, true)
+ "Machine Uniformity Info Analysis", false, true)
void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
index fa60881..59ad9ff 100644
--- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -147,16 +147,7 @@ void ReachingDefAnalysis::processDefs(MachineInstr *MI) {
assert(FrameIndex >= 0 && "Can't handle negative frame indicies yet!");
if (!isFIDef(*MI, FrameIndex, TII))
continue;
- if (MBBFrameObjsReachingDefs.contains(MBBNumber)) {
- auto Frame2InstrIdx = MBBFrameObjsReachingDefs[MBBNumber];
- if (Frame2InstrIdx.count(FrameIndex - ObjectIndexBegin) > 0)
- Frame2InstrIdx[FrameIndex - ObjectIndexBegin].push_back(CurInstr);
- else
- Frame2InstrIdx[FrameIndex - ObjectIndexBegin] = {CurInstr};
- } else {
- MBBFrameObjsReachingDefs[MBBNumber] = {
- {FrameIndex - ObjectIndexBegin, {CurInstr}}};
- }
+ MBBFrameObjsReachingDefs[{MBBNumber, FrameIndex}].push_back(CurInstr);
}
if (!isValidRegDef(MO))
continue;
@@ -351,9 +342,13 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, Register Reg) const {
int LatestDef = ReachingDefDefaultVal;
if (Reg.isStack()) {
+ // Check that there was a reaching def.
int FrameIndex = Reg.stackSlotIndex();
- for (int Def : MBBFrameObjsReachingDefs.lookup(MBBNumber).lookup(
- FrameIndex - ObjectIndexBegin)) {
+ auto Lookup = MBBFrameObjsReachingDefs.find({MBBNumber, FrameIndex});
+ if (Lookup == MBBFrameObjsReachingDefs.end())
+ return LatestDef;
+ auto &Defs = Lookup->second;
+ for (int Def : Defs) {
if (Def >= InstId)
break;
DefRes = Def;
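
The nested map-of-maps is replaced above by a single map keyed on a (basic block number, frame index) pair. A minimal standalone sketch of that shape, with hypothetical names (the actual declaration lives in ReachingDefAnalysis.h); it assumes llvm/ADT/DenseMap.h and llvm/ADT/SmallVector.h.

    DenseMap<std::pair<int, int>, SmallVector<int, 2>> FrameDefs;
    FrameDefs[{MBBNumber, FrameIndex}].push_back(CurInstr); // record a def
    auto It = FrameDefs.find({MBBNumber, FrameIndex});      // query
    if (It == FrameDefs.end())
      return ReachingDefDefaultVal;                         // no reaching def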
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 882d6015..8858c20 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -385,17 +385,6 @@ namespace {
bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val);
bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val);
- /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
- /// load.
- ///
- /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
- /// \param InVecVT type of the input vector to EVE with bitcasts resolved.
- /// \param EltNo index of the vector element to load.
- /// \param OriginalLoad load that EVE came from to be replaced.
- /// \returns EVE on success SDValue() on failure.
- SDValue scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
- SDValue EltNo,
- LoadSDNode *OriginalLoad);
void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
@@ -22719,81 +22708,6 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
return SDValue();
}
-SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
- SDValue EltNo,
- LoadSDNode *OriginalLoad) {
- assert(OriginalLoad->isSimple());
-
- EVT ResultVT = EVE->getValueType(0);
- EVT VecEltVT = InVecVT.getVectorElementType();
-
- // If the vector element type is not a multiple of a byte then we are unable
- // to correctly compute an address to load only the extracted element as a
- // scalar.
- if (!VecEltVT.isByteSized())
- return SDValue();
-
- ISD::LoadExtType ExtTy =
- ResultVT.bitsGT(VecEltVT) ? ISD::EXTLOAD : ISD::NON_EXTLOAD;
- if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
- !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
- return SDValue();
-
- Align Alignment = OriginalLoad->getAlign();
- MachinePointerInfo MPI;
- SDLoc DL(EVE);
- if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
- int Elt = ConstEltNo->getZExtValue();
- unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
- MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
- Alignment = commonAlignment(Alignment, PtrOff);
- } else {
- // Discard the pointer info except the address space because the memory
- // operand can't represent this new access since the offset is variable.
- MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
- Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8);
- }
-
- unsigned IsFast = 0;
- if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
- OriginalLoad->getAddressSpace(), Alignment,
- OriginalLoad->getMemOperand()->getFlags(),
- &IsFast) ||
- !IsFast)
- return SDValue();
-
- SDValue NewPtr = TLI.getVectorElementPointer(DAG, OriginalLoad->getBasePtr(),
- InVecVT, EltNo);
-
- // We are replacing a vector load with a scalar load. The new load must have
- // identical memory op ordering to the original.
- SDValue Load;
- if (ResultVT.bitsGT(VecEltVT)) {
- // If the result type of vextract is wider than the load, then issue an
- // extending load instead.
- ISD::LoadExtType ExtType =
- TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT) ? ISD::ZEXTLOAD
- : ISD::EXTLOAD;
- Load = DAG.getExtLoad(ExtType, DL, ResultVT, OriginalLoad->getChain(),
- NewPtr, MPI, VecEltVT, Alignment,
- OriginalLoad->getMemOperand()->getFlags(),
- OriginalLoad->getAAInfo());
- DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load);
- } else {
- // The result type is narrower or the same width as the vector element
- Load = DAG.getLoad(VecEltVT, DL, OriginalLoad->getChain(), NewPtr, MPI,
- Alignment, OriginalLoad->getMemOperand()->getFlags(),
- OriginalLoad->getAAInfo());
- DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load);
- if (ResultVT.bitsLT(VecEltVT))
- Load = DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Load);
- else
- Load = DAG.getBitcast(ResultVT, Load);
- }
- ++OpsNarrowed;
- return Load;
-}
-
/// Transform a vector binary operation into a scalar binary operation by moving
/// the math/logic after an extract element of a vector.
static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG,
@@ -23272,8 +23186,13 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
ISD::isNormalLoad(VecOp.getNode()) &&
!Index->hasPredecessor(VecOp.getNode())) {
auto *VecLoad = dyn_cast<LoadSDNode>(VecOp);
- if (VecLoad && VecLoad->isSimple())
- return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad);
+ if (VecLoad && VecLoad->isSimple()) {
+ if (SDValue Scalarized = TLI.scalarizeExtractedVectorLoad(
+ ExtVT, SDLoc(N), VecVT, Index, VecLoad, DAG)) {
+ ++OpsNarrowed;
+ return Scalarized;
+ }
+ }
}
// Perform only after legalization to ensure build_vector / vector_shuffle
@@ -23361,7 +23280,13 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
if (Elt == -1)
return DAG.getUNDEF(LVT);
- return scalarizeExtractedVectorLoad(N, VecVT, Index, LN0);
+ if (SDValue Scalarized =
+ TLI.scalarizeExtractedVectorLoad(LVT, DL, VecVT, Index, LN0, DAG)) {
+ ++OpsNarrowed;
+ return Scalarized;
+ }
+
+ return SDValue();
}
// Simplify (build_vec (ext )) to (bitcast (build_vec ))
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 625052b..f1a91a7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -566,6 +566,29 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
}
}
+ // TODO: Handle big endian
+ if (!NOutVT.isVector() && InOp.getValueType().isVector() &&
+ DAG.getDataLayout().isLittleEndian()) {
+ // Pad the vector operand with undef and cast to a wider integer.
+ EVT EltVT = InOp.getValueType().getVectorElementType();
+ TypeSize EltSize = EltVT.getSizeInBits();
+ TypeSize OutSize = NOutVT.getSizeInBits();
+
+ if (OutSize.hasKnownScalarFactor(EltSize)) {
+ unsigned NumEltsWithPadding = OutSize.getKnownScalarFactor(EltSize);
+ EVT WideVecVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT, NumEltsWithPadding);
+
+ if (isTypeLegal(WideVecVT)) {
+ SDValue Inserted = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
+ DAG.getUNDEF(WideVecVT), InOp,
+ DAG.getVectorIdxConstant(0, dl));
+
+ return DAG.getNode(ISD::BITCAST, dl, NOutVT, Inserted);
+ }
+ }
+ }
+
return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
CreateStackStoreLoad(InOp, OutVT));
}
@@ -2181,9 +2204,43 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
+ EVT OutVT = N->getValueType(0);
+ SDValue InOp = N->getOperand(0);
+ EVT InVT = InOp.getValueType();
+ EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+ SDLoc dl(N);
+
+ switch (getTypeAction(InVT)) {
+ case TargetLowering::TypePromoteInteger: {
+ // TODO: Handle big endian
+ if (OutVT.isVector() && DAG.getDataLayout().isLittleEndian()) {
+ EVT EltVT = OutVT.getVectorElementType();
+ TypeSize EltSize = EltVT.getSizeInBits();
+ TypeSize NInSize = NInVT.getSizeInBits();
+
+ if (NInSize.hasKnownScalarFactor(EltSize)) {
+ unsigned NumEltsWithPadding = NInSize.getKnownScalarFactor(EltSize);
+ EVT WideVecVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT, NumEltsWithPadding);
+
+ if (isTypeLegal(WideVecVT)) {
+ SDValue Promoted = GetPromotedInteger(InOp);
+ SDValue Cast = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Promoted);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, Cast,
+ DAG.getVectorIdxConstant(0, dl));
+ }
+ }
+ }
+
+ break;
+ }
+ default:
+ break;
+ }
+
// This should only occur in unusual situations like bitcasting to an
// x86_fp80, so just turn it into a store+load
- return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+ return CreateStackStoreLoad(InOp, OutVT);
}
SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) {
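
As a worked example of the PromoteIntRes_BITCAST path above (the target assumptions are hypothetical): on a little-endian target where i32 is promoted to i64 and v8i8 is legal, (i32 (bitcast v4i8:$v)) becomes (i64 (bitcast (insert_subvector (v8i8 undef), $v, 0))), i.e. the vector operand is padded with undef elements until its size matches the promoted integer type and the original value is carried in the low bits, avoiding the stack store/load fallback.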
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 98206b7..adfb960 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -12114,3 +12114,77 @@ SDValue TargetLowering::expandVectorNaryOpBySplitting(SDNode *Node,
SDValue SplitOpHi = DAG.getNode(Opcode, DL, HiVT, HiOps);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SplitOpLo, SplitOpHi);
}
+
+SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT,
+ const SDLoc &DL,
+ EVT InVecVT, SDValue EltNo,
+ LoadSDNode *OriginalLoad,
+ SelectionDAG &DAG) const {
+ assert(OriginalLoad->isSimple());
+
+ EVT VecEltVT = InVecVT.getVectorElementType();
+
+ // If the vector element type is not a multiple of a byte then we are unable
+ // to correctly compute an address to load only the extracted element as a
+ // scalar.
+ if (!VecEltVT.isByteSized())
+ return SDValue();
+
+ ISD::LoadExtType ExtTy =
+ ResultVT.bitsGT(VecEltVT) ? ISD::EXTLOAD : ISD::NON_EXTLOAD;
+ if (!isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
+ !shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
+ return SDValue();
+
+ Align Alignment = OriginalLoad->getAlign();
+ MachinePointerInfo MPI;
+ if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
+ int Elt = ConstEltNo->getZExtValue();
+ unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
+ MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
+ Alignment = commonAlignment(Alignment, PtrOff);
+ } else {
+ // Discard the pointer info except the address space because the memory
+ // operand can't represent this new access since the offset is variable.
+ MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
+ Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8);
+ }
+
+ unsigned IsFast = 0;
+ if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
+ OriginalLoad->getAddressSpace(), Alignment,
+ OriginalLoad->getMemOperand()->getFlags(), &IsFast) ||
+ !IsFast)
+ return SDValue();
+
+ SDValue NewPtr =
+ getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo);
+
+ // We are replacing a vector load with a scalar load. The new load must have
+ // identical memory op ordering to the original.
+ SDValue Load;
+ if (ResultVT.bitsGT(VecEltVT)) {
+ // If the result type of vextract is wider than the load, then issue an
+ // extending load instead.
+ ISD::LoadExtType ExtType = isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT)
+ ? ISD::ZEXTLOAD
+ : ISD::EXTLOAD;
+ Load = DAG.getExtLoad(ExtType, DL, ResultVT, OriginalLoad->getChain(),
+ NewPtr, MPI, VecEltVT, Alignment,
+ OriginalLoad->getMemOperand()->getFlags(),
+ OriginalLoad->getAAInfo());
+ DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load);
+ } else {
+ // The result type is narrower or the same width as the vector element
+ Load = DAG.getLoad(VecEltVT, DL, OriginalLoad->getChain(), NewPtr, MPI,
+ Alignment, OriginalLoad->getMemOperand()->getFlags(),
+ OriginalLoad->getAAInfo());
+ DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load);
+ if (ResultVT.bitsLT(VecEltVT))
+ Load = DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Load);
+ else
+ Load = DAG.getBitcast(ResultVT, Load);
+ }
+
+ return Load;
+}
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 8bf5135..107e79c 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -967,21 +967,20 @@ void DWARFVerifier::verifyDebugLineStmtOffsets() {
// here because we validate this in the .debug_info verifier.
continue;
}
- auto Iter = StmtListToDie.find(LineTableOffset);
- if (Iter != StmtListToDie.end()) {
+ auto [Iter, Inserted] = StmtListToDie.try_emplace(LineTableOffset, Die);
+ if (!Inserted) {
++NumDebugLineErrors;
+ const auto &OldDie = Iter->second;
ErrorCategory.Report("Identical DW_AT_stmt_list section offset", [&]() {
error() << "two compile unit DIEs, "
- << format("0x%08" PRIx64, Iter->second.getOffset()) << " and "
+ << format("0x%08" PRIx64, OldDie.getOffset()) << " and "
<< format("0x%08" PRIx64, Die.getOffset())
<< ", have the same DW_AT_stmt_list section offset:\n";
- dump(Iter->second);
+ dump(OldDie);
dump(Die) << '\n';
});
// Already verified this line table before, no need to do it again.
- continue;
}
- StmtListToDie[LineTableOffset] = Die;
}
}
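
The duplicate check above now does a single try_emplace instead of a find followed by a separate insert. In isolation the pattern looks like the sketch below; the key and value types are stand-ins, not the verifier's actual declarations.

    // Single-lookup insert-or-report pattern (assumes llvm/ADT/DenseMap.h).
    llvm::DenseMap<uint64_t, int> Seen;
    auto [It, Inserted] = Seen.try_emplace(Offset, Value);
    if (!Inserted) {
      // Offset was already present; It->second holds the earlier value and
      // the map is not modified.
    }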
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index d9096ed..176caa2 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -110,6 +110,7 @@
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineCSE.h"
+#include "llvm/CodeGen/MachineCopyPropagation.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineLICM.h"
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 84f6d42..8617377 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8787,51 +8787,6 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
return ZExtBool;
}
-// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
-// input operands are copy nodes where the source register is in a
-// StridedOrContiguous class. For example:
-//
-// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
-// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
-// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
-// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
-// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
-// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
-// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
-//
-bool shouldUseFormStridedPseudo(MachineInstr &MI) {
- MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
-
- assert((MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
- MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) &&
- "Unexpected opcode.");
-
- MCRegister SubReg = MCRegister::NoRegister;
- for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
- MachineOperand &MO = MI.getOperand(I);
- assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");
-
- MachineOperand *Def = MRI.getOneDef(MO.getReg());
- if (!Def || !Def->getParent()->isCopy())
- return false;
-
- const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
- unsigned OpSubReg = CopySrc.getSubReg();
- if (SubReg == MCRegister::NoRegister)
- SubReg = OpSubReg;
-
- MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
- const TargetRegisterClass *CopySrcClass =
- MRI.getRegClass(CopySrcOp->getReg());
- if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
- (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass &&
- CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass))
- return false;
- }
-
- return true;
-}
-
void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
// Live-in physreg copies that are glued to SMSTART are applied as
@@ -8857,27 +8812,6 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
}
}
- if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
- MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
- // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
- // from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
- if (shouldUseFormStridedPseudo(MI))
- return;
-
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
- TII->get(TargetOpcode::REG_SEQUENCE),
- MI.getOperand(0).getReg());
-
- for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
- MIB.add(MI.getOperand(I));
- MIB.addImm(AArch64::zsub0 + (I - 1));
- }
-
- MI.eraseFromParent();
- return;
- }
-
// Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
// have nothing to do with VG, were it not that they are used to materialise a
// frame-address. If they contain a frame-index to a scalable vector, this
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 3c57ba4..a0928b9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -428,7 +428,6 @@ def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisVT<2, OtherVT>]>;
-
def SDT_AArch64CSel : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
@@ -451,6 +450,7 @@ def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisFP<1>,
SDTCisSameAs<2, 1>]>;
+def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
def SDT_AArch64Insr : SDTypeProfile<1, 2, [SDTCisVec<0>]>;
@@ -817,11 +817,9 @@ def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;
-def AArch64rev16_scalar : SDNode<"AArch64ISD::REV16", SDTIntUnaryOp>;
-
-def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
-def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
-def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
+def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64Rev>;
+def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64Rev>;
+def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64Rev>;
def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>;
def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>;
@@ -3000,8 +2998,8 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>;
def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>;
-def : Pat<(AArch64rev16_scalar GPR32:$Rn), (REV16Wr GPR32:$Rn)>;
-def : Pat<(AArch64rev16_scalar GPR64:$Rn), (REV16Xr GPR64:$Rn)>;
+def : Pat<(AArch64rev16 GPR32:$Rn), (REV16Wr GPR32:$Rn)>;
+def : Pat<(AArch64rev16 GPR64:$Rn), (REV16Xr GPR64:$Rn)>;
def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)),
(and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))),
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index aae2fda..a6edcf1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -940,6 +940,16 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::experimental_cttz_elts: {
+ EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
+ // This will consist of a SVE brkb and a cntp instruction. These
+ // typically have the same latency and half the throughput as a vector
+ // add instruction.
+ return 4;
+ }
+ break;
+ }
default:
break;
}
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 0ac131e..4f6a413 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -36,27 +36,26 @@ let WantsRoot = true in
def am_sme_indexed_b4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0, 15>">;
// The FORM_TRANSPOSED_REG_TUPLE pseudos defined below are intended to
-// improve register allocation for intrinsics which use strided and contiguous
-// multi-vector registers, avoiding unnecessary copies.
-// If the operands of the pseudo are copies where the source register is in
-// the StridedOrContiguous class, the pseudo is used to provide a hint to the
-// register allocator suggesting a contigious multi-vector register which
-// matches the subregister sequence used by the operands.
-// If the operands do not match this pattern, the pseudos are expanded
-// to a REG_SEQUENCE using the post-isel hook.
+// improve register allocation for intrinsics which use strided and
+// contiguous multi-vector registers, avoiding unnecessary copies.
+// The SMEPeepholeOpt pass will replace a REG_SEQUENCE instruction with the
+// FORM_TRANSPOSED_REG_TUPLE pseudo if the operands are copies where the
+// source register is in the StridedOrContiguous class. The operands in the
+// sequence must all have the same subreg index.
+// The pseudo is then used to provide a hint to the register allocator
+// suggesting a contiguous multi-vector register which matches the
+// subregister sequence used by the operands.
def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO :
Pseudo<(outs ZPR2:$tup),
(ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{
let hasSideEffects = 0;
- let hasPostISelHook = 1;
}
def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO :
Pseudo<(outs ZPR4:$tup),
(ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]>{
let hasSideEffects = 0;
- let hasPostISelHook = 1;
}
def SPILL_PPR_TO_ZPR_SLOT_PSEUDO :
@@ -178,14 +177,14 @@ class SME2_ZA_TwoOp_Multi_Single_Pat<string name, SDPatternOperator intrinsic, O
class SME2_ZA_TwoOp_VG2_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
ValueType vt, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm),
- (!cast<Instruction>(name # _PSEUDO) $base, $offset, (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1, vt:$Zn2),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
zpr_ty:$Zm)>;
class SME2_ZA_TwoOp_VG4_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
ValueType vt, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
+ (REG_SEQUENCE ZPR4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
zpr_ty:$Zm)>;
class SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ValueType vt, ComplexPattern tileslice>
@@ -211,14 +210,14 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1,vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>;
+ (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>;
class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
+ (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
zpr_ty:$Zm, imm_ty:$i)>;
class SME2_Sat_Shift_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty>
diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
index 4a0312d..2ffd4d7 100644
--- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -45,6 +45,7 @@ struct SMEPeepholeOpt : public MachineFunctionPass {
bool optimizeStartStopPairs(MachineBasicBlock &MBB,
bool &HasRemovedAllSMChanges) const;
+ bool visitRegSequence(MachineInstr &MI);
};
char SMEPeepholeOpt::ID = 0;
@@ -225,6 +226,81 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
return Changed;
}
+// Using the FORM_TRANSPOSED_REG_TUPLE pseudo can improve register allocation
+// of multi-vector intrinsics. However, the pseudo should only be emitted if
+// the input registers of the REG_SEQUENCE are copy nodes where the source
+// register is in a StridedOrContiguous class. For example:
+//
+// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
+// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
+// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
+// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
+// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
+// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
+// %9:zpr2mul2 = REG_SEQUENCE %5:zpr, %subreg.zsub0, %8:zpr, %subreg.zsub1
+//
+// -> %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
+//
+bool SMEPeepholeOpt::visitRegSequence(MachineInstr &MI) {
+ assert(MI.getMF()->getRegInfo().isSSA() && "Expected to be run on SSA form!");
+
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ switch (MRI.getRegClass(MI.getOperand(0).getReg())->getID()) {
+ case AArch64::ZPR2RegClassID:
+ case AArch64::ZPR4RegClassID:
+ case AArch64::ZPR2Mul2RegClassID:
+ case AArch64::ZPR4Mul4RegClassID:
+ break;
+ default:
+ return false;
+ }
+
+ // The first operand is the register class created by the REG_SEQUENCE.
+ // Each operand pair after this consists of a vreg + subreg index, so
+ // for example a sequence of 2 registers will have a total of 5 operands.
+ if (MI.getNumOperands() != 5 && MI.getNumOperands() != 9)
+ return false;
+
+ MCRegister SubReg = MCRegister::NoRegister;
+ for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
+ MachineOperand &MO = MI.getOperand(I);
+
+ MachineOperand *Def = MRI.getOneDef(MO.getReg());
+ if (!Def || !Def->getParent()->isCopy())
+ return false;
+
+ const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
+ unsigned OpSubReg = CopySrc.getSubReg();
+ if (SubReg == MCRegister::NoRegister)
+ SubReg = OpSubReg;
+
+ MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
+ if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
+ CopySrcOp->getReg().isPhysical())
+ return false;
+
+ const TargetRegisterClass *CopySrcClass =
+ MRI.getRegClass(CopySrcOp->getReg());
+ if (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass &&
+ CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass)
+ return false;
+ }
+
+ unsigned Opc = MI.getNumOperands() == 5
+ ? AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO
+ : AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
+
+ const TargetInstrInfo *TII =
+ MI.getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(Opc), MI.getOperand(0).getReg());
+ for (unsigned I = 1; I < MI.getNumOperands(); I += 2)
+ MIB.addReg(MI.getOperand(I).getReg());
+
+ MI.eraseFromParent();
+ return true;
+}
+
INITIALIZE_PASS(SMEPeepholeOpt, "aarch64-sme-peephole-opt",
"SME Peephole Optimization", false, false)
@@ -247,6 +323,12 @@ bool SMEPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
bool BlockHasAllSMChangesRemoved;
Changed |= optimizeStartStopPairs(MBB, BlockHasAllSMChangesRemoved);
FunctionHasAllSMChangesRemoved |= BlockHasAllSMChangesRemoved;
+
+ if (MF.getSubtarget<AArch64Subtarget>().isStreaming()) {
+ for (MachineInstr &MI : make_early_inc_range(MBB))
+ if (MI.getOpcode() == AArch64::REG_SEQUENCE)
+ Changed |= visitRegSequence(MI);
+ }
}
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index cca9fa7..792e17e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4217,18 +4217,21 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
// trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
- if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
- SDValue BV = stripBitcast(Src.getOperand(0));
- if (BV.getOpcode() == ISD::BUILD_VECTOR &&
- BV.getValueType().getVectorNumElements() == 2) {
- SDValue SrcElt = BV.getOperand(1);
- EVT SrcEltVT = SrcElt.getValueType();
- if (SrcEltVT.isFloatingPoint()) {
- SrcElt = DAG.getNode(ISD::BITCAST, SL,
- SrcEltVT.changeTypeToInteger(), SrcElt);
+ SDValue BV = stripBitcast(Src.getOperand(0));
+ if (BV.getOpcode() == ISD::BUILD_VECTOR) {
+ EVT SrcEltVT = BV.getOperand(0).getValueType();
+ unsigned SrcEltSize = SrcEltVT.getSizeInBits();
+ unsigned BitIndex = K->getZExtValue();
+ unsigned PartIndex = BitIndex / SrcEltSize;
+
+ if (PartIndex * SrcEltSize == BitIndex &&
+ PartIndex < BV.getNumOperands()) {
+ if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
+ SDValue SrcElt =
+ DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
+ BV.getOperand(PartIndex));
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
}
-
- return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
}
}
}
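
For example (values chosen for illustration), with a source built as (v4i16 build_vector x, y, z, w), the combine now handles (i16 (trunc (srl (bitcast ...), 32))): BitIndex = 32 and SrcEltSize = 16 give PartIndex = 2, so the expression folds to a truncate/bitcast of element z, rather than being limited to the two-element, half-width shift case handled before.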
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5bfd891..09f7877 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
return 1024;
}
-// FIXME: Should we use narrower types for local/region, or account for when
-// unaligned access is legal?
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
if (AtomicElementSize)
return Type::getIntNTy(Context, *AtomicElementSize * 8);
- Align MinAlign = std::min(SrcAlign, DestAlign);
-
- // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
- // hardware into byte accesses. If you assume all alignments are equally
- // probable, it's more efficient on average to use short accesses for this
- // case.
- if (MinAlign == Align(2))
- return Type::getInt16Ty(Context);
-
- // Not all subtargets have 128-bit DS instructions, and we currently don't
- // form them by default.
- if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
- DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
- return FixedVectorType::get(Type::getInt32Ty(Context), 2);
- }
-
- // Global memory works best with 16-byte accesses.
+ // 16-byte accesses achieve the highest copy throughput.
// If the operation has a fixed known length that is large enough, it is
// worthwhile to return an even wider type and let legalization lower it into
- // multiple accesses, effectively unrolling the memcpy loop. Private memory
- // also hits this, although accesses may be decomposed.
+ // multiple accesses, effectively unrolling the memcpy loop.
+ // We also rely on legalization to decompose into smaller accesses for
+ // subtargets and address spaces where it is necessary.
//
// Don't unroll if Length is not a constant, since unrolling leads to worse
// performance for length values that are smaller or slightly larger than the
@@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
DestAlign, AtomicCpySize);
- Align MinAlign = std::min(SrcAlign, DestAlign);
-
- if (MinAlign != Align(2)) {
- Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
- while (RemainingBytes >= 16) {
- OpsOut.push_back(I32x4Ty);
- RemainingBytes -= 16;
- }
+ Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+ while (RemainingBytes >= 16) {
+ OpsOut.push_back(I32x4Ty);
+ RemainingBytes -= 16;
+ }
- Type *I64Ty = Type::getInt64Ty(Context);
- while (RemainingBytes >= 8) {
- OpsOut.push_back(I64Ty);
- RemainingBytes -= 8;
- }
+ Type *I64Ty = Type::getInt64Ty(Context);
+ while (RemainingBytes >= 8) {
+ OpsOut.push_back(I64Ty);
+ RemainingBytes -= 8;
+ }
- Type *I32Ty = Type::getInt32Ty(Context);
- while (RemainingBytes >= 4) {
- OpsOut.push_back(I32Ty);
- RemainingBytes -= 4;
- }
+ Type *I32Ty = Type::getInt32Ty(Context);
+ while (RemainingBytes >= 4) {
+ OpsOut.push_back(I32Ty);
+ RemainingBytes -= 4;
}
Type *I16Ty = Type::getInt16Ty(Context);
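
The residual lowering above now unconditionally decomposes the remaining bytes greedily from 16-byte vectors down to single bytes (the 2- and 1-byte tail follows the same pattern in the surrounding code). A self-contained sketch of that decomposition, independent of LLVM types:

    #include <cstdint>
    #include <vector>
    // Returns the access sizes (in bytes) used for a residual copy:
    // 16 -> <4 x i32>, 8 -> i64, 4 -> i32, 2 -> i16, 1 -> i8.
    static std::vector<unsigned> splitResidual(uint64_t RemainingBytes) {
      std::vector<unsigned> Parts;
      for (unsigned Size : {16u, 8u, 4u, 2u, 1u})
        while (RemainingBytes >= Size) {
          Parts.push_back(Size);
          RemainingBytes -= Size;
        }
      return Parts;
    }
    // e.g. splitResidual(23) == {16, 4, 2, 1}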
diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index a20319e..ac11526 100644
--- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -287,10 +287,10 @@ bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
RegSeqInfo &CompatibleRSI,
std::vector<std::pair<unsigned, unsigned>> &RemapChan) {
unsigned NeededUndefs = 4 - RSI.UndefReg.size();
- if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
- return false;
std::vector<MachineInstr *> &MIs =
PreviousRegSeqByUndefCount[NeededUndefs];
+ if (MIs.empty())
+ return false;
CompatibleRSI = PreviousRegSeq[MIs.back()];
tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
return true;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bee4c47..6e08aff 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2703,15 +2703,20 @@ class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, S
(i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;
-let OtherPredicates = [NotHasTrue16BitInsts] in {
+let True16Predicate = NotHasTrue16BitInsts in {
def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
-} // end OtherPredicates = [NotHasTrue16BitInsts]
+} // end True16Predicate = NotHasTrue16BitInsts
+
+let True16Predicate = UseRealTrue16Insts in {
+ def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
+ def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
+} // end True16Predicate = UseRealTrue16Insts
-let OtherPredicates = [HasTrue16BitInsts] in {
+let True16Predicate = UseFakeTrue16Insts in {
def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
-} // end OtherPredicates = [HasTrue16BitInsts]
+} // end True16Predicate = UseFakeTrue16Insts
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
@@ -3790,6 +3795,13 @@ def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>
def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
}
+let True16Predicate = UseRealTrue16Insts in {
+def : FPMinMaxPat<V_MINMAX_F16_t16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F16_t16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F16_t16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F16_t16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+}
+
let True16Predicate = UseFakeTrue16Insts in {
def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
@@ -3819,6 +3831,13 @@ def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
}
+let True16Predicate = UseRealTrue16Insts, SubtargetPredicate = isGFX12Plus in {
+def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_t16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_t16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_t16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_t16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+}
+
let True16Predicate = UseFakeTrue16Insts, SubtargetPredicate = isGFX12Plus in {
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_fake16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_fake16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index fad7e67..67bebfb3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -976,8 +976,7 @@ struct Waitcnt {
Waitcnt() = default;
// Pre-gfx12 constructor.
Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt)
- : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt),
- SampleCnt(~0u), BvhCnt(~0u), KmCnt(~0u) {}
+ : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt) {}
// gfx12+ constructor.
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 1e76bf7..296031e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -27,6 +27,28 @@
using namespace llvm;
using namespace llvm::AMDGPU;
+// Return the PAL metadata hardware shader stage name.
+static const char *getStageName(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_PS:
+ return ".ps";
+ case CallingConv::AMDGPU_VS:
+ return ".vs";
+ case CallingConv::AMDGPU_GS:
+ return ".gs";
+ case CallingConv::AMDGPU_ES:
+ return ".es";
+ case CallingConv::AMDGPU_HS:
+ return ".hs";
+ case CallingConv::AMDGPU_LS:
+ return ".ls";
+ case CallingConv::AMDGPU_Gfx:
+ llvm_unreachable("Callable shader has no hardware stage");
+ default:
+ return ".cs";
+ }
+}
+
// Read the PAL metadata from IR metadata, where it was put by the frontend.
void AMDGPUPALMetadata::readFromIR(Module &M) {
auto *NamedMD = M.getNamedMetadata("amdgpu.pal.metadata.msgpack");
@@ -232,8 +254,18 @@ void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) {
if (isLegacy())
return;
// Msgpack format.
+  // The entry point is now recorded under the .entry_point_symbol key and is
+  // set to the function name.
getHwStage(CC)[".entry_point_symbol"] =
MsgPackDoc.getNode(Name, /*Copy=*/true);
+
+  // Also set .entry_point, which is defined to be _amdgpu_<stage> for
+  // shaders and _amdgpu_cs for non-shader functions.
+ SmallString<16> EPName("_amdgpu_");
+ raw_svector_ostream EPNameOS(EPName);
+ EPNameOS << getStageName(CC) + 1;
+ getHwStage(CC)[".entry_point"] =
+ MsgPackDoc.getNode(EPNameOS.str(), /*Copy=*/true);
}
// Set the number of used vgprs in the metadata. This is an optional
@@ -943,28 +975,6 @@ msgpack::MapDocNode AMDGPUPALMetadata::getGraphicsRegisters() {
return GraphicsRegisters.getMap();
}
-// Return the PAL metadata hardware shader stage name.
-static const char *getStageName(CallingConv::ID CC) {
- switch (CC) {
- case CallingConv::AMDGPU_PS:
- return ".ps";
- case CallingConv::AMDGPU_VS:
- return ".vs";
- case CallingConv::AMDGPU_GS:
- return ".gs";
- case CallingConv::AMDGPU_ES:
- return ".es";
- case CallingConv::AMDGPU_HS:
- return ".hs";
- case CallingConv::AMDGPU_LS:
- return ".ls";
- case CallingConv::AMDGPU_Gfx:
- llvm_unreachable("Callable shader has no hardware stage");
- default:
- return ".cs";
- }
-}
-
msgpack::DocNode &AMDGPUPALMetadata::refHwStage() {
auto &N =
MsgPackDoc.getRoot()
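For illustration of the `getStageName(CC) + 1` idiom used in setEntryPoint above: the returned stage name starts with a '.', and the `+ 1` skips it so the `.entry_point` value becomes `_amdgpu_ps`, `_amdgpu_cs`, and so on. A minimal standalone sketch, with the stage string hard-coded here purely for demonstration:

#include <iostream>
#include <string>

// Stage name copied from the switch in the patch; only ".ps" is shown here.
static const char *getStageName() { return ".ps"; }

int main() {
  std::string EPName = "_amdgpu_";
  EPName += getStageName() + 1; // '+ 1' skips the leading '.'
  std::cout << EPName << '\n';  // prints "_amdgpu_ps"
  return 0;
}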
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 633a99d..74def43 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -163,6 +163,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
+def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
def True : Predicate<"true">;
def False : Predicate<"false">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 56d8b73..a0d00e4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -7582,3 +7582,44 @@ def GRIDDEPCONTROL_WAIT :
Requires<[hasSM<90>, hasPTX<78>]>;
def INT_EXIT : NVPTXInst<(outs), (ins), "exit;", [(int_nvvm_exit)]>;
+
+// Tcgen05 intrinsics
+let isConvergent = true in {
+
+multiclass TCGEN05_ALLOC_INTR<NVPTXRegClass rc, string AS, string num, Intrinsic Intr> {
+ def NAME : NVPTXInst<(outs),
+ (ins rc:$dst, Int32Regs:$ncols),
+ !strconcat("tcgen05.alloc.cta_group::", num, ".sync.aligned", AS, ".b32 [$dst], $ncols;"),
+ [(Intr rc:$dst, Int32Regs:$ncols)]>,
+ Requires<[hasTcgen05Instructions]>;
+}
+
+defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, "", "1", int_nvvm_tcgen05_alloc_cg1>;
+defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, "", "2", int_nvvm_tcgen05_alloc_cg2>;
+
+defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>;
+defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
+
+defm TCGEN05_ALLOC_S32_CG1 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>;
+defm TCGEN05_ALLOC_S32_CG2 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
+
+multiclass TCGEN05_DEALLOC_INTR<string num, Intrinsic Intr> {
+ def NAME : NVPTXInst<(outs),
+ (ins Int32Regs:$tmem_addr, Int32Regs:$ncols),
+ !strconcat("tcgen05.dealloc.cta_group::", num, ".sync.aligned.b32 $tmem_addr, $ncols;"),
+ [(Intr Int32Regs:$tmem_addr, Int32Regs:$ncols)]>,
+ Requires<[hasTcgen05Instructions]>;
+}
+defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1>;
+defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>;
+
+multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> {
+ def NAME : NVPTXInst<(outs), (ins),
+ !strconcat("tcgen05.relinquish_alloc_permit.cta_group::", num, ".sync.aligned;"),
+ [(Intr)]>,
+ Requires<[hasTcgen05Instructions]>;
+}
+defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>;
+defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>;
+
+} // isConvergent
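As a sanity check on the !strconcat above, the assembly string produced by, e.g., TCGEN05_ALLOC_CG1 (cta_group::1, no address-space suffix) can be reproduced with ordinary string concatenation; this is only an illustration of how the multiclass expands, not tool output:

#include <iostream>
#include <string>

int main() {
  // Mirrors the !strconcat operands for TCGEN05_ALLOC_CG1.
  std::string Num = "1", AS = "";
  std::string Asm = "tcgen05.alloc.cta_group::" + Num + ".sync.aligned" + AS +
                    ".b32 [$dst], $ncols;";
  std::cout << Asm << '\n';
  return 0;
}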
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 919f487..0c4420b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -93,6 +93,21 @@ public:
bool hasDotInstructions() const {
return SmVersion >= 61 && PTXVersion >= 50;
}
+ // Tcgen05 instructions in Blackwell family
+ bool hasTcgen05Instructions() const {
+ bool HasTcgen05 = false;
+ switch (FullSmVersion) {
+ default:
+ break;
+ case 1001: // sm_100a
+ case 1011: // sm_101a
+ HasTcgen05 = true;
+ break;
+ }
+
+ return HasTcgen05 && PTXVersion >= 86;
+ }
+
// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
// terminates a basic block. Instead, it would assume that control flow
// continued to the next instruction. The next instruction could be in the
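A minimal model of the new gate, assuming the FullSmVersion encoding shown in the switch above (1001 for sm_100a, 1011 for sm_101a) and the PTX 8.6 requirement from the patch:

#include <iostream>

// Standalone restatement of hasTcgen05Instructions() for illustration only.
static bool hasTcgen05(unsigned FullSmVersion, unsigned PTXVersion) {
  bool HasTcgen05 = FullSmVersion == 1001 || FullSmVersion == 1011;
  return HasTcgen05 && PTXVersion >= 86;
}

int main() {
  std::cout << hasTcgen05(1001, 86)  // 1: sm_100a with PTX 8.6
            << hasTcgen05(1001, 85)  // 0: PTX too old
            << hasTcgen05(900, 86)   // 0: not a Blackwell variant
            << '\n';
  return 0;
}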
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index e88027f..f2afa6f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -140,6 +140,9 @@ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
else if (UseShortPointers)
Ret += "-p3:32:32-p4:32:32-p5:32:32";
+  // Tensor Memory (addrspace:6) is always 32 bits.
+ Ret += "-p6:32:32";
+
Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
return Ret;
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index 36b8a24..274c7cb 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -65,9 +65,10 @@ public:
/// for a given imm form load/store opcode \p ImmFormOpcode.
/// FIXME: move this to PPCInstrInfo class.
unsigned getMappedIdxOpcForImmOpc(unsigned ImmOpcode) const {
- if (!ImmToIdxMap.count(ImmOpcode))
+ auto It = ImmToIdxMap.find(ImmOpcode);
+ if (It == ImmToIdxMap.end())
return PPC::INSTRUCTION_LIST_END;
- return ImmToIdxMap.find(ImmOpcode)->second;
+ return It->second;
}
/// getPointerRegClass - Return the register class to use to hold pointers.
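The PPC change above (like the R600 change earlier in this patch) replaces a double map lookup with a single one. A generic sketch of the pattern, using std::unordered_map and a stand-in sentinel rather than the ImmToIdxMap type from the source:

#include <iostream>
#include <unordered_map>

static const unsigned INSTRUCTION_LIST_END = 0; // stand-in sentinel value

unsigned getMappedOpc(const std::unordered_map<unsigned, unsigned> &Map,
                      unsigned Opcode) {
  // One lookup instead of count() followed by find().
  auto It = Map.find(Opcode);
  if (It == Map.end())
    return INSTRUCTION_LIST_END;
  return It->second;
}

int main() {
  std::unordered_map<unsigned, unsigned> Map{{10, 42}};
  std::cout << getMappedOpc(Map, 10) << ' ' << getMappedOpc(Map, 11) << '\n';
  return 0;
}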
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td
index ad06f47..98e05b7 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.td
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td
@@ -42,6 +42,8 @@ def CSR_ILP32D_LP64D_V
// Needed for implementation of RISCVRegisterInfo::getNoPreservedMask()
def CSR_NoRegs : CalleeSavedRegs<(add)>;
+def CSR_IPRA : CalleeSavedRegs<(add X1)>;
+
// Interrupt handler needs to save/restore all registers that are used,
// both Caller and Callee saved registers.
def CSR_Interrupt : CalleeSavedRegs<(add X1, (sequence "X%u", 5, 31))>;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8e3caf5..7c3b583 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -17759,6 +17759,83 @@ static SDValue combineScalarCTPOPToVCPOP(SDNode *N, SelectionDAG &DAG,
return DAG.getZExtOrTrunc(Pop, DL, VT);
}
+static SDValue performSHLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
+ // (shl (zext x), y) -> (vwsll x, y)
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
+ return V;
+
+ // (shl (sext x), C) -> (vwmulsu x, 1u << C)
+ // (shl (zext x), C) -> (vwmulu x, 1u << C)
+
+ if (!DCI.isAfterLegalizeDAG())
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ if (!LHS.hasOneUse())
+ return SDValue();
+ unsigned Opcode;
+ switch (LHS.getOpcode()) {
+ case ISD::SIGN_EXTEND:
+ case RISCVISD::VSEXT_VL:
+ Opcode = RISCVISD::VWMULSU_VL;
+ break;
+ case ISD::ZERO_EXTEND:
+ case RISCVISD::VZEXT_VL:
+ Opcode = RISCVISD::VWMULU_VL;
+ break;
+ default:
+ return SDValue();
+ }
+
+ SDValue RHS = N->getOperand(1);
+ APInt ShAmt;
+ uint64_t ShAmtInt;
+ if (ISD::isConstantSplatVector(RHS.getNode(), ShAmt))
+ ShAmtInt = ShAmt.getZExtValue();
+ else if (RHS.getOpcode() == RISCVISD::VMV_V_X_VL &&
+ RHS.getOperand(1).getOpcode() == ISD::Constant)
+ ShAmtInt = RHS.getConstantOperandVal(1);
+ else
+ return SDValue();
+
+ // Better foldings:
+ // (shl (sext x), 1) -> (vwadd x, x)
+ // (shl (zext x), 1) -> (vwaddu x, x)
+ if (ShAmtInt <= 1)
+ return SDValue();
+
+ SDValue NarrowOp = LHS.getOperand(0);
+ MVT NarrowVT = NarrowOp.getSimpleValueType();
+ uint64_t NarrowBits = NarrowVT.getScalarSizeInBits();
+ if (ShAmtInt >= NarrowBits)
+ return SDValue();
+ MVT VT = N->getSimpleValueType(0);
+ if (NarrowBits * 2 != VT.getScalarSizeInBits())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ SDValue Passthru, Mask, VL;
+ switch (N->getOpcode()) {
+ case ISD::SHL:
+ Passthru = DAG.getUNDEF(VT);
+ std::tie(Mask, VL) = getDefaultScalableVLOps(VT, DL, DAG, Subtarget);
+ break;
+ case RISCVISD::SHL_VL:
+ Passthru = N->getOperand(2);
+ Mask = N->getOperand(3);
+ VL = N->getOperand(4);
+ break;
+ default:
+ llvm_unreachable("Expected SHL");
+ }
+ return DAG.getNode(Opcode, DL, VT, NarrowOp,
+ DAG.getConstant(1ULL << ShAmtInt, SDLoc(RHS), NarrowVT),
+ Passthru, Mask, VL);
+}
+
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -18392,7 +18469,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::SHL_VL:
- if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
+ if (SDValue V = performSHLCombine(N, DCI, Subtarget))
return V;
[[fallthrough]];
case RISCVISD::SRA_VL:
@@ -18417,7 +18494,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SRL:
case ISD::SHL: {
if (N->getOpcode() == ISD::SHL) {
- if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
+ if (SDValue V = performSHLCombine(N, DCI, Subtarget))
return V;
}
SDValue ShAmt = N->getOperand(1);
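To make the profitability guards in performSHLCombine concrete, here is a small standalone check under the assumption of i16 elements zero-extended to i32 and shifted left by 3; the combine would then use a widening multiply by 1 << 3 = 8 (shifts by 0 or 1, and shifts that reach the narrow element width, are left alone):

#include <cstdint>
#include <iostream>

int main() {
  const uint64_t ShAmtInt = 3;    // splat shift amount
  const uint64_t NarrowBits = 16; // element width before the extend
  const uint64_t WideBits = 32;   // element width of the shift result

  // Same conditions as the combine: shift > 1, shift < narrow width, and the
  // result type exactly twice as wide as the extended source.
  bool Profitable =
      ShAmtInt > 1 && ShAmtInt < NarrowBits && NarrowBits * 2 == WideBits;
  if (Profitable)
    std::cout << "vwmulu/vwmulsu multiplier = " << (1ULL << ShAmtInt) << '\n';
  return 0;
}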
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index b0a5269..7a99bfd 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -56,6 +56,11 @@ RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode)
/*PC*/0, HwMode) {}
const MCPhysReg *
+RISCVRegisterInfo::getIPRACSRegs(const MachineFunction *MF) const {
+ return CSR_IPRA_SaveList;
+}
+
+const MCPhysReg *
RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
auto &Subtarget = MF->getSubtarget<RISCVSubtarget>();
if (MF->getFunction().getCallingConv() == CallingConv::GHC)
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 3ab79694..6c4e9c7 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -62,6 +62,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *getIPRACSRegs(const MachineFunction *MF) const override;
+
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
MCRegister PhysReg) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index fa7c7c5..cb2ec1d 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -940,6 +940,44 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
return NumLoads * MemOpCost;
}
+InstructionCost RISCVTTIImpl::getExpandCompressMemoryOpCost(
+ unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
+ TTI::TargetCostKind CostKind, const Instruction *I) {
+ bool IsLegal = (Opcode == Instruction::Store &&
+ isLegalMaskedCompressStore(DataTy, Alignment)) ||
+ (Opcode == Instruction::Load &&
+ isLegalMaskedExpandLoad(DataTy, Alignment));
+ if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
+ Alignment, CostKind, I);
+ // Example compressstore sequence:
+ // vsetivli zero, 8, e32, m2, ta, ma (ignored)
+ // vcompress.vm v10, v8, v0
+ // vcpop.m a1, v0
+ // vsetvli zero, a1, e32, m2, ta, ma
+ // vse32.v v10, (a0)
+ // Example expandload sequence:
+ // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
+ // vcpop.m a1, v0
+ // vsetvli zero, a1, e32, m2, ta, ma
+ // vle32.v v10, (a0)
+ // vsetivli zero, 8, e32, m2, ta, ma
+ // viota.m v12, v0
+ // vrgather.vv v8, v10, v12, v0.t
+ auto MemOpCost =
+ getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
+ auto LT = getTypeLegalizationCost(DataTy);
+ SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
+ if (VariableMask)
+ Opcodes.push_back(RISCV::VCPOP_M);
+ if (Opcode == Instruction::Store)
+ Opcodes.append({RISCV::VCOMPRESS_VM});
+ else
+ Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
+ return MemOpCost +
+ LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
+}
+
InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
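A rough, standalone restatement of the cost formula above for a variable-mask compress store; the per-instruction and memory-op costs are placeholders here (the real values come from getMemoryOpCost, getRISCVInstructionCost, and the legalization factor LT.first):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t MemOpCost = 1; // placeholder for getMemoryOpCost(...)
  const int64_t LTFirst = 1;   // placeholder for LT.first
  const int64_t PerInst = 1;   // placeholder per-instruction cost

  // Variable-mask compress store: VSETVLI + VCPOP_M + VCOMPRESS_VM.
  std::vector<const char *> Opcodes = {"VSETVLI", "VCPOP_M", "VCOMPRESS_VM"};
  int64_t Cost =
      MemOpCost + LTFirst * PerInst * static_cast<int64_t>(Opcodes.size());
  std::cout << "estimated cost = " << Cost << '\n'; // 4 with these placeholders
  return 0;
}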
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 042530b..5389e9b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -174,6 +174,12 @@ public:
TTI::TargetCostKind CostKind,
const Instruction *I);
+ InstructionCost getExpandCompressMemoryOpCost(unsigned Opcode, Type *Src,
+ bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
const Value *Ptr, bool VariableMask,
Align Alignment,
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 78db841..c202f7f 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -284,7 +284,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
// Adjust stack pointer.
int StackAdj = StackAdjust.getImm();
int MaxTCDelta = X86FI->getTCReturnAddrDelta();
- int Offset = 0;
+ int64_t Offset = 0;
assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
  // Incorporate the retaddr area.
@@ -297,7 +297,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
if (Offset) {
// Check for possible merge with preceding ADD instruction.
- Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
+ Offset = X86FL->mergeSPAdd(MBB, MBBI, Offset, true);
X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true);
}
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index a15db03..50c56c9 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -223,6 +223,8 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
return false;
}
+constexpr int64_t MaxSPChunk = (1LL << 31) - 1;
+
/// emitSPUpdate - Emit a series of instructions to increment / decrement the
/// stack pointer by a constant value.
void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
@@ -242,7 +244,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
return;
}
- uint64_t Chunk = (1LL << 31) - 1;
+ uint64_t Chunk = MaxSPChunk;
MachineFunction &MF = *MBB.getParent();
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
@@ -391,12 +393,15 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
return MI;
}
-int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- bool doMergeWithPrevious) const {
+template <typename FoundT, typename CalcT>
+int64_t X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ FoundT FoundStackAdjust,
+ CalcT CalcNewOffset,
+ bool doMergeWithPrevious) const {
if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
(!doMergeWithPrevious && MBBI == MBB.end()))
- return 0;
+ return CalcNewOffset(0);
MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
@@ -415,27 +420,38 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction())
PI = std::prev(PI);
- unsigned Opc = PI->getOpcode();
- int Offset = 0;
-
- if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&
- PI->getOperand(0).getReg() == StackPtr) {
- assert(PI->getOperand(1).getReg() == StackPtr);
- Offset = PI->getOperand(2).getImm();
- } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
- PI->getOperand(0).getReg() == StackPtr &&
- PI->getOperand(1).getReg() == StackPtr &&
- PI->getOperand(2).getImm() == 1 &&
- PI->getOperand(3).getReg() == X86::NoRegister &&
- PI->getOperand(5).getReg() == X86::NoRegister) {
- // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
- Offset = PI->getOperand(4).getImm();
- } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) &&
- PI->getOperand(0).getReg() == StackPtr) {
- assert(PI->getOperand(1).getReg() == StackPtr);
- Offset = -PI->getOperand(2).getImm();
- } else
- return 0;
+ int64_t Offset = 0;
+ for (;;) {
+ unsigned Opc = PI->getOpcode();
+
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ assert(PI->getOperand(1).getReg() == StackPtr);
+ Offset = PI->getOperand(2).getImm();
+ } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+ PI->getOperand(0).getReg() == StackPtr &&
+ PI->getOperand(1).getReg() == StackPtr &&
+ PI->getOperand(2).getImm() == 1 &&
+ PI->getOperand(3).getReg() == X86::NoRegister &&
+ PI->getOperand(5).getReg() == X86::NoRegister) {
+ // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
+ Offset = PI->getOperand(4).getImm();
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ assert(PI->getOperand(1).getReg() == StackPtr);
+ Offset = -PI->getOperand(2).getImm();
+ } else
+ return CalcNewOffset(0);
+
+ FoundStackAdjust(PI, Offset);
+ if (std::abs((int64_t)CalcNewOffset(Offset)) < MaxSPChunk)
+ break;
+
+ if (doMergeWithPrevious ? (PI == MBB.begin()) : (PI == MBB.end()))
+ return CalcNewOffset(0);
+
+ PI = doMergeWithPrevious ? std::prev(PI) : std::next(PI);
+ }
PI = MBB.erase(PI);
if (PI != MBB.end() && PI->isCFIInstruction()) {
@@ -448,7 +464,16 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
if (!doMergeWithPrevious)
MBBI = skipDebugInstructionsForward(PI, MBB.end());
- return Offset;
+ return CalcNewOffset(Offset);
+}
+
+int64_t X86FrameLowering::mergeSPAdd(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ int64_t AddOffset,
+ bool doMergeWithPrevious) const {
+ return mergeSPUpdates(
+ MBB, MBBI, [AddOffset](int64_t Offset) { return AddOffset + Offset; },
+ doMergeWithPrevious);
}
void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
@@ -1975,8 +2000,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// If there is an SUB32ri of ESP immediately before this instruction, merge
// the two. This can be the case when tail call elimination is enabled and
- // the callee has more arguments then the caller.
- NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+ // the callee has more arguments than the caller.
+ NumBytes = mergeSPUpdates(
+ MBB, MBBI, [NumBytes](int64_t Offset) { return NumBytes - Offset; },
+ true);
// Adjust stack pointer: ESP -= numbytes.
@@ -2457,7 +2484,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (HasFP) {
if (X86FI->hasSwiftAsyncContext()) {
// Discard the context.
- int Offset = 16 + mergeSPUpdates(MBB, MBBI, true);
+ int64_t Offset = mergeSPAdd(MBB, MBBI, 16, true);
emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
}
// Pop EBP.
@@ -2531,7 +2558,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// If there is an ADD32ri or SUB32ri of ESP immediately before this
// instruction, merge the two instructions.
if (NumBytes || MFI.hasVarSizedObjects())
- NumBytes += mergeSPUpdates(MBB, MBBI, true);
+ NumBytes = mergeSPAdd(MBB, MBBI, NumBytes, true);
// If dynamic alloca is used, then reset esp to point to the last callee-saved
// slot before popping them off! Same applies for the case, when stack was
@@ -2612,11 +2639,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
// Add the return addr area delta back since we are not tail calling.
- int Offset = -1 * X86FI->getTCReturnAddrDelta();
+ int64_t Offset = -1 * X86FI->getTCReturnAddrDelta();
assert(Offset >= 0 && "TCDelta should never be positive");
if (Offset) {
// Check for possible merge with preceding ADD instruction.
- Offset += mergeSPUpdates(MBB, Terminator, true);
+ Offset = mergeSPAdd(MBB, Terminator, Offset, true);
emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
}
}
@@ -3814,13 +3841,24 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
// Add Amount to SP to destroy a frame, or subtract to setup.
int64_t StackAdjustment = isDestroy ? Amount : -Amount;
+ int64_t CfaAdjustment = StackAdjustment;
if (StackAdjustment) {
// Merge with any previous or following adjustment instruction. Note: the
// instructions merged with here do not have CFI, so their stack
- // adjustments do not feed into CfaAdjustment.
- StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
- StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
+ // adjustments do not feed into CfaAdjustment
+
+ auto CalcCfaAdjust = [&CfaAdjustment](MachineBasicBlock::iterator PI,
+ int64_t Offset) {
+ CfaAdjustment += Offset;
+ };
+ auto CalcNewOffset = [&StackAdjustment](int64_t Offset) {
+ return StackAdjustment + Offset;
+ };
+ StackAdjustment =
+ mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, true);
+ StackAdjustment =
+ mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, false);
if (StackAdjustment) {
if (!(F.hasMinSize() &&
@@ -3830,7 +3868,7 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
}
}
- if (DwarfCFI && !hasFP(MF)) {
+ if (DwarfCFI && !hasFP(MF) && CfaAdjustment) {
// If we don't have FP, but need to generate unwind information,
// we need to set the correct CFA offset after the stack adjustment.
// How much we adjust the CFA offset depends on whether we're emitting
@@ -3838,14 +3876,11 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
// offset to be correct at each call site, while for debugging we want
// it to be more precise.
- int64_t CfaAdjustment = -StackAdjustment;
// TODO: When not using precise CFA, we also need to adjust for the
// InternalAmt here.
- if (CfaAdjustment) {
- BuildCFI(
- MBB, InsertPos, DL,
- MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment));
- }
+ BuildCFI(
+ MBB, InsertPos, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, -CfaAdjustment));
}
return I;
diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index 02fe8ee..ef41b46 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -134,12 +134,50 @@ public:
processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
RegScavenger *RS) const override;
- /// Check the instruction before/after the passed instruction. If
- /// it is an ADD/SUB/LEA instruction it is deleted argument and the
- /// stack adjustment is returned as a positive value for ADD/LEA and
- /// a negative for SUB.
- int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- bool doMergeWithPrevious) const;
+private:
+ /// Basic Pseudocode:
+ /// if (instruction before/after the passed instruction is ADD/SUB/LEA)
+ /// Offset = instruction stack adjustment
+ /// ... positive value for ADD/LEA and negative for SUB
+ /// FoundStackAdjust(instruction, Offset)
+ /// erase(instruction)
+ /// return CalcNewOffset(Offset)
+ /// else
+ /// return CalcNewOffset(0)
+ ///
+ /// It's possible that the selected instruction is not immediately
+ /// before/after MBBI for large adjustments that have been split into multiple
+ /// instructions.
+ ///
+ /// FoundStackAdjust should have the signature:
+ /// void FoundStackAdjust(MachineBasicBlock::iterator PI, int64_t Offset)
+ /// CalcNewOffset should have the signature:
+ /// int64_t CalcNewOffset(int64_t Offset)
+ template <typename FoundT, typename CalcT>
+ int64_t mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ FoundT FoundStackAdjust, CalcT CalcNewOffset,
+ bool doMergeWithPrevious) const;
+
+ template <typename CalcT>
+ int64_t mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI, CalcT CalcNewOffset,
+ bool doMergeWithPrevious) const {
+ auto FoundStackAdjust = [](MachineBasicBlock::iterator MBBI,
+ int64_t Offset) {};
+ return mergeSPUpdates(MBB, MBBI, FoundStackAdjust, CalcNewOffset,
+ doMergeWithPrevious);
+ }
+
+public:
+ /// Equivalent to:
+ /// mergeSPUpdates(MBB, MBBI,
+ /// [AddOffset](int64_t Offset) {
+ /// return AddOffset + Offset;
+ /// },
+ /// doMergeWithPrevious);
+ int64_t mergeSPAdd(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ int64_t AddOffset, bool doMergeWithPrevious) const;
/// Emit a series of instructions to increment / decrement the stack
/// pointer by a constant value.
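A toy, non-LLVM model of the callback-based merge described above: a preceding `add esp, 8` (Offset = +8) folded into a 40-byte prologue allocation yields a single `sub esp, 32`. The real mergeSPUpdates also erases the matched ADD/SUB/LEA and keeps walking when the merged value would exceed MaxSPChunk.

#include <cstdint>
#include <iostream>

// Toy stand-in for mergeSPUpdates: it only applies the callback, whereas the
// real function also locates and erases the neighbouring ADD/SUB/LEA.
template <typename CalcT>
int64_t mergeToy(int64_t FoundOffset, CalcT CalcNewOffset) {
  return CalcNewOffset(FoundOffset);
}

int main() {
  int64_t NumBytes = 40; // bytes the prologue wants to allocate
  int64_t Merged = mergeToy(
      /*preceding "add esp, 8"*/ 8,
      [NumBytes](int64_t Offset) { return NumBytes - Offset; });
  std::cout << "sub esp, " << Merged << '\n'; // sub esp, 32
  return 0;
}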
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8f90420..6cf6061 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2944,7 +2944,7 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model CM,
}
/// Return true if the condition is a signed comparison operation.
-static bool isX86CCSigned(unsigned X86CC) {
+static bool isX86CCSigned(X86::CondCode X86CC) {
switch (X86CC) {
default:
llvm_unreachable("Invalid integer condition!");
@@ -22975,7 +22975,7 @@ static bool isProfitableToUseFlagOp(SDValue Op) {
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
-static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
+static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// CF and OF aren't always set the way we want. Determine which
// of these we need.
@@ -23085,7 +23085,7 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
-static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC,
const SDLoc &dl, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (isNullConstant(Op1))
@@ -23157,10 +23157,17 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
return Add.getValue(1);
}
- // Use SUB instead of CMP to enable CSE between SUB and CMP.
+ // If we already have an XOR of the ops, use that to check for equality.
+ // Else use SUB instead of CMP to enable CSE between SUB and CMP.
+ unsigned X86Opc = X86ISD::SUB;
+ if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
+ (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
+ DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
+ X86Opc = X86ISD::XOR;
+
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
- SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
- return Sub.getValue(1);
+ SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
+ return CmpOp.getValue(1);
}
bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
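A hypothetical source pattern (not taken from the patch) that benefits from the new EmitCmp behaviour: when the DAG already contains `xor a, b`, an EQ/NE test of a and b can read ZF from that XOR instead of emitting a separate SUB/CMP.

// The xor result is needed as a value anyway, so reusing its flags for the
// a == b test lets the SUB/CMP disappear.
unsigned pick(unsigned a, unsigned b, unsigned c) {
  unsigned x = a ^ b;
  return (a == b) ? c : x;
}

int main() { return pick(1, 2, 3) == 3 ? 0 : 1; }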
diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp
index 9bcfa6c..8f97537 100644
--- a/llvm/lib/TargetParser/ARMTargetParser.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParser.cpp
@@ -403,13 +403,12 @@ static ARM::FPUKind findSinglePrecisionFPU(ARM::FPUKind InputFPUKind) {
if (!ARM::isDoublePrecision(InputFPU.Restriction))
return InputFPUKind;
- // Otherwise, look for an FPU entry with all the same fields, except
- // that it does not support double precision.
+  // Otherwise, look for an FPU entry that has the same FPUVer and is not
+  // double precision. We deliberately allow the NeonSupport and Restriction
+  // fields to differ, so CPUs such as Cortex-R52 can select between SP-only
+  // and full DP modes.
for (const ARM::FPUName &CandidateFPU : ARM::FPUNames) {
if (CandidateFPU.FPUVer == InputFPU.FPUVer &&
- CandidateFPU.NeonSupport == InputFPU.NeonSupport &&
- ARM::has32Regs(CandidateFPU.Restriction) ==
- ARM::has32Regs(InputFPU.Restriction) &&
!ARM::isDoublePrecision(CandidateFPU.Restriction)) {
return CandidateFPU.ID;
}
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index 41bc67f..34ddeeb 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -1184,22 +1184,22 @@ static std::optional<unsigned> getGVNForPHINode(OutlinableRegion &Region,
for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; Idx++) {
Incoming = PN->getIncomingValue(Idx);
IncomingBlock = PN->getIncomingBlock(Idx);
+ // If the incoming block isn't in the region, we don't have to worry about
+ // this incoming value.
+ if (!Blocks.contains(IncomingBlock))
+ continue;
+
// If we cannot find a GVN, and the incoming block is included in the region
// this means that the input to the PHINode is not included in the region we
// are trying to analyze, meaning, that if it was outlined, we would be
// adding an extra input. We ignore this case for now, and so ignore the
// region.
std::optional<unsigned> OGVN = Cand.getGVN(Incoming);
- if (!OGVN && Blocks.contains(IncomingBlock)) {
+ if (!OGVN) {
Region.IgnoreRegion = true;
return std::nullopt;
}
- // If the incoming block isn't in the region, we don't have to worry about
- // this incoming value.
- if (!Blocks.contains(IncomingBlock))
- continue;
-
// Collect the canonical numbers of the values in the PHINode.
unsigned GVN = *OGVN;
OGVN = Cand.getCanonicalNum(GVN);
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index eab15ac..f3f2e50 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3493,11 +3493,28 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Instrument generic vector reduction intrinsics
// by ORing together all their fields.
+ //
+ // The return type does not need to be the same type as the fields
+ // e.g., declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>)
void handleVectorReduceIntrinsic(IntrinsicInst &I) {
IRBuilder<> IRB(&I);
Value *S = IRB.CreateOrReduce(getShadow(&I, 0));
+ S = CreateShadowCast(IRB, S, getShadowTy(&I));
setShadow(&I, S);
- setOrigin(&I, getOrigin(&I, 0));
+ setOriginForNaryOp(I);
+ }
+
+ // Similar to handleVectorReduceIntrinsic but with an initial starting value.
+ // e.g., call float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float>
+ // %a1)
+ // shadow = shadow[a0] | shadow[a1.0] | shadow[a1.1]
+ void handleVectorReduceWithStarterIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Shadow0 = getShadow(&I, 0);
+ Value *Shadow1 = IRB.CreateOrReduce(getShadow(&I, 1));
+ Value *S = IRB.CreateOr(Shadow0, Shadow1);
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
}
// Instrument vector.reduce.or intrinsic.
@@ -4346,8 +4363,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::vector_reduce_add:
case Intrinsic::vector_reduce_xor:
case Intrinsic::vector_reduce_mul:
+ // Add reduction to scalar
+ case Intrinsic::aarch64_neon_faddv:
+ case Intrinsic::aarch64_neon_saddv:
+ case Intrinsic::aarch64_neon_uaddv:
handleVectorReduceIntrinsic(I);
break;
+ case Intrinsic::vector_reduce_fadd:
+ case Intrinsic::vector_reduce_fmul:
+ handleVectorReduceWithStarterIntrinsic(I);
+ break;
+
case Intrinsic::x86_sse_stmxcsr:
handleStmxcsr(I);
break;
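A conceptual model (plain C++, not MSan IR) of the shadow rule added for reductions with a start value: the result's shadow is the start operand's shadow OR'ed with the OR-reduction of the vector operand's per-lane shadows. The bit patterns below are illustrative only.

#include <cstdint>
#include <iostream>

int main() {
  uint32_t StarterShadow = 0x0;           // start value fully initialized
  uint32_t LaneShadows[2] = {0x0, 0xffu}; // second lane is poisoned
  uint32_t Reduced = LaneShadows[0] | LaneShadows[1];
  uint32_t ResultShadow = StarterShadow | Reduced;
  std::cout << std::hex << ResultShadow << '\n'; // ff -> result is poisoned
  return 0;
}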
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 539c922..558d75c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7634,6 +7634,60 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
+/// Builds the arguments types vector for the given call instruction with the
+/// given \p ID for the specified vector factor.
+static SmallVector<Type *>
+buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
+ const unsigned VF, unsigned MinBW,
+ const TargetTransformInfo *TTI) {
+ SmallVector<Type *> ArgTys;
+ for (auto [Idx, Arg] : enumerate(CI->args())) {
+ if (ID != Intrinsic::not_intrinsic) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
+ ArgTys.push_back(Arg->getType());
+ continue;
+ }
+ if (MinBW > 0) {
+ ArgTys.push_back(
+ getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
+ continue;
+ }
+ }
+ ArgTys.push_back(getWidenedType(Arg->getType(), VF));
+ }
+ return ArgTys;
+}
+
+/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
+/// function (if possible) calls.
+static std::pair<InstructionCost, InstructionCost>
+getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+ ArrayRef<Type *> ArgTys) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the scalar and vector calls.
+ FastMathFlags FMF;
+ if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
+ FMF = FPCI->getFastMathFlags();
+ IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
+ auto IntrinsicCost =
+ TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
+
+ auto Shape = VFShape::get(CI->getFunctionType(),
+ ElementCount::getFixed(VecTy->getNumElements()),
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ auto LibCost = IntrinsicCost;
+ if (!CI->isNoBuiltin() && VecFunc) {
+ // Calculate the cost of the vector library call.
+ // If the corresponding vector call is cheaper, return its cost.
+ LibCost =
+ TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
+ }
+ return {IntrinsicCost, LibCost};
+}
+
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
const InstructionsState &S, ArrayRef<Value *> VL,
bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
@@ -9017,34 +9071,6 @@ bool BoUpSLP::areAllUsersVectorized(
});
}
-static std::pair<InstructionCost, InstructionCost>
-getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
- TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
- ArrayRef<Type *> ArgTys) {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- // Calculate the cost of the scalar and vector calls.
- FastMathFlags FMF;
- if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
- FMF = FPCI->getFastMathFlags();
- IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
- auto IntrinsicCost =
- TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
-
- auto Shape = VFShape::get(CI->getFunctionType(),
- ElementCount::getFixed(VecTy->getNumElements()),
- false /*HasGlobalPred*/);
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
- auto LibCost = IntrinsicCost;
- if (!CI->isNoBuiltin() && VecFunc) {
- // Calculate the cost of the vector library call.
- // If the corresponding vector call is cheaper, return its cost.
- LibCost =
- TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
- }
- return {IntrinsicCost, LibCost};
-}
-
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
SmallVectorImpl<Value *> *OpScalars,
@@ -11045,30 +11071,6 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
return TTI::CastContextHint::None;
}
-/// Builds the arguments types vector for the given call instruction with the
-/// given \p ID for the specified vector factor.
-static SmallVector<Type *>
-buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
- const unsigned VF, unsigned MinBW,
- const TargetTransformInfo *TTI) {
- SmallVector<Type *> ArgTys;
- for (auto [Idx, Arg] : enumerate(CI->args())) {
- if (ID != Intrinsic::not_intrinsic) {
- if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
- ArgTys.push_back(Arg->getType());
- continue;
- }
- if (MinBW > 0) {
- ArgTys.push_back(
- getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
- continue;
- }
- }
- ArgTys.push_back(getWidenedType(Arg->getType(), VF));
- }
- return ArgTys;
-}
-
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {