Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/Loads.cpp | 6
-rw-r--r--  llvm/lib/Analysis/ValueTracking.cpp | 15
-rw-r--r--  llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 55
-rw-r--r--  llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp | 9
-rw-r--r--  llvm/lib/CodeGen/CodeGenPrepare.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 12
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/Utils.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/MachineFunctionSplitter.cpp | 11
-rw-r--r--  llvm/lib/CodeGen/MachinePassManager.cpp | 183
-rw-r--r--  llvm/lib/CodeGen/MachinePipeliner.cpp | 1
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 144
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 17
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 37
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 16
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/MatchContext.h | 175
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 79
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 29
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 10
-rw-r--r--  llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp | 73
-rw-r--r--  llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 22
-rw-r--r--  llvm/lib/IR/AsmWriter.cpp | 78
-rw-r--r--  llvm/lib/IR/Attributes.cpp | 5
-rw-r--r--  llvm/lib/IR/BasicBlock.cpp | 18
-rw-r--r--  llvm/lib/IR/Constants.cpp | 94
-rw-r--r--  llvm/lib/IR/DebugProgramInstruction.cpp | 40
-rw-r--r--  llvm/lib/IR/Instructions.cpp | 1
-rw-r--r--  llvm/lib/IR/LLVMContextImpl.cpp | 2
-rw-r--r--  llvm/lib/IR/LLVMContextImpl.h | 4
-rw-r--r--  llvm/lib/LTO/LTOBackend.cpp | 2
-rw-r--r--  llvm/lib/Object/SymbolSize.cpp | 7
-rw-r--r--  llvm/lib/Passes/PassBuilder.cpp | 48
-rw-r--r--  llvm/lib/Passes/PassBuilderPipelines.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 65
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 14
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedTSV110.td | 6
-rw-r--r--  llvm/lib/Target/AArch64/AArch64StackTagging.cpp | 11
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 23
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 425
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h | 21
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 25
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 28
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/SIModeRegister.cpp | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 20
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 54
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP1Instructions.td | 79
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 54
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPCInstructions.td | 500
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 17
-rw-r--r--  llvm/lib/Target/ARM/ARMLegalizerInfo.cpp | 56
-rw-r--r--  llvm/lib/Target/DirectX/DXIL.td | 80
-rw-r--r--  llvm/lib/Target/Hexagon/CMakeLists.txt | 3
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp | 3
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp | 274
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 56
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonInstrInfo.h | 2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp | 689
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp | 32
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp | 324
-rw-r--r--  llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h | 12
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 30
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 257
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 8
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 166
-rw-r--r--  llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp | 31
-rw-r--r--  llvm/lib/Target/RISCV/RISCVScheduleV.td | 21
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.h | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 7
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 17
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 5
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVISelLowering.h | 3
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 9
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 36
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 16
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 16
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 11
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 10
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td | 4
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 19
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.h | 3
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 7
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp | 5
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp | 26
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp | 9
-rw-r--r--  llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp | 9
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.h | 7
-rw-r--r--  llvm/lib/TargetParser/Host.cpp | 7
-rw-r--r--  llvm/lib/Transforms/IPO/HotColdSplitting.cpp | 155
-rw-r--r--  llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 1
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 23
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 14
-rw-r--r--  llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 10
-rw-r--r--  llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 13
-rw-r--r--  llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 49
-rw-r--r--  llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp | 13
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 10
-rw-r--r--  llvm/lib/Transforms/Utils/CodeExtractor.cpp | 45
-rw-r--r--  llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Utils/ValueMapper.cpp | 5
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 446
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h | 3
122 files changed, 4053 insertions, 1716 deletions
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 6bf0d2f..5916d2a 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -364,7 +364,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size,
if (Size.getBitWidth() > 64)
return false;
- const uint64_t LoadSize = Size.getZExtValue();
+ const TypeSize LoadSize = TypeSize::getFixed(Size.getZExtValue());
// Otherwise, be a little bit aggressive by scanning the local block where we
// want to check to see if the pointer is already being loaded or stored
@@ -414,11 +414,11 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size,
// Handle trivial cases.
if (AccessedPtr == V &&
- LoadSize <= DL.getTypeStoreSize(AccessedTy))
+ TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy)))
return true;
if (AreEquivalentAddressValues(AccessedPtr->stripPointerCasts(), V) &&
- LoadSize <= DL.getTypeStoreSize(AccessedTy))
+ TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy)))
return true;
}
return false;
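
The switch from a plain uint64_t to TypeSize above matters once scalable vector types are involved: DL.getTypeStoreSize() then returns a vscale-relative quantity, and only the isKnownLE comparison is meaningful. A minimal sketch of that comparison (the helper name fitsInStore is illustrative, not part of the patch):

    #include "llvm/Support/TypeSize.h"

    // Returns true only when the fixed load size is provably <= the store
    // size, even if StoreSize is scalable (a runtime multiple of vscale).
    static bool fitsInStore(uint64_t LoadBytes, llvm::TypeSize StoreSize) {
      return llvm::TypeSize::isKnownLE(llvm::TypeSize::getFixed(LoadBytes),
                                       StoreSize);
    }
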
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 04f3172..653b3d4 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -7194,6 +7194,21 @@ bool llvm::propagatesPoison(const Use &PoisonOp) {
// corresponding lanes are poison.
return true;
case Intrinsic::ctpop:
+ case Intrinsic::ctlz:
+ case Intrinsic::cttz:
+ case Intrinsic::abs:
+ case Intrinsic::smax:
+ case Intrinsic::smin:
+ case Intrinsic::umax:
+ case Intrinsic::umin:
+ case Intrinsic::bitreverse:
+ case Intrinsic::bswap:
+ case Intrinsic::sadd_sat:
+ case Intrinsic::ssub_sat:
+ case Intrinsic::sshl_sat:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::usub_sat:
+ case Intrinsic::ushl_sat:
return true;
}
}
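
The intrinsics added above all yield poison whenever an operand is poison, so propagatesPoison() can now report true for their operand uses. A hedged sketch of how that could be checked for llvm.smax (the function and helper names here are illustrative, not from the patch):

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"

    // Build i32 @f(i32 %a, i32 %b) returning smax(%a, %b) and query poison
    // propagation through the first operand of the intrinsic call.
    static bool smaxPropagatesPoison(llvm::LLVMContext &Ctx) {
      using namespace llvm;
      Module M("example", Ctx);
      IRBuilder<> B(Ctx);
      auto *FTy = FunctionType::get(B.getInt32Ty(),
                                    {B.getInt32Ty(), B.getInt32Ty()}, false);
      Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
      BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
      B.SetInsertPoint(BB);
      Value *Max =
          B.CreateBinaryIntrinsic(Intrinsic::smax, F->getArg(0), F->getArg(1));
      B.CreateRet(Max);
      return propagatesPoison(cast<Instruction>(Max)->getOperandUse(0));
    }
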
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 515a1d0..832907a 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -3060,48 +3060,49 @@ Error BitcodeReader::parseConstants() {
V = Constant::getNullValue(CurTy);
break;
case bitc::CST_CODE_INTEGER: // INTEGER: [intval]
- if (!CurTy->isIntegerTy() || Record.empty())
+ if (!CurTy->isIntOrIntVectorTy() || Record.empty())
return error("Invalid integer const record");
V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0]));
break;
case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval]
- if (!CurTy->isIntegerTy() || Record.empty())
+ if (!CurTy->isIntOrIntVectorTy() || Record.empty())
return error("Invalid wide integer const record");
- APInt VInt =
- readWideAPInt(Record, cast<IntegerType>(CurTy)->getBitWidth());
- V = ConstantInt::get(Context, VInt);
-
+ auto *ScalarTy = cast<IntegerType>(CurTy->getScalarType());
+ APInt VInt = readWideAPInt(Record, ScalarTy->getBitWidth());
+ V = ConstantInt::get(CurTy, VInt);
break;
}
case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval]
if (Record.empty())
return error("Invalid float const record");
- if (CurTy->isHalfTy())
- V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf(),
- APInt(16, (uint16_t)Record[0])));
- else if (CurTy->isBFloatTy())
- V = ConstantFP::get(Context, APFloat(APFloat::BFloat(),
- APInt(16, (uint32_t)Record[0])));
- else if (CurTy->isFloatTy())
- V = ConstantFP::get(Context, APFloat(APFloat::IEEEsingle(),
- APInt(32, (uint32_t)Record[0])));
- else if (CurTy->isDoubleTy())
- V = ConstantFP::get(Context, APFloat(APFloat::IEEEdouble(),
- APInt(64, Record[0])));
- else if (CurTy->isX86_FP80Ty()) {
+
+ auto *ScalarTy = CurTy->getScalarType();
+ if (ScalarTy->isHalfTy())
+ V = ConstantFP::get(CurTy, APFloat(APFloat::IEEEhalf(),
+ APInt(16, (uint16_t)Record[0])));
+ else if (ScalarTy->isBFloatTy())
+ V = ConstantFP::get(
+ CurTy, APFloat(APFloat::BFloat(), APInt(16, (uint32_t)Record[0])));
+ else if (ScalarTy->isFloatTy())
+ V = ConstantFP::get(CurTy, APFloat(APFloat::IEEEsingle(),
+ APInt(32, (uint32_t)Record[0])));
+ else if (ScalarTy->isDoubleTy())
+ V = ConstantFP::get(
+ CurTy, APFloat(APFloat::IEEEdouble(), APInt(64, Record[0])));
+ else if (ScalarTy->isX86_FP80Ty()) {
// Bits are not stored the same way as a normal i80 APInt, compensate.
uint64_t Rearrange[2];
Rearrange[0] = (Record[1] & 0xffffLL) | (Record[0] << 16);
Rearrange[1] = Record[0] >> 48;
- V = ConstantFP::get(Context, APFloat(APFloat::x87DoubleExtended(),
- APInt(80, Rearrange)));
- } else if (CurTy->isFP128Ty())
- V = ConstantFP::get(Context, APFloat(APFloat::IEEEquad(),
- APInt(128, Record)));
- else if (CurTy->isPPC_FP128Ty())
- V = ConstantFP::get(Context, APFloat(APFloat::PPCDoubleDouble(),
- APInt(128, Record)));
+ V = ConstantFP::get(
+ CurTy, APFloat(APFloat::x87DoubleExtended(), APInt(80, Rearrange)));
+ } else if (ScalarTy->isFP128Ty())
+ V = ConstantFP::get(CurTy,
+ APFloat(APFloat::IEEEquad(), APInt(128, Record)));
+ else if (ScalarTy->isPPC_FP128Ty())
+ V = ConstantFP::get(
+ CurTy, APFloat(APFloat::PPCDoubleDouble(), APInt(128, Record)));
else
V = UndefValue::get(CurTy);
break;
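
The reader change relies on ConstantInt::get / ConstantFP::get accepting a vector type and splatting the scalar value across it, which is how splat vector constants can round-trip through the INTEGER/WIDE_INTEGER/FLOAT records. A minimal standalone sketch of that splat behaviour (not code from the patch):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"

    // ConstantInt::get with a vector type returns a splat constant, e.g.
    // <4 x i32> <i32 42, i32 42, i32 42, i32 42>.
    static llvm::Constant *makeSplat(llvm::LLVMContext &Ctx) {
      auto *VecTy = llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
      return llvm::ConstantInt::get(VecTy, llvm::APInt(32, 42));
    }
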
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 13be0b0..656f2a6 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -2624,7 +2624,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
}
} else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
Code = bitc::CST_CODE_FLOAT;
- Type *Ty = CFP->getType();
+ Type *Ty = CFP->getType()->getScalarType();
if (Ty->isHalfTy() || Ty->isBFloatTy() || Ty->isFloatTy() ||
Ty->isDoubleTy()) {
Record.push_back(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index 7b66a85..3b84624 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -829,11 +829,7 @@ class MemLocFragmentFill {
void process(BasicBlock &BB, VarFragMap &LiveSet) {
BBInsertBeforeMap[&BB].clear();
for (auto &I : BB) {
- for (DbgRecord &DR : I.getDbgValueRange()) {
- // FIXME: DPValue::filter usage needs attention in this file; we need
- // to make sure dbg.labels are handled correctly in RemoveDIs mode.
- // Cast below to ensure this gets fixed when DPLabels are introduced.
- DPValue &DPV = cast<DPValue>(DR);
+ for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) {
if (const auto *Locs = FnVarLocs->getWedge(&DPV)) {
for (const VarLocInfo &Loc : *Locs) {
addDef(Loc, &DPV, *I.getParent(), LiveSet);
@@ -1919,6 +1915,9 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
// attached DPValues, or a non-debug instruction with attached unprocessed
// DPValues.
if (II != EI && II->hasDbgValues()) {
+ // Skip over non-variable debug records (i.e., labels). They're going to
+ // be read from IR (possibly re-ordering them within the debug record
+ // range) rather than from the analysis results.
for (DPValue &DPV : DPValue::filter(II->getDbgValueRange())) {
resetInsertionPoint(DPV);
processDPValue(DPV, LiveSet);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 4036f18..feefe87 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2686,8 +2686,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,
attributesPermitTailCall(F, CI, RetI, *TLI)) {
// Either we return void or the return value must be the first
// argument of a known intrinsic or library function.
- if (!V || (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) &&
- V == CI->getArgOperand(0))) {
+ if (!V || isa<UndefValue>(V) ||
+ (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) &&
+ V == CI->getArgOperand(0))) {
TailCallBBs.push_back(Pred);
}
}
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 3bd1542..77dc265 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -187,7 +187,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
if (!lowerCall(MIRBuilder, Info))
return false;
- if (ReturnHintAlignReg && !Info.IsTailCall) {
+ if (ReturnHintAlignReg && !Info.LoweredTailCall) {
MIRBuilder.buildAssertAlign(ResRegs[0], ReturnHintAlignReg,
ReturnHintAlign);
}
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 7c95cef..38bb808 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -3275,7 +3275,17 @@ void IRTranslator::translateDbgDeclareRecord(Value *Address, bool HasArgList,
void IRTranslator::translateDbgInfo(const Instruction &Inst,
MachineIRBuilder &MIRBuilder) {
- for (DPValue &DPV : DPValue::filter(Inst.getDbgValueRange())) {
+ for (DbgRecord &DR : Inst.getDbgValueRange()) {
+ if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+ MIRBuilder.setDebugLoc(DPL->getDebugLoc());
+ assert(DPL->getLabel() && "Missing label");
+ assert(DPL->getLabel()->isValidLocationForIntrinsic(
+ MIRBuilder.getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+ MIRBuilder.buildDbgLabel(DPL->getLabel());
+ continue;
+ }
+ DPValue &DPV = cast<DPValue>(DR);
const DILocalVariable *Variable = DPV.getVariable();
const DIExpression *Expression = DPV.getExpression();
Value *V = DPV.getVariableLocationOp(0);
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 26fd12f..23ad68b 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -660,8 +660,11 @@ std::optional<APInt> llvm::ConstantFoldBinOp(unsigned Opcode,
default:
break;
case TargetOpcode::G_ADD:
- case TargetOpcode::G_PTR_ADD:
return C1 + C2;
+ case TargetOpcode::G_PTR_ADD:
+ // Types can be of different width here.
+ // Result needs to be the same width as C1, so trunc or sext C2.
+ return C1 + C2.sextOrTrunc(C1.getBitWidth());
case TargetOpcode::G_AND:
return C1 & C2;
case TargetOpcode::G_ASHR:
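
G_PTR_ADD's offset operand may be narrower than the pointer, so the constant fold above first resizes the offset to the pointer's width. A worked sketch of that arithmetic (the helper name is ours):

    #include "llvm/ADT/APInt.h"

    // Fold a pointer constant plus a possibly narrower offset constant; the
    // result keeps the pointer's width, mirroring the G_PTR_ADD case above.
    static llvm::APInt foldPtrAdd(const llvm::APInt &Ptr,
                                  const llvm::APInt &Off) {
      return Ptr + Off.sextOrTrunc(Ptr.getBitWidth());
    }
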
diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
index 38c1c56..0ddd945 100644
--- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
@@ -109,12 +109,6 @@ static bool isColdBlock(const MachineBasicBlock &MBB,
const MachineBlockFrequencyInfo *MBFI,
ProfileSummaryInfo *PSI) {
std::optional<uint64_t> Count = MBFI->getBlockProfileCount(&MBB);
-
- // Temporary hack to cope with AArch64's jump table encoding
- const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo();
- if (!TII.isMBBSafeToSplitToCold(MBB))
- return false;
-
// For instrumentation profiles and sample profiles, we use different ways
// to judge whether a block is cold and should be split.
if (PSI->hasInstrumentationProfile() || PSI->hasCSInstrumentationProfile()) {
@@ -178,7 +172,8 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
if (MBB.isEHPad())
LandingPads.push_back(&MBB);
- else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) && !SplitAllEHCode)
+ else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) &&
+ TII.isMBBSafeToSplitToCold(MBB) && !SplitAllEHCode)
MBB.setSectionID(MBBSectionID::ColdSectionID);
}
@@ -190,7 +185,7 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
// Here we have UseProfileData == true.
bool HasHotLandingPads = false;
for (const MachineBasicBlock *LP : LandingPads) {
- if (!isColdBlock(*LP, MBFI, PSI))
+ if (!isColdBlock(*LP, MBFI, PSI) || !TII.isMBBSafeToSplitToCold(*LP))
HasHotLandingPads = true;
}
if (!HasHotLandingPads) {
diff --git a/llvm/lib/CodeGen/MachinePassManager.cpp b/llvm/lib/CodeGen/MachinePassManager.cpp
index d42bbe2..9a750b5 100644
--- a/llvm/lib/CodeGen/MachinePassManager.cpp
+++ b/llvm/lib/CodeGen/MachinePassManager.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachinePassManager.h"
-#include "llvm/CodeGen/FreeMachineFunction.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/IR/PassManagerImpl.h"
@@ -19,99 +18,121 @@
using namespace llvm;
namespace llvm {
-template class AllAnalysesOn<MachineFunction>;
+
+AnalysisKey FunctionAnalysisManagerMachineFunctionProxy::Key;
+
template class AnalysisManager<MachineFunction>;
template class PassManager<MachineFunction>;
+template class InnerAnalysisManagerProxy<MachineFunctionAnalysisManager,
+ Module>;
+template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
+ MachineFunction>;
+
+bool FunctionAnalysisManagerMachineFunctionProxy::Result::invalidate(
+ MachineFunction &IR, const PreservedAnalyses &PA,
+ MachineFunctionAnalysisManager::Invalidator &Inv) {
+ // MachineFunction passes should not invalidate Function analyses.
+ // TODO: verify that PA doesn't invalidate Function analyses.
+ return false;
+}
-Error MachineFunctionPassManager::run(Module &M,
- MachineFunctionAnalysisManager &MFAM) {
- // MachineModuleAnalysis is a module analysis pass that is never invalidated
- // because we don't run any module pass in codegen pipeline. This is very
- // important because the codegen state is stored in MMI which is the analysis
- // result of MachineModuleAnalysis. MMI should not be recomputed.
- auto &MMI = MFAM.getResult<MachineModuleAnalysis>(M).getMMI();
-
- (void)RequireCodeGenSCCOrder;
- assert(!RequireCodeGenSCCOrder && "not implemented");
-
- // M is unused here
- PassInstrumentation PI = MFAM.getResult<PassInstrumentationAnalysis>(M);
-
- // Add a PIC to verify machine functions.
- if (VerifyMachineFunction) {
- // No need to pop this callback later since MIR pipeline is flat which means
- // current pipeline is the top-level pipeline. Callbacks are not used after
- // current pipeline.
- PI.pushBeforeNonSkippedPassCallback([](StringRef PassID, Any IR) {
- assert(llvm::any_cast<const MachineFunction *>(&IR));
- const MachineFunction *MF = llvm::any_cast<const MachineFunction *>(IR);
- assert(MF && "Machine function should be valid for printing");
- std::string Banner = std::string("After ") + std::string(PassID);
- verifyMachineFunction(Banner, *MF);
- });
+template <>
+bool MachineFunctionAnalysisManagerModuleProxy::Result::invalidate(
+ Module &M, const PreservedAnalyses &PA,
+ ModuleAnalysisManager::Invalidator &Inv) {
+ // If literally everything is preserved, we're done.
+ if (PA.areAllPreserved())
+ return false; // This is still a valid proxy.
+
+ // If this proxy isn't marked as preserved, then even if the result remains
+ // valid, the key itself may no longer be valid, so we clear everything.
+ //
+ // Note that in order to preserve this proxy, a module pass must ensure that
+ // the MFAM has been completely updated to handle the deletion of functions.
+ // Specifically, any MFAM-cached results for those functions need to have been
+ // forcibly cleared. When preserved, this proxy will only invalidate results
+ // cached on functions *still in the module* at the end of the module pass.
+ auto PAC = PA.getChecker<MachineFunctionAnalysisManagerModuleProxy>();
+ if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Module>>()) {
+ InnerAM->clear();
+ return true;
}
- for (auto &F : InitializationFuncs) {
- if (auto Err = F(M, MFAM))
- return Err;
+ // FIXME: be more precise, see
+ // FunctionAnalysisManagerModuleProxy::Result::invalidate.
+ if (!PA.allAnalysesInSetPreserved<AllAnalysesOn<MachineFunction>>()) {
+ InnerAM->clear();
+ return true;
}
- unsigned Idx = 0;
- size_t Size = Passes.size();
- do {
- // Run machine module passes
- for (; MachineModulePasses.count(Idx) && Idx != Size; ++Idx) {
- if (!PI.runBeforePass<Module>(*Passes[Idx], M))
- continue;
- if (auto Err = MachineModulePasses.at(Idx)(M, MFAM))
- return Err;
- PI.runAfterPass(*Passes[Idx], M, PreservedAnalyses::all());
- }
-
- // Finish running all passes.
- if (Idx == Size)
- break;
-
- // Run machine function passes
-
- // Get index range of machine function passes.
- unsigned Begin = Idx;
- for (; !MachineModulePasses.count(Idx) && Idx != Size; ++Idx)
- ;
-
- for (Function &F : M) {
- // Do not codegen any 'available_externally' functions at all, they have
- // definitions outside the translation unit.
- if (F.hasAvailableExternallyLinkage())
- continue;
-
- MachineFunction &MF = MMI.getOrCreateMachineFunction(F);
-
- for (unsigned I = Begin, E = Idx; I != E; ++I) {
- auto *P = Passes[I].get();
+ // Return false to indicate that this result is still a valid proxy.
+ return false;
+}
- if (!PI.runBeforePass<MachineFunction>(*P, MF))
- continue;
+PreservedAnalyses
+ModuleToMachineFunctionPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) {
+ auto &MMI = AM.getResult<MachineModuleAnalysis>(M).getMMI();
+ MachineFunctionAnalysisManager &MFAM =
+ AM.getResult<MachineFunctionAnalysisManagerModuleProxy>(M).getManager();
+ PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(M);
+ PreservedAnalyses PA = PreservedAnalyses::all();
+ for (Function &F : M) {
+ // Do not codegen any 'available_externally' functions at all, they have
+ // definitions outside the translation unit.
+ if (F.hasAvailableExternallyLinkage())
+ continue;
+
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(F);
+
+ if (!PI.runBeforePass<MachineFunction>(*Pass, MF))
+ continue;
+ PreservedAnalyses PassPA = Pass->run(MF, MFAM);
+ if (MMI.getMachineFunction(F)) {
+ MFAM.invalidate(MF, PassPA);
+ PI.runAfterPass(*Pass, MF, PassPA);
+ } else {
+ MFAM.clear(MF, F.getName());
+ PI.runAfterPassInvalidated<MachineFunction>(*Pass, PassPA);
+ }
+ PA.intersect(std::move(PassPA));
+ }
- // TODO: EmitSizeRemarks
- PreservedAnalyses PassPA = P->run(MF, MFAM);
+ return PA;
+}
- // MF is dangling after FreeMachineFunctionPass
- if (P->name() != FreeMachineFunctionPass::name()) {
- MFAM.invalidate(MF, PassPA);
+void ModuleToMachineFunctionPassAdaptor::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ OS << "machine-function(";
+ Pass->printPipeline(OS, MapClassName2PassName);
+ OS << ')';
+}
- PI.runAfterPass(*P, MF, PassPA);
- }
- }
+template <>
+PreservedAnalyses
+PassManager<MachineFunction>::run(MachineFunction &MF,
+ AnalysisManager<MachineFunction> &MFAM) {
+ PassInstrumentation PI = MFAM.getResult<PassInstrumentationAnalysis>(MF);
+ Function &F = MF.getFunction();
+ MachineModuleInfo &MMI =
+ MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
+ .getCachedResult<MachineModuleAnalysis>(*F.getParent())
+ ->getMMI();
+ PreservedAnalyses PA = PreservedAnalyses::all();
+ for (auto &Pass : Passes) {
+ if (!PI.runBeforePass<MachineFunction>(*Pass, MF))
+ continue;
+
+ PreservedAnalyses PassPA = Pass->run(MF, MFAM);
+ if (MMI.getMachineFunction(F)) {
+ MFAM.invalidate(MF, PassPA);
+ PI.runAfterPass(*Pass, MF, PassPA);
+ } else {
+ MFAM.clear(MF, F.getName());
+ PI.runAfterPassInvalidated<MachineFunction>(*Pass, PassPA);
}
- } while (true);
-
- for (auto &F : FinalizationFuncs) {
- if (auto Err = F(M, MFAM))
- return Err;
+ PA.intersect(std::move(PassPA));
}
-
- return Error::success();
+ return PA;
}
} // namespace llvm
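
With the adaptor above, machine-function passes nest inside an ordinary module pipeline instead of running through the old flat MachineFunctionPassManager. A usage sketch, assuming a createModuleToMachineFunctionPassAdaptor helper declared alongside the adaptor and a placeholder SomeMachineFunctionPass:

    // Sketch only: SomeMachineFunctionPass stands in for any pass written
    // against PassInfoMixin<> with
    // run(MachineFunction &, MachineFunctionAnalysisManager &).
    llvm::ModulePassManager buildCodeGenPipeline() {
      llvm::ModulePassManager MPM;
      MPM.addPass(llvm::createModuleToMachineFunctionPassAdaptor(
          SomeMachineFunctionPass())); // printed as "machine-function(...)"
      return MPM;
    }
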
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 697e0da..1bda19b 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -768,7 +768,6 @@ static void getUnderlyingObjects(const MachineInstr *MI,
Objs.clear();
return;
}
- Objs.push_back(V);
}
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 89ef648..6a28bc8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -76,6 +76,8 @@
#include <utility>
#include <variant>
+#include "MatchContext.h"
+
using namespace llvm;
#define DEBUG_TYPE "dagcombine"
@@ -888,141 +890,6 @@ public:
void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); }
};
-class EmptyMatchContext {
- SelectionDAG &DAG;
- const TargetLowering &TLI;
-
-public:
- EmptyMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root)
- : DAG(DAG), TLI(TLI) {}
-
- bool match(SDValue OpN, unsigned Opcode) const {
- return Opcode == OpN->getOpcode();
- }
-
- // Same as SelectionDAG::getNode().
- template <typename... ArgT> SDValue getNode(ArgT &&...Args) {
- return DAG.getNode(std::forward<ArgT>(Args)...);
- }
-
- bool isOperationLegalOrCustom(unsigned Op, EVT VT,
- bool LegalOnly = false) const {
- return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly);
- }
-};
-
-class VPMatchContext {
- SelectionDAG &DAG;
- const TargetLowering &TLI;
- SDValue RootMaskOp;
- SDValue RootVectorLenOp;
-
-public:
- VPMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root)
- : DAG(DAG), TLI(TLI), RootMaskOp(), RootVectorLenOp() {
- assert(Root->isVPOpcode());
- if (auto RootMaskPos = ISD::getVPMaskIdx(Root->getOpcode()))
- RootMaskOp = Root->getOperand(*RootMaskPos);
- else if (Root->getOpcode() == ISD::VP_SELECT)
- RootMaskOp = DAG.getAllOnesConstant(SDLoc(Root),
- Root->getOperand(0).getValueType());
-
- if (auto RootVLenPos =
- ISD::getVPExplicitVectorLengthIdx(Root->getOpcode()))
- RootVectorLenOp = Root->getOperand(*RootVLenPos);
- }
-
- /// whether \p OpVal is a node that is functionally compatible with the
- /// NodeType \p Opc
- bool match(SDValue OpVal, unsigned Opc) const {
- if (!OpVal->isVPOpcode())
- return OpVal->getOpcode() == Opc;
-
- auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(),
- !OpVal->getFlags().hasNoFPExcept());
- if (BaseOpc != Opc)
- return false;
-
- // Make sure the mask of OpVal is true mask or is same as Root's.
- unsigned VPOpcode = OpVal->getOpcode();
- if (auto MaskPos = ISD::getVPMaskIdx(VPOpcode)) {
- SDValue MaskOp = OpVal.getOperand(*MaskPos);
- if (RootMaskOp != MaskOp &&
- !ISD::isConstantSplatVectorAllOnes(MaskOp.getNode()))
- return false;
- }
-
- // Make sure the EVL of OpVal is same as Root's.
- if (auto VLenPos = ISD::getVPExplicitVectorLengthIdx(VPOpcode))
- if (RootVectorLenOp != OpVal.getOperand(*VLenPos))
- return false;
- return true;
- }
-
- // Specialize based on number of operands.
- // TODO emit VP intrinsics where MaskOp/VectorLenOp != null
- // SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { return
- // DAG.getNode(Opcode, DL, VT); }
- SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand) {
- unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
- assert(ISD::getVPMaskIdx(VPOpcode) == 1 &&
- ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2);
- return DAG.getNode(VPOpcode, DL, VT,
- {Operand, RootMaskOp, RootVectorLenOp});
- }
-
- SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
- SDValue N2) {
- unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
- assert(ISD::getVPMaskIdx(VPOpcode) == 2 &&
- ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3);
- return DAG.getNode(VPOpcode, DL, VT,
- {N1, N2, RootMaskOp, RootVectorLenOp});
- }
-
- SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
- SDValue N2, SDValue N3) {
- unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
- assert(ISD::getVPMaskIdx(VPOpcode) == 3 &&
- ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4);
- return DAG.getNode(VPOpcode, DL, VT,
- {N1, N2, N3, RootMaskOp, RootVectorLenOp});
- }
-
- SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand,
- SDNodeFlags Flags) {
- unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
- assert(ISD::getVPMaskIdx(VPOpcode) == 1 &&
- ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2);
- return DAG.getNode(VPOpcode, DL, VT, {Operand, RootMaskOp, RootVectorLenOp},
- Flags);
- }
-
- SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
- SDValue N2, SDNodeFlags Flags) {
- unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
- assert(ISD::getVPMaskIdx(VPOpcode) == 2 &&
- ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3);
- return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp},
- Flags);
- }
-
- SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
- SDValue N2, SDValue N3, SDNodeFlags Flags) {
- unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
- assert(ISD::getVPMaskIdx(VPOpcode) == 3 &&
- ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4);
- return DAG.getNode(VPOpcode, DL, VT,
- {N1, N2, N3, RootMaskOp, RootVectorLenOp}, Flags);
- }
-
- bool isOperationLegalOrCustom(unsigned Op, EVT VT,
- bool LegalOnly = false) const {
- unsigned VPOp = ISD::getVPForBaseOpcode(Op);
- return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly);
- }
-};
-
} // end anonymous namespace
//===----------------------------------------------------------------------===//
@@ -13997,6 +13864,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level))
return Res;
+ // CSE zext nneg with sext if the zext is not free.
+ if (N->getFlags().hasNonNeg() && !TLI.isZExtFree(N0.getValueType(), VT)) {
+ SDNode *CSENode = DAG.getNodeIfExists(ISD::SIGN_EXTEND, N->getVTList(), N0);
+ if (CSENode)
+ return SDValue(CSENode, 0);
+ }
+
return SDValue();
}
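
The new combine simply reuses an existing sign-extend of the same operand when one is present, since zext nneg and sext agree for non-negative inputs. A minimal sketch of that lookup (extracted and renamed for illustration):

    #include "llvm/CodeGen/SelectionDAG.h"

    // If the DAG already contains sext(Src) with the same result types,
    // prefer it over a separate zext nneg node so the two CSE to one value.
    static llvm::SDValue reuseSExtIfPresent(llvm::SelectionDAG &DAG,
                                            llvm::SDNode *ZExtNNeg,
                                            llvm::SDValue Src) {
      if (llvm::SDNode *Existing = DAG.getNodeIfExists(
              llvm::ISD::SIGN_EXTEND, ZExtNNeg->getVTList(), Src))
        return llvm::SDValue(Existing, 0);
      return llvm::SDValue(ZExtNNeg, 0);
    }
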
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 5651498..246762d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1188,11 +1188,24 @@ void FastISel::handleDbgInfo(const Instruction *II) {
MIMD = MIMetadata();
// Reverse order of debug records, because fast-isel walks through backwards.
- for (DbgRecord &DPR : llvm::reverse(II->getDbgValueRange())) {
+ for (DbgRecord &DR : llvm::reverse(II->getDbgValueRange())) {
flushLocalValueMap();
recomputeInsertPt();
- DPValue &DPV = cast<DPValue>(DPR);
+ if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+ assert(DPL->getLabel() && "Missing label");
+ if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DPL << "\n");
+ continue;
+ }
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DPL->getDebugLoc(),
+ TII.get(TargetOpcode::DBG_LABEL))
+ .addMetadata(DPL->getLabel());
+ continue;
+ }
+
+ DPValue &DPV = cast<DPValue>(DR);
Value *V = nullptr;
if (!DPV.hasArgList())
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a4ba261..df17d65 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -217,7 +217,15 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::SSUBSAT:
case ISD::USUBSAT:
case ISD::SSHLSAT:
- case ISD::USHLSAT: Res = PromoteIntRes_ADDSUBSHLSAT(N); break;
+ case ISD::USHLSAT:
+ Res = PromoteIntRes_ADDSUBSHLSAT<EmptyMatchContext>(N);
+ break;
+ case ISD::VP_SADDSAT:
+ case ISD::VP_UADDSAT:
+ case ISD::VP_SSUBSAT:
+ case ISD::VP_USUBSAT:
+ Res = PromoteIntRes_ADDSUBSHLSAT<VPMatchContext>(N);
+ break;
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
@@ -934,6 +942,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
return DAG.getBoolExtOrTrunc(Res.getValue(1), dl, NVT, VT);
}
+template <class MatchContextClass>
SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
// If the promoted type is legal, we can convert this to:
// 1. ANY_EXTEND iN to iM
@@ -945,11 +954,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
SDLoc dl(N);
SDValue Op1 = N->getOperand(0);
SDValue Op2 = N->getOperand(1);
+ MatchContextClass matcher(DAG, TLI, N);
unsigned OldBits = Op1.getScalarValueSizeInBits();
- unsigned Opcode = N->getOpcode();
+ unsigned Opcode = matcher.getRootBaseOpcode();
bool IsShift = Opcode == ISD::USHLSAT || Opcode == ISD::SSHLSAT;
+ // FIXME: We need vp-aware PromotedInteger functions.
SDValue Op1Promoted, Op2Promoted;
if (IsShift) {
Op1Promoted = GetPromotedInteger(Op1);
@@ -968,18 +979,18 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
APInt MaxVal = APInt::getAllOnes(OldBits).zext(NewBits);
SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
SDValue Add =
- DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
- return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
+ matcher.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
+ return matcher.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
}
// USUBSAT can always be promoted as long as we have zero-extended the args.
if (Opcode == ISD::USUBSAT)
- return DAG.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted,
- Op2Promoted);
+ return matcher.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted,
+ Op2Promoted);
// Shift cannot use a min/max expansion, we can't detect overflow if all of
// the bits have been shifted out.
- if (IsShift || TLI.isOperationLegal(Opcode, PromotedType)) {
+ if (IsShift || matcher.isOperationLegal(Opcode, PromotedType)) {
unsigned ShiftOp;
switch (Opcode) {
case ISD::SADDSAT:
@@ -1002,11 +1013,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount);
if (!IsShift)
Op2Promoted =
- DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount);
+ matcher.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount);
SDValue Result =
- DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted);
- return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
+ matcher.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted);
+ return matcher.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
}
unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB;
@@ -1015,9 +1026,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
SDValue SatMin = DAG.getConstant(MinVal, dl, PromotedType);
SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
SDValue Result =
- DAG.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted);
- Result = DAG.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax);
- Result = DAG.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin);
+ matcher.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted);
+ Result = matcher.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax);
+ Result = matcher.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin);
return Result;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9114987..3c84f67 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H
#define LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H
+#include "MatchContext.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -355,6 +356,7 @@ private:
SDValue PromoteIntRes_VAARG(SDNode *N);
SDValue PromoteIntRes_VSCALE(SDNode *N);
SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
+ template <class MatchContextClass>
SDValue PromoteIntRes_ADDSUBSHLSAT(SDNode *N);
SDValue PromoteIntRes_MULFIX(SDNode *N);
SDValue PromoteIntRes_DIVFIX(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 2a7aaf8..6074498 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -404,8 +404,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::FCEIL:
case ISD::FTRUNC:
case ISD::FRINT:
- case ISD::LRINT:
- case ISD::LLRINT:
case ISD::FNEARBYINT:
case ISD::FROUND:
case ISD::FROUNDEVEN:
@@ -455,6 +453,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
Node->getValueType(0), Scale);
break;
}
+ case ISD::LRINT:
+ case ISD::LLRINT:
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
case ISD::VECREDUCE_ADD:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 7fc2526..90cda2a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1163,10 +1163,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SMAX: case ISD::VP_SMAX:
case ISD::UMIN: case ISD::VP_UMIN:
case ISD::UMAX: case ISD::VP_UMAX:
- case ISD::SADDSAT:
- case ISD::UADDSAT:
- case ISD::SSUBSAT:
- case ISD::USUBSAT:
+ case ISD::SADDSAT: case ISD::VP_SADDSAT:
+ case ISD::UADDSAT: case ISD::VP_UADDSAT:
+ case ISD::SSUBSAT: case ISD::VP_SSUBSAT:
+ case ISD::USUBSAT: case ISD::VP_USUBSAT:
case ISD::SSHLSAT:
case ISD::USHLSAT:
case ISD::ROTL:
@@ -4140,10 +4140,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SMAX: case ISD::VP_SMAX:
case ISD::UMIN: case ISD::VP_UMIN:
case ISD::UMAX: case ISD::VP_UMAX:
- case ISD::UADDSAT:
- case ISD::SADDSAT:
- case ISD::USUBSAT:
- case ISD::SSUBSAT:
+ case ISD::UADDSAT: case ISD::VP_UADDSAT:
+ case ISD::SADDSAT: case ISD::VP_SADDSAT:
+ case ISD::USUBSAT: case ISD::VP_USUBSAT:
+ case ISD::SSUBSAT: case ISD::VP_SSUBSAT:
case ISD::SSHLSAT:
case ISD::USHLSAT:
case ISD::ROTL:
diff --git a/llvm/lib/CodeGen/SelectionDAG/MatchContext.h b/llvm/lib/CodeGen/SelectionDAG/MatchContext.h
new file mode 100644
index 0000000..f965cb9
--- /dev/null
+++ b/llvm/lib/CodeGen/SelectionDAG/MatchContext.h
@@ -0,0 +1,175 @@
+//===---------------- llvm/CodeGen/MatchContext.h --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the EmptyMatchContext class and VPMatchContext class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_MATCHCONTEXT_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_MATCHCONTEXT_H
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLowering.h"
+
+using namespace llvm;
+
+namespace {
+class EmptyMatchContext {
+ SelectionDAG &DAG;
+ const TargetLowering &TLI;
+ SDNode *Root;
+
+public:
+ EmptyMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root)
+ : DAG(DAG), TLI(TLI), Root(Root) {}
+
+ unsigned getRootBaseOpcode() { return Root->getOpcode(); }
+ bool match(SDValue OpN, unsigned Opcode) const {
+ return Opcode == OpN->getOpcode();
+ }
+
+ // Same as SelectionDAG::getNode().
+ template <typename... ArgT> SDValue getNode(ArgT &&...Args) {
+ return DAG.getNode(std::forward<ArgT>(Args)...);
+ }
+
+ bool isOperationLegal(unsigned Op, EVT VT) const {
+ return TLI.isOperationLegal(Op, VT);
+ }
+
+ bool isOperationLegalOrCustom(unsigned Op, EVT VT,
+ bool LegalOnly = false) const {
+ return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly);
+ }
+};
+
+class VPMatchContext {
+ SelectionDAG &DAG;
+ const TargetLowering &TLI;
+ SDValue RootMaskOp;
+ SDValue RootVectorLenOp;
+ SDNode *Root;
+
+public:
+ VPMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *_Root)
+ : DAG(DAG), TLI(TLI), RootMaskOp(), RootVectorLenOp() {
+ Root = _Root;
+ assert(Root->isVPOpcode());
+ if (auto RootMaskPos = ISD::getVPMaskIdx(Root->getOpcode()))
+ RootMaskOp = Root->getOperand(*RootMaskPos);
+ else if (Root->getOpcode() == ISD::VP_SELECT)
+ RootMaskOp = DAG.getAllOnesConstant(SDLoc(Root),
+ Root->getOperand(0).getValueType());
+
+ if (auto RootVLenPos = ISD::getVPExplicitVectorLengthIdx(Root->getOpcode()))
+ RootVectorLenOp = Root->getOperand(*RootVLenPos);
+ }
+
+ unsigned getRootBaseOpcode() {
+ std::optional<unsigned> Opcode = ISD::getBaseOpcodeForVP(
+ Root->getOpcode(), !Root->getFlags().hasNoFPExcept());
+ assert(Opcode.has_value());
+ return *Opcode;
+ }
+
+ /// whether \p OpVal is a node that is functionally compatible with the
+ /// NodeType \p Opc
+ bool match(SDValue OpVal, unsigned Opc) const {
+ if (!OpVal->isVPOpcode())
+ return OpVal->getOpcode() == Opc;
+
+ auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(),
+ !OpVal->getFlags().hasNoFPExcept());
+ if (BaseOpc != Opc)
+ return false;
+
+ // Make sure the mask of OpVal is true mask or is same as Root's.
+ unsigned VPOpcode = OpVal->getOpcode();
+ if (auto MaskPos = ISD::getVPMaskIdx(VPOpcode)) {
+ SDValue MaskOp = OpVal.getOperand(*MaskPos);
+ if (RootMaskOp != MaskOp &&
+ !ISD::isConstantSplatVectorAllOnes(MaskOp.getNode()))
+ return false;
+ }
+
+ // Make sure the EVL of OpVal is same as Root's.
+ if (auto VLenPos = ISD::getVPExplicitVectorLengthIdx(VPOpcode))
+ if (RootVectorLenOp != OpVal.getOperand(*VLenPos))
+ return false;
+ return true;
+ }
+
+ // Specialize based on number of operands.
+ // TODO emit VP intrinsics where MaskOp/VectorLenOp != null
+ // SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { return
+ // DAG.getNode(Opcode, DL, VT); }
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand) {
+ unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+ assert(ISD::getVPMaskIdx(VPOpcode) == 1 &&
+ ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2);
+ return DAG.getNode(VPOpcode, DL, VT,
+ {Operand, RootMaskOp, RootVectorLenOp});
+ }
+
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+ SDValue N2) {
+ unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+ assert(ISD::getVPMaskIdx(VPOpcode) == 2 &&
+ ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3);
+ return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp});
+ }
+
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+ SDValue N2, SDValue N3) {
+ unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+ assert(ISD::getVPMaskIdx(VPOpcode) == 3 &&
+ ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4);
+ return DAG.getNode(VPOpcode, DL, VT,
+ {N1, N2, N3, RootMaskOp, RootVectorLenOp});
+ }
+
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand,
+ SDNodeFlags Flags) {
+ unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+ assert(ISD::getVPMaskIdx(VPOpcode) == 1 &&
+ ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2);
+ return DAG.getNode(VPOpcode, DL, VT, {Operand, RootMaskOp, RootVectorLenOp},
+ Flags);
+ }
+
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+ SDValue N2, SDNodeFlags Flags) {
+ unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+ assert(ISD::getVPMaskIdx(VPOpcode) == 2 &&
+ ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3);
+ return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp},
+ Flags);
+ }
+
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+ SDValue N2, SDValue N3, SDNodeFlags Flags) {
+ unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+ assert(ISD::getVPMaskIdx(VPOpcode) == 3 &&
+ ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4);
+ return DAG.getNode(VPOpcode, DL, VT,
+ {N1, N2, N3, RootMaskOp, RootVectorLenOp}, Flags);
+ }
+
+ bool isOperationLegal(unsigned Op, EVT VT) const {
+ unsigned VPOp = ISD::getVPForBaseOpcode(Op);
+ return TLI.isOperationLegal(VPOp, VT);
+ }
+
+ bool isOperationLegalOrCustom(unsigned Op, EVT VT,
+ bool LegalOnly = false) const {
+ unsigned VPOp = ISD::getVPForBaseOpcode(Op);
+ return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly);
+ }
+};
+} // end anonymous namespace
+#endif
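
The point of factoring these contexts into a shared header is that a legalization or combine routine can be written once against the context interface and instantiated for both plain and VP nodes, as PromoteIntRes_ADDSUBSHLSAT does above. A condensed sketch of that pattern (the function name is illustrative):

    // With EmptyMatchContext this emits a plain ISD::ADD; with VPMatchContext
    // it becomes the VP form carrying the root node's mask and EVL operands.
    template <class MatchContextClass>
    static llvm::SDValue emitPromotedAdd(llvm::SelectionDAG &DAG,
                                         const llvm::TargetLowering &TLI,
                                         llvm::SDNode *Root,
                                         llvm::EVT PromotedVT,
                                         llvm::SDValue LHS, llvm::SDValue RHS) {
      MatchContextClass Matcher(DAG, TLI, Root);
      return Matcher.getNode(llvm::ISD::ADD, llvm::SDLoc(Root), PromotedVT,
                             LHS, RHS);
    }
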
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index add92cf..0ceda27 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9044,29 +9044,6 @@ SDValue SelectionDAG::getIndexedStoreVP(SDValue OrigStore, const SDLoc &dl,
SDValue SelectionDAG::getStridedLoadVP(
ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL,
SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask,
- SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT, Align Alignment,
- MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
- const MDNode *Ranges, bool IsExpanding) {
- assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
-
- MMOFlags |= MachineMemOperand::MOLoad;
- assert((MMOFlags & MachineMemOperand::MOStore) == 0);
- // If we don't have a PtrInfo, infer the trivial frame index case to simplify
- // clients.
- if (PtrInfo.V.isNull())
- PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);
-
- uint64_t Size = MemoryLocation::UnknownSize;
- MachineFunction &MF = getMachineFunction();
- MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size,
- Alignment, AAInfo, Ranges);
- return getStridedLoadVP(AM, ExtType, VT, DL, Chain, Ptr, Offset, Stride, Mask,
- EVL, MemVT, MMO, IsExpanding);
-}
-
-SDValue SelectionDAG::getStridedLoadVP(
- ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL,
- SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask,
SDValue EVL, EVT MemVT, MachineMemOperand *MMO, bool IsExpanding) {
bool Indexed = AM != ISD::UNINDEXED;
assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");
@@ -9098,17 +9075,6 @@ SDValue SelectionDAG::getStridedLoadVP(
return V;
}
-SDValue SelectionDAG::getStridedLoadVP(
- EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, SDValue Stride,
- SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, MaybeAlign Alignment,
- MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
- const MDNode *Ranges, bool IsExpanding) {
- SDValue Undef = getUNDEF(Ptr.getValueType());
- return getStridedLoadVP(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, DL, Chain, Ptr,
- Undef, Stride, Mask, EVL, PtrInfo, VT, Alignment,
- MMOFlags, AAInfo, Ranges, IsExpanding);
-}
-
SDValue SelectionDAG::getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain,
SDValue Ptr, SDValue Stride,
SDValue Mask, SDValue EVL,
@@ -9121,18 +9087,6 @@ SDValue SelectionDAG::getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain,
SDValue SelectionDAG::getExtStridedLoadVP(
ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain,
- SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL,
- MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment,
- MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
- bool IsExpanding) {
- SDValue Undef = getUNDEF(Ptr.getValueType());
- return getStridedLoadVP(ISD::UNINDEXED, ExtType, VT, DL, Chain, Ptr, Undef,
- Stride, Mask, EVL, PtrInfo, MemVT, Alignment,
- MMOFlags, AAInfo, nullptr, IsExpanding);
-}
-
-SDValue SelectionDAG::getExtStridedLoadVP(
- ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain,
SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, EVT MemVT,
MachineMemOperand *MMO, bool IsExpanding) {
SDValue Undef = getUNDEF(Ptr.getValueType());
@@ -9150,11 +9104,14 @@ SDValue SelectionDAG::getIndexedStridedLoadVP(SDValue OrigLoad, const SDLoc &DL,
auto MMOFlags =
SLD->getMemOperand()->getFlags() &
~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
- return getStridedLoadVP(
- AM, SLD->getExtensionType(), OrigLoad.getValueType(), DL, SLD->getChain(),
- Base, Offset, SLD->getStride(), SLD->getMask(), SLD->getVectorLength(),
- SLD->getPointerInfo(), SLD->getMemoryVT(), SLD->getAlign(), MMOFlags,
- SLD->getAAInfo(), nullptr, SLD->isExpandingLoad());
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ SLD->getPointerInfo(), MMOFlags, SLD->getMemOperand()->getSize(),
+ SLD->getOriginalAlign(), SLD->getAAInfo());
+ return getStridedLoadVP(AM, SLD->getExtensionType(), OrigLoad.getValueType(),
+ DL, SLD->getChain(), Base, Offset, SLD->getStride(),
+ SLD->getMask(), SLD->getVectorLength(),
+ SLD->getMemoryVT(), MMO, SLD->isExpandingLoad());
}
SDValue SelectionDAG::getStridedStoreVP(SDValue Chain, const SDLoc &DL,
@@ -9193,26 +9150,6 @@ SDValue SelectionDAG::getStridedStoreVP(SDValue Chain, const SDLoc &DL,
return V;
}
-SDValue SelectionDAG::getTruncStridedStoreVP(
- SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Stride,
- SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, EVT SVT,
- Align Alignment, MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
- bool IsCompressing) {
- assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
-
- MMOFlags |= MachineMemOperand::MOStore;
- assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
-
- if (PtrInfo.V.isNull())
- PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
-
- MachineFunction &MF = getMachineFunction();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo, MMOFlags, MemoryLocation::UnknownSize, Alignment, AAInfo);
- return getTruncStridedStoreVP(Chain, DL, Val, Ptr, Stride, Mask, EVL, SVT,
- MMO, IsCompressing);
-}
-
SDValue SelectionDAG::getTruncStridedStoreVP(SDValue Chain, const SDLoc &DL,
SDValue Val, SDValue Ptr,
SDValue Stride, SDValue Mask,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e893a5b..ee600d3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1241,17 +1241,30 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) {
It->Expr, Vals.size() > 1, It->DL, SDNodeOrder);
}
}
- // We must early-exit here to prevent any DPValues from being emitted below,
- // as we have just emitted the debug values resulting from assignment
- // tracking analysis, making any existing DPValues redundant (and probably
- // less correct).
- return;
}
+ // We must skip DPValues if they've already been processed above as we
+ // have just emitted the debug values resulting from assignment tracking
+ // analysis, making any existing DPValues redundant (and probably less
+ // correct). We still need to process DPLabels. This does sink DPLabels
+ // to the bottom of the group of debug records. That shouldn't be important
+ // as it does so deterministically and ordering between DPLabels and DPValues
+ // is immaterial (other than for MIR/IR printing).
+ bool SkipDPValues = DAG.getFunctionVarLocs();
// Is there any debug-info attached to this instruction, in the form of
- // DPValue non-instruction debug-info records.
- for (DbgRecord &DPR : I.getDbgValueRange()) {
- DPValue &DPV = cast<DPValue>(DPR);
+ // DbgRecord non-instruction debug-info records.
+ for (DbgRecord &DR : I.getDbgValueRange()) {
+ if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+ assert(DPL->getLabel() && "Missing label");
+ SDDbgLabel *SDV =
+ DAG.getDbgLabel(DPL->getLabel(), DPL->getDebugLoc(), SDNodeOrder);
+ DAG.AddDbgLabel(SDV);
+ continue;
+ }
+
+ if (SkipDPValues)
+ continue;
+ DPValue &DPV = cast<DPValue>(DR);
DILocalVariable *Variable = DPV.getVariable();
DIExpression *Expression = DPV.getExpression();
dropDanglingDebugInfo(Variable, Expression);
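
The same DbgRecord dispatch now appears in IRTranslator, FastISel and here: label records are peeled off with dyn_cast<DPLabel> and everything else is treated as a DPValue. A stripped-down sketch of the shared walking pattern (handler bodies elided):

    #include "llvm/IR/DebugProgramInstruction.h"
    #include "llvm/IR/Instruction.h"

    // Visit every debug record attached to I, distinguishing label records
    // from variable-location records, as the three selectors above do.
    static void forEachDebugRecord(llvm::Instruction &I) {
      using namespace llvm;
      for (DbgRecord &DR : I.getDbgValueRange()) {
        if (auto *DPL = dyn_cast<DPLabel>(&DR)) {
          (void)DPL->getLabel(); // emit a DBG_LABEL-style record here
          continue;
        }
        DPValue &DPV = cast<DPValue>(DR);
        (void)DPV.getVariable(); // emit a variable-location record here
      }
    }
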
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a4c5167..07fb891 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10948,12 +10948,10 @@ SDValue TargetLowering::expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const {
Op = expandRoundInexactToOdd(F32, Op, dl, DAG);
Op = DAG.getNode(ISD::BITCAST, dl, I32, Op);
- // Extract the sign bit and exponent.
- SDValue SignBitAndExponentField = DAG.getNode(
- ISD::AND, dl, I32, Op, DAG.getConstant(0xff800000, dl, I32));
- // Set the quiet bit.
- SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBitAndExponentField,
- DAG.getConstant(0x400000, dl, I32));
+ // Conversions should set NaN's quiet bit. This also prevents NaNs from
+ // turning into infinities.
+ SDValue NaN =
+ DAG.getNode(ISD::OR, dl, I32, Op, DAG.getConstant(0x400000, dl, I32));
// Factor in the contribution of the low 16 bits.
SDValue One = DAG.getConstant(1, dl, I32);
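
The OR with 0x400000 sets bit 22 of the f32 payload, the quiet bit that survives as the top mantissa bit of the bf16 result, so a NaN input stays a quiet NaN instead of collapsing to infinity. A small bit-level illustration of just that step (ignoring the round-to-odd handling above):

    #include <cstdint>

    // Quiet the f32 NaN payload and keep the high 16 bits (sign, exponent,
    // top mantissa bits), which is the bf16 encoding.
    static uint16_t quietedUpperHalf(uint32_t F32Bits) {
      uint32_t Quieted = F32Bits | 0x400000u; // set f32 quiet bit (bit 22)
      return static_cast<uint16_t>(Quieted >> 16);
    }
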
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
index 78f819d..9c65d85 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -510,7 +510,7 @@ DWARFDebugNames::Abbrev DWARFDebugNames::AbbrevMapInfo::getTombstoneKey() {
Expected<DWARFDebugNames::AttributeEncoding>
DWARFDebugNames::NameIndex::extractAttributeEncoding(uint64_t *Offset) {
- if (*Offset >= EntriesBase) {
+ if (*Offset >= Offsets.EntriesBase) {
return createStringError(errc::illegal_byte_sequence,
"Incorrectly terminated abbreviation table.");
}
@@ -536,7 +536,7 @@ DWARFDebugNames::NameIndex::extractAttributeEncodings(uint64_t *Offset) {
Expected<DWARFDebugNames::Abbrev>
DWARFDebugNames::NameIndex::extractAbbrev(uint64_t *Offset) {
- if (*Offset >= EntriesBase) {
+ if (*Offset >= Offsets.EntriesBase) {
return createStringError(errc::illegal_byte_sequence,
"Incorrectly terminated abbreviation table.");
}
@@ -552,32 +552,50 @@ DWARFDebugNames::NameIndex::extractAbbrev(uint64_t *Offset) {
return Abbrev(Code, dwarf::Tag(Tag), AbbrevOffset, std::move(*AttrEncOr));
}
+void llvm::findDebugNamesOffsets(
+ DWARFDebugNames::DWARFDebugNamesOffsets &Offsets, uint64_t HdrSize,
+ dwarf::DwarfFormat Format, const DWARFDebugNames::Header &Hdr) {
+ uint32_t DwarfSize = (Format == llvm::dwarf::DwarfFormat::DWARF64) ? 8 : 4;
+ uint64_t Offset = HdrSize;
+ Offsets.CUsBase = Offset;
+ Offset += Hdr.CompUnitCount * DwarfSize;
+ Offset += Hdr.LocalTypeUnitCount * DwarfSize;
+ Offset += Hdr.ForeignTypeUnitCount * 8;
+
+ Offsets.BucketsBase = Offset;
+ Offset += Hdr.BucketCount * 4;
+
+ Offsets.HashesBase = Offset;
+ if (Hdr.BucketCount > 0)
+ Offset += Hdr.NameCount * 4;
+
+ Offsets.StringOffsetsBase = Offset;
+ Offset += Hdr.NameCount * DwarfSize;
+
+ Offsets.EntryOffsetsBase = Offset;
+ Offset += Hdr.NameCount * DwarfSize;
+
+ Offset += Hdr.AbbrevTableSize;
+ Offsets.EntriesBase = Offset;
+}
+
Error DWARFDebugNames::NameIndex::extract() {
const DWARFDataExtractor &AS = Section.AccelSection;
- uint64_t Offset = Base;
- if (Error E = Hdr.extract(AS, &Offset))
+ uint64_t hdrSize = Base;
+ if (Error E = Hdr.extract(AS, &hdrSize))
return E;
const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
- CUsBase = Offset;
- Offset += Hdr.CompUnitCount * SectionOffsetSize;
- Offset += Hdr.LocalTypeUnitCount * SectionOffsetSize;
- Offset += Hdr.ForeignTypeUnitCount * 8;
- BucketsBase = Offset;
- Offset += Hdr.BucketCount * 4;
- HashesBase = Offset;
- if (Hdr.BucketCount > 0)
- Offset += Hdr.NameCount * 4;
- StringOffsetsBase = Offset;
- Offset += Hdr.NameCount * SectionOffsetSize;
- EntryOffsetsBase = Offset;
- Offset += Hdr.NameCount * SectionOffsetSize;
+ findDebugNamesOffsets(Offsets, hdrSize, Hdr.Format, Hdr);
+
+ uint64_t Offset =
+ Offsets.EntryOffsetsBase + (Hdr.NameCount * SectionOffsetSize);
if (!AS.isValidOffsetForDataOfSize(Offset, Hdr.AbbrevTableSize))
return createStringError(errc::illegal_byte_sequence,
"Section too small: cannot read abbreviations.");
- EntriesBase = Offset + Hdr.AbbrevTableSize;
+ Offsets.EntriesBase = Offset + Hdr.AbbrevTableSize;
for (;;) {
auto AbbrevOr = extractAbbrev(&Offset);
@@ -679,7 +697,7 @@ void DWARFDebugNames::Entry::dumpParentIdx(
return;
}
- auto AbsoluteOffset = NameIdx->EntriesBase + FormValue.getRawUValue();
+ auto AbsoluteOffset = NameIdx->Offsets.EntriesBase + FormValue.getRawUValue();
W.getOStream() << "Entry @ 0x" + Twine::utohexstr(AbsoluteOffset);
}
@@ -708,14 +726,15 @@ std::error_code DWARFDebugNames::SentinelError::convertToErrorCode() const {
uint64_t DWARFDebugNames::NameIndex::getCUOffset(uint32_t CU) const {
assert(CU < Hdr.CompUnitCount);
const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
- uint64_t Offset = CUsBase + SectionOffsetSize * CU;
+ uint64_t Offset = Offsets.CUsBase + SectionOffsetSize * CU;
return Section.AccelSection.getRelocatedValue(SectionOffsetSize, &Offset);
}
uint64_t DWARFDebugNames::NameIndex::getLocalTUOffset(uint32_t TU) const {
assert(TU < Hdr.LocalTypeUnitCount);
const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
- uint64_t Offset = CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + TU);
+ uint64_t Offset =
+ Offsets.CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + TU);
return Section.AccelSection.getRelocatedValue(SectionOffsetSize, &Offset);
}
@@ -723,7 +742,7 @@ uint64_t DWARFDebugNames::NameIndex::getForeignTUSignature(uint32_t TU) const {
assert(TU < Hdr.ForeignTypeUnitCount);
const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
uint64_t Offset =
- CUsBase +
+ Offsets.CUsBase +
SectionOffsetSize * (Hdr.CompUnitCount + Hdr.LocalTypeUnitCount) + 8 * TU;
return Section.AccelSection.getU64(&Offset);
}
@@ -759,28 +778,28 @@ DWARFDebugNames::NameIndex::getNameTableEntry(uint32_t Index) const {
assert(0 < Index && Index <= Hdr.NameCount);
const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
uint64_t StringOffsetOffset =
- StringOffsetsBase + SectionOffsetSize * (Index - 1);
+ Offsets.StringOffsetsBase + SectionOffsetSize * (Index - 1);
uint64_t EntryOffsetOffset =
- EntryOffsetsBase + SectionOffsetSize * (Index - 1);
+ Offsets.EntryOffsetsBase + SectionOffsetSize * (Index - 1);
const DWARFDataExtractor &AS = Section.AccelSection;
uint64_t StringOffset =
AS.getRelocatedValue(SectionOffsetSize, &StringOffsetOffset);
uint64_t EntryOffset = AS.getUnsigned(&EntryOffsetOffset, SectionOffsetSize);
- EntryOffset += EntriesBase;
+ EntryOffset += Offsets.EntriesBase;
return {Section.StringSection, Index, StringOffset, EntryOffset};
}
uint32_t
DWARFDebugNames::NameIndex::getBucketArrayEntry(uint32_t Bucket) const {
assert(Bucket < Hdr.BucketCount);
- uint64_t BucketOffset = BucketsBase + 4 * Bucket;
+ uint64_t BucketOffset = Offsets.BucketsBase + 4 * Bucket;
return Section.AccelSection.getU32(&BucketOffset);
}
uint32_t DWARFDebugNames::NameIndex::getHashArrayEntry(uint32_t Index) const {
assert(0 < Index && Index <= Hdr.NameCount);
- uint64_t HashOffset = HashesBase + 4 * (Index - 1);
+ uint64_t HashOffset = Offsets.HashesBase + 4 * (Index - 1);
return Section.AccelSection.getU32(&HashOffset);
}
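A worked example of the layout findDebugNamesOffsets computes, for a hypothetical DWARF32 index whose header ends at offset 40 with CompUnitCount = 2, no type units, BucketCount = 3, NameCount = 5 and AbbrevTableSize = 16: CUsBase = 40; BucketsBase = 40 + 2*4 = 48; HashesBase = 48 + 3*4 = 60; StringOffsetsBase = 60 + 5*4 = 80; EntryOffsetsBase = 80 + 5*4 = 100; EntriesBase = 100 + 5*4 + 16 = 136. With the bases precomputed, NameIndex::extract() only has to range-check the abbreviation table against the section before walking it.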
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index 28f0564..572628f 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -389,9 +389,25 @@ Error DWARFDebugLine::Prologue::parse(
if (getVersion() >= 5) {
FormParams.AddrSize = DebugLineData.getU8(Cursor);
- assert((!Cursor || DebugLineData.getAddressSize() == 0 ||
- DebugLineData.getAddressSize() == getAddressSize()) &&
- "Line table header and data extractor disagree");
+ const uint8_t DataAddrSize = DebugLineData.getAddressSize();
+ const uint8_t PrologueAddrSize = getAddressSize();
+ if (Cursor) {
+ if (DataAddrSize == 0) {
+ if (PrologueAddrSize != 4 && PrologueAddrSize != 8) {
+ RecoverableErrorHandler(createStringError(
+ errc::not_supported,
+ "parsing line table prologue at offset 0x%8.8" PRIx64
+ ": invalid address size %" PRIu8,
+ PrologueOffset, PrologueAddrSize));
+ }
+ } else if (DataAddrSize != PrologueAddrSize) {
+ RecoverableErrorHandler(createStringError(
+ errc::not_supported,
+ "parsing line table prologue at offset 0x%8.8" PRIx64 ": address "
+ "size %" PRIu8 " doesn't match architecture address size %" PRIu8,
+ PrologueOffset, PrologueAddrSize, DataAddrSize));
+ }
+ }
SegSelectorSize = DebugLineData.getU8(Cursor);
}
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 251485a..fba404c 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -292,8 +292,8 @@ static const Module *getModuleFromDPI(const DPMarker *Marker) {
return M ? M->getParent() : nullptr;
}
-static const Module *getModuleFromDPI(const DPValue *DPV) {
- return DPV->getMarker() ? getModuleFromDPI(DPV->getMarker()) : nullptr;
+static const Module *getModuleFromDPI(const DbgRecord *DR) {
+ return DR->getMarker() ? getModuleFromDPI(DR->getMarker()) : nullptr;
}
static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
@@ -1141,12 +1141,14 @@ void SlotTracker::processFunctionMetadata(const Function &F) {
void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) {
if (const DPValue *DPV = dyn_cast<const DPValue>(&DR)) {
CreateMetadataSlot(DPV->getVariable());
- CreateMetadataSlot(DPV->getDebugLoc());
if (DPV->isDbgAssign())
CreateMetadataSlot(DPV->getAssignID());
+ } else if (const DPLabel *DPL = dyn_cast<const DPLabel>(&DR)) {
+ CreateMetadataSlot(DPL->getLabel());
} else {
llvm_unreachable("unsupported DbgRecord kind");
}
+ CreateMetadataSlot(DR.getDebugLoc());
}
void SlotTracker::processInstructionMetadata(const Instruction &I) {
@@ -1505,16 +1507,39 @@ static void WriteAPFloatInternal(raw_ostream &Out, const APFloat &APF) {
static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
AsmWriterContext &WriterCtx) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
- if (CI->getType()->isIntegerTy(1)) {
- Out << (CI->getZExtValue() ? "true" : "false");
- return;
+ Type *Ty = CI->getType();
+
+ if (Ty->isVectorTy()) {
+ Out << "splat (";
+ WriterCtx.TypePrinter->print(Ty->getScalarType(), Out);
+ Out << " ";
}
- Out << CI->getValue();
+
+ if (Ty->getScalarType()->isIntegerTy(1))
+ Out << (CI->getZExtValue() ? "true" : "false");
+ else
+ Out << CI->getValue();
+
+ if (Ty->isVectorTy())
+ Out << ")";
+
return;
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+ Type *Ty = CFP->getType();
+
+ if (Ty->isVectorTy()) {
+ Out << "splat (";
+ WriterCtx.TypePrinter->print(Ty->getScalarType(), Out);
+ Out << " ";
+ }
+
WriteAPFloatInternal(Out, CFP->getValueAPF());
+
+ if (Ty->isVectorTy())
+ Out << ")";
+
return;
}
@@ -2676,6 +2701,7 @@ public:
void printInstruction(const Instruction &I);
void printDPMarker(const DPMarker &DPI);
void printDPValue(const DPValue &DPI);
+ void printDPLabel(const DPLabel &DPL);
void printDbgRecord(const DbgRecord &DPI);
void printUseListOrder(const Value *V, const std::vector<unsigned> &Shuffle);
@@ -4579,8 +4605,10 @@ void AssemblyWriter::printDPMarker(const DPMarker &Marker) {
void AssemblyWriter::printDbgRecord(const DbgRecord &DR) {
if (auto *DPV = dyn_cast<DPValue>(&DR))
printDPValue(*DPV);
+ else if (auto *DPL = dyn_cast<DPLabel>(&DR))
+ printDPLabel(*DPL);
else
- llvm_unreachable("unsupported dbg record");
+ llvm_unreachable("Unexpected DbgRecord kind");
}
void AssemblyWriter::printDPValue(const DPValue &Value) {
@@ -4622,6 +4650,16 @@ void AssemblyWriter::printDPValue(const DPValue &Value) {
Out << " }";
}
+void AssemblyWriter::printDPLabel(const DPLabel &Label) {
+ // There's no formal representation of a DPLabel -- print purely as
+ // a debugging aid.
+ Out << " DPLabel { ";
+ auto WriterCtx = getContext();
+ WriteAsOperandInternal(Out, Label.getLabel(), WriterCtx, true);
+ Out << " marker @" << Label.getMarker();
+ Out << " }";
+}
+
void AssemblyWriter::printMetadataAttachments(
const SmallVectorImpl<std::pair<unsigned, MDNode *>> &MDs,
StringRef Separator) {
@@ -4885,6 +4923,12 @@ void DPMarker::print(raw_ostream &ROS, ModuleSlotTracker &MST,
W.printDPMarker(*this);
}
+void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const {
+
+ ModuleSlotTracker MST(getModuleFromDPI(this), true);
+ print(ROS, MST, IsForDebug);
+}
+
void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST,
bool IsForDebug) const {
// There's no formal representation of a DPValue -- print purely as a
@@ -4904,6 +4948,24 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST,
W.printDPValue(*this);
}
+void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST,
+ bool IsForDebug) const {
+  // There's no formal representation of a DPLabel -- print purely as
+ // a debugging aid.
+ formatted_raw_ostream OS(ROS);
+ SlotTracker EmptySlotTable(static_cast<const Module *>(nullptr));
+ SlotTracker &SlotTable =
+ MST.getMachine() ? *MST.getMachine() : EmptySlotTable;
+ auto incorporateFunction = [&](const Function *F) {
+ if (F)
+ MST.incorporateFunction(*F);
+ };
+ incorporateFunction(Marker->getParent() ? Marker->getParent()->getParent()
+ : nullptr);
+ AssemblyWriter W(OS, SlotTable, getModuleFromDPI(this), nullptr, IsForDebug);
+ W.printDPLabel(*this);
+}
+
void Value::print(raw_ostream &ROS, bool IsForDebug) const {
bool ShouldInitializeAllMetadata = false;
if (auto *I = dyn_cast<Instruction>(this))
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index fd51602..1907677 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -2045,6 +2045,11 @@ static bool isEqual(const Function &Caller, const Function &Callee) {
Callee.getFnAttribute(AttrClass::getKind());
}
+static bool isEqual(const Function &Caller, const Function &Callee,
+ const StringRef &AttrName) {
+ return Caller.getFnAttribute(AttrName) == Callee.getFnAttribute(AttrName);
+}
+
/// Compute the logical AND of the attributes of the caller and the
/// callee.
///
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 06807544..6ea876f 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -81,6 +81,12 @@ void BasicBlock::convertToNewDbgValues() {
continue;
}
+ if (DbgLabelInst *DLI = dyn_cast<DbgLabelInst>(&I)) {
+ DPVals.push_back(new DPLabel(DLI->getLabel(), DLI->getDebugLoc()));
+ DLI->eraseFromParent();
+ continue;
+ }
+
if (DPVals.empty())
continue;
@@ -107,16 +113,12 @@ void BasicBlock::convertFromNewDbgValues() {
continue;
DPMarker &Marker = *Inst.DbgMarker;
- for (DbgRecord &DR : Marker.getDbgValueRange()) {
- if (auto *DPV = dyn_cast<DPValue>(&DR))
- InstList.insert(Inst.getIterator(),
- DPV->createDebugIntrinsic(getModule(), nullptr));
- else
- llvm_unreachable("unsupported DbgRecord kind");
- }
+ for (DbgRecord &DR : Marker.getDbgValueRange())
+ InstList.insert(Inst.getIterator(),
+ DR.createDebugIntrinsic(getModule(), nullptr));
Marker.eraseFromParent();
- };
+ }
// Assume no trailing DPValues: we could technically create them at the end
// of the block, after a terminator, but this would be non-cannonical and
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index a38b912..e6b92aa 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -35,6 +35,20 @@
using namespace llvm;
using namespace PatternMatch;
+// A set of temporary options to help migrate how splats are represented.
+static cl::opt<bool> UseConstantIntForFixedLengthSplat(
+ "use-constant-int-for-fixed-length-splat", cl::init(false), cl::Hidden,
+ cl::desc("Use ConstantInt's native fixed-length vector splat support."));
+static cl::opt<bool> UseConstantFPForFixedLengthSplat(
+ "use-constant-fp-for-fixed-length-splat", cl::init(false), cl::Hidden,
+ cl::desc("Use ConstantFP's native fixed-length vector splat support."));
+static cl::opt<bool> UseConstantIntForScalableSplat(
+ "use-constant-int-for-scalable-splat", cl::init(false), cl::Hidden,
+ cl::desc("Use ConstantInt's native scalable vector splat support."));
+static cl::opt<bool> UseConstantFPForScalableSplat(
+ "use-constant-fp-for-scalable-splat", cl::init(false), cl::Hidden,
+ cl::desc("Use ConstantFP's native scalable vector splat support."));
+
//===----------------------------------------------------------------------===//
// Constant Class
//===----------------------------------------------------------------------===//
@@ -825,9 +839,11 @@ bool Constant::isManifestConstant() const {
// ConstantInt
//===----------------------------------------------------------------------===//
-ConstantInt::ConstantInt(IntegerType *Ty, const APInt &V)
+ConstantInt::ConstantInt(Type *Ty, const APInt &V)
: ConstantData(Ty, ConstantIntVal), Val(V) {
- assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type");
+ assert(V.getBitWidth() ==
+ cast<IntegerType>(Ty->getScalarType())->getBitWidth() &&
+ "Invalid constant for type");
}
ConstantInt *ConstantInt::getTrue(LLVMContext &Context) {
@@ -885,6 +901,26 @@ ConstantInt *ConstantInt::get(LLVMContext &Context, const APInt &V) {
return Slot.get();
}
+// Get a ConstantInt vector with each lane set to the same APInt.
+ConstantInt *ConstantInt::get(LLVMContext &Context, ElementCount EC,
+ const APInt &V) {
+ // Get an existing value or the insertion position.
+ std::unique_ptr<ConstantInt> &Slot =
+ Context.pImpl->IntSplatConstants[std::make_pair(EC, V)];
+ if (!Slot) {
+ IntegerType *ITy = IntegerType::get(Context, V.getBitWidth());
+ VectorType *VTy = VectorType::get(ITy, EC);
+ Slot.reset(new ConstantInt(VTy, V));
+ }
+
+#ifndef NDEBUG
+ IntegerType *ITy = IntegerType::get(Context, V.getBitWidth());
+ VectorType *VTy = VectorType::get(ITy, EC);
+ assert(Slot->getType() == VTy);
+#endif
+ return Slot.get();
+}
+
Constant *ConstantInt::get(Type *Ty, uint64_t V, bool isSigned) {
Constant *C = get(cast<IntegerType>(Ty->getScalarType()), V, isSigned);
@@ -1024,6 +1060,26 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) {
return Slot.get();
}
+// Get a ConstantFP vector with each lane set to the same APFloat.
+ConstantFP *ConstantFP::get(LLVMContext &Context, ElementCount EC,
+ const APFloat &V) {
+ // Get an existing value or the insertion position.
+ std::unique_ptr<ConstantFP> &Slot =
+ Context.pImpl->FPSplatConstants[std::make_pair(EC, V)];
+ if (!Slot) {
+ Type *EltTy = Type::getFloatingPointTy(Context, V.getSemantics());
+ VectorType *VTy = VectorType::get(EltTy, EC);
+ Slot.reset(new ConstantFP(VTy, V));
+ }
+
+#ifndef NDEBUG
+ Type *EltTy = Type::getFloatingPointTy(Context, V.getSemantics());
+ VectorType *VTy = VectorType::get(EltTy, EC);
+ assert(Slot->getType() == VTy);
+#endif
+ return Slot.get();
+}
+
Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) {
const fltSemantics &Semantics = Ty->getScalarType()->getFltSemantics();
Constant *C = get(Ty->getContext(), APFloat::getInf(Semantics, Negative));
@@ -1036,7 +1092,7 @@ Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) {
ConstantFP::ConstantFP(Type *Ty, const APFloat &V)
: ConstantData(Ty, ConstantFPVal), Val(V) {
- assert(&V.getSemantics() == &Ty->getFltSemantics() &&
+ assert(&V.getSemantics() == &Ty->getScalarType()->getFltSemantics() &&
"FP type Mismatch");
}
@@ -1356,11 +1412,13 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
bool isZero = C->isNullValue();
bool isUndef = isa<UndefValue>(C);
bool isPoison = isa<PoisonValue>(C);
+ bool isSplatFP = UseConstantFPForFixedLengthSplat && isa<ConstantFP>(C);
+ bool isSplatInt = UseConstantIntForFixedLengthSplat && isa<ConstantInt>(C);
- if (isZero || isUndef) {
+ if (isZero || isUndef || isSplatFP || isSplatInt) {
for (unsigned i = 1, e = V.size(); i != e; ++i)
if (V[i] != C) {
- isZero = isUndef = isPoison = false;
+ isZero = isUndef = isPoison = isSplatFP = isSplatInt = false;
break;
}
}
@@ -1371,6 +1429,12 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
return PoisonValue::get(T);
if (isUndef)
return UndefValue::get(T);
+ if (isSplatFP)
+ return ConstantFP::get(C->getContext(), T->getElementCount(),
+ cast<ConstantFP>(C)->getValue());
+ if (isSplatInt)
+ return ConstantInt::get(C->getContext(), T->getElementCount(),
+ cast<ConstantInt>(C)->getValue());
// Check to see if all of the elements are ConstantFP or ConstantInt and if
// the element type is compatible with ConstantDataVector. If so, use it.
@@ -1384,6 +1448,16 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) {
if (!EC.isScalable()) {
+ // Maintain special handling of zero.
+ if (!V->isNullValue()) {
+ if (UseConstantIntForFixedLengthSplat && isa<ConstantInt>(V))
+ return ConstantInt::get(V->getContext(), EC,
+ cast<ConstantInt>(V)->getValue());
+ if (UseConstantFPForFixedLengthSplat && isa<ConstantFP>(V))
+ return ConstantFP::get(V->getContext(), EC,
+ cast<ConstantFP>(V)->getValue());
+ }
+
// If this splat is compatible with ConstantDataVector, use it instead of
// ConstantVector.
if ((isa<ConstantFP>(V) || isa<ConstantInt>(V)) &&
@@ -1394,6 +1468,16 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) {
return get(Elts);
}
+ // Maintain special handling of zero.
+ if (!V->isNullValue()) {
+ if (UseConstantIntForScalableSplat && isa<ConstantInt>(V))
+ return ConstantInt::get(V->getContext(), EC,
+ cast<ConstantInt>(V)->getValue());
+ if (UseConstantFPForScalableSplat && isa<ConstantFP>(V))
+ return ConstantFP::get(V->getContext(), EC,
+ cast<ConstantFP>(V)->getValue());
+ }
+
Type *VTy = VectorType::get(V->getType(), EC);
if (V->isNullValue())
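A small usage sketch of the new splat getters, separate from the patch itself: each call returns a vector-typed ConstantInt/ConstantFP uniqued per (element count, value) through the new LLVMContextImpl maps, and the AsmWriter change above prints such constants with the splat (...) syntax. The cl::opt flags only gate whether existing splat-construction paths route here; the getters themselves can be called directly.

#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/TypeSize.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

void demoSplatConstants() {
  LLVMContext Ctx;
  // <4 x i32> splat of 7, represented by a single vector-typed ConstantInt.
  auto *CI = ConstantInt::get(Ctx, ElementCount::getFixed(4), APInt(32, 7));
  // <vscale x 2 x double> splat of 1.0, as a vector-typed ConstantFP.
  auto *CF = ConstantFP::get(Ctx, ElementCount::getScalable(2), APFloat(1.0));
  CI->print(outs()); // expected to print something like: <4 x i32> splat (i32 7)
  outs() << "\n";
  CF->print(outs()); // expected: <vscale x 2 x double> splat (double ...)
  outs() << "\n";
}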
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index eb18be5..389bac4 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -64,6 +64,9 @@ void DbgRecord::deleteRecord() {
case ValueKind:
delete cast<DPValue>(this);
return;
+ case LabelKind:
+ delete cast<DPLabel>(this);
+ return;
}
llvm_unreachable("unsupported DbgRecord kind");
}
@@ -73,6 +76,9 @@ void DbgRecord::print(raw_ostream &O, bool IsForDebug) const {
case ValueKind:
cast<DPValue>(this)->print(O, IsForDebug);
return;
+ case LabelKind:
+ cast<DPLabel>(this)->print(O, IsForDebug);
+ return;
};
llvm_unreachable("unsupported DbgRecord kind");
}
@@ -83,6 +89,9 @@ void DbgRecord::print(raw_ostream &O, ModuleSlotTracker &MST,
case ValueKind:
cast<DPValue>(this)->print(O, MST, IsForDebug);
return;
+ case LabelKind:
+ cast<DPLabel>(this)->print(O, MST, IsForDebug);
+ return;
};
llvm_unreachable("unsupported DbgRecord kind");
}
@@ -93,16 +102,23 @@ bool DbgRecord::isIdenticalToWhenDefined(const DbgRecord &R) const {
switch (RecordKind) {
case ValueKind:
return cast<DPValue>(this)->isIdenticalToWhenDefined(*cast<DPValue>(&R));
+ case LabelKind:
+ return cast<DPLabel>(this)->getLabel() == cast<DPLabel>(R).getLabel();
};
llvm_unreachable("unsupported DbgRecord kind");
}
bool DbgRecord::isEquivalentTo(const DbgRecord &R) const {
- if (RecordKind != R.RecordKind)
- return false;
+ return getDebugLoc() == R.getDebugLoc() && isIdenticalToWhenDefined(R);
+}
+
+DbgInfoIntrinsic *
+DbgRecord::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const {
switch (RecordKind) {
case ValueKind:
- return cast<DPValue>(this)->isEquivalentTo(*cast<DPValue>(&R));
+ return cast<DPValue>(this)->createDebugIntrinsic(M, InsertBefore);
+ case LabelKind:
+ return cast<DPLabel>(this)->createDebugIntrinsic(M, InsertBefore);
};
llvm_unreachable("unsupported DbgRecord kind");
}
@@ -307,12 +323,16 @@ DbgRecord *DbgRecord::clone() const {
switch (RecordKind) {
case ValueKind:
return cast<DPValue>(this)->clone();
+ case LabelKind:
+ return cast<DPLabel>(this)->clone();
};
llvm_unreachable("unsupported DbgRecord kind");
}
DPValue *DPValue::clone() const { return new DPValue(*this); }
+DPLabel *DPLabel::clone() const { return new DPLabel(Label, getDebugLoc()); }
+
DbgVariableIntrinsic *
DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const {
[[maybe_unused]] DICompileUnit *Unit =
@@ -368,6 +388,20 @@ DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const {
return DVI;
}
+DbgLabelInst *DPLabel::createDebugIntrinsic(Module *M,
+ Instruction *InsertBefore) const {
+ auto *LabelFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_label);
+ Value *Args[] = {
+ MetadataAsValue::get(getDebugLoc()->getContext(), getLabel())};
+ DbgLabelInst *DbgLabel = cast<DbgLabelInst>(
+ CallInst::Create(LabelFn->getFunctionType(), LabelFn, Args));
+ DbgLabel->setTailCall();
+ DbgLabel->setDebugLoc(getDebugLoc());
+ if (InsertBefore)
+ DbgLabel->insertBefore(InsertBefore);
+ return DbgLabel;
+}
+
Value *DPValue::getAddress() const {
auto *MD = getRawAddress();
if (auto *V = dyn_cast<ValueAsMetadata>(MD))
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index ce0df53..fc5c9b2 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -3525,6 +3525,7 @@ CastInst *CastInst::CreateFPCast(Value *C, Type *Ty,
"Invalid cast");
unsigned SrcBits = C->getType()->getScalarSizeInBits();
unsigned DstBits = Ty->getScalarSizeInBits();
+ assert((C->getType() == Ty || SrcBits != DstBits) && "Invalid cast");
Instruction::CastOps opcode =
(SrcBits == DstBits ? Instruction::BitCast :
(SrcBits > DstBits ? Instruction::FPTrunc : Instruction::FPExt));
diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp
index 15c90a4..a0bf9ca 100644
--- a/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/llvm/lib/IR/LLVMContextImpl.cpp
@@ -119,7 +119,9 @@ LLVMContextImpl::~LLVMContextImpl() {
IntZeroConstants.clear();
IntOneConstants.clear();
IntConstants.clear();
+ IntSplatConstants.clear();
FPConstants.clear();
+ FPSplatConstants.clear();
CDSConstants.clear();
// Destroy attribute node lists.
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 6a20291..2ee1080 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -1488,8 +1488,12 @@ public:
DenseMap<unsigned, std::unique_ptr<ConstantInt>> IntZeroConstants;
DenseMap<unsigned, std::unique_ptr<ConstantInt>> IntOneConstants;
DenseMap<APInt, std::unique_ptr<ConstantInt>> IntConstants;
+ DenseMap<std::pair<ElementCount, APInt>, std::unique_ptr<ConstantInt>>
+ IntSplatConstants;
DenseMap<APFloat, std::unique_ptr<ConstantFP>> FPConstants;
+ DenseMap<std::pair<ElementCount, APFloat>, std::unique_ptr<ConstantFP>>
+ FPSplatConstants;
FoldingSet<AttributeImpl> AttrsSet;
FoldingSet<AttributeListImpl> AttrsLists;
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 7b3a759..6cfe677 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -330,8 +330,6 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
report_fatal_error(Twine("unable to parse pass pipeline description '") +
Conf.OptPipeline + "': " + toString(std::move(Err)));
}
- } else if (Conf.UseDefaultPipeline) {
- MPM.addPass(PB.buildPerModuleDefaultPipeline(OL));
} else if (IsThinLTO) {
MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary));
} else {
diff --git a/llvm/lib/Object/SymbolSize.cpp b/llvm/lib/Object/SymbolSize.cpp
index cb20fef..635cd83 100644
--- a/llvm/lib/Object/SymbolSize.cpp
+++ b/llvm/lib/Object/SymbolSize.cpp
@@ -65,6 +65,13 @@ llvm::object::computeSymbolSizes(const ObjectFile &O) {
return Ret;
}
+ if (const auto *E = dyn_cast<WasmObjectFile>(&O)) {
+ for (SymbolRef Sym : E->symbols()) {
+ Ret.push_back({Sym, E->getSymbolSize(Sym)});
+ }
+ return Ret;
+ }
+
// Collect sorted symbol addresses. Include dummy addresses for the end
// of each section.
std::vector<SymEntry> Addresses;
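A hedged usage sketch of the path this enables: computeSymbolSizes can now answer directly for wasm objects via WasmObjectFile::getSymbolSize instead of falling through to the address-sorting fallback. The file name and error handling below are illustrative only.

#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/SymbolSize.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::object;

void printSymbolSizes() {
  auto BinOrErr = ObjectFile::createObjectFile("input.wasm"); // placeholder path
  if (!BinOrErr) {
    logAllUnhandledErrors(BinOrErr.takeError(), errs(), "error: ");
    return;
  }
  for (const auto &[Sym, Size] : computeSymbolSizes(*BinOrErr->getBinary())) {
    if (Expected<StringRef> Name = Sym.getName())
      outs() << *Name << " " << Size << "\n";
    else
      consumeError(Name.takeError());
  }
}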
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index f26d95a..fed7a14 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -91,6 +91,7 @@
#include "llvm/CodeGen/JMCInstrumenter.h"
#include "llvm/CodeGen/LowerEmuTLS.h"
#include "llvm/CodeGen/MIRPrinter.h"
+#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/SafeStack.h"
#include "llvm/CodeGen/SelectOptimize.h"
#include "llvm/CodeGen/ShadowStackGCLowering.h"
@@ -1260,6 +1261,28 @@ static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) {
}
template <typename CallbacksT>
+static bool isMachineFunctionPassName(StringRef Name, CallbacksT &Callbacks) {
+ // Explicitly handle pass manager names.
+ if (Name == "machine-function")
+ return true;
+
+ // Explicitly handle custom-parsed pass names.
+ if (parseRepeatPassName(Name))
+ return true;
+
+#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) \
+ if (Name == NAME) \
+ return true;
+#define MACHINE_FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
+ if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">") \
+ return true;
+
+#include "llvm/Passes/MachinePassRegistry.def"
+
+ return callbacksAcceptPassName<MachineFunctionPassManager>(Name, Callbacks);
+}
+
+template <typename CallbacksT>
static bool isLoopNestPassName(StringRef Name, CallbacksT &Callbacks,
bool &UseMemorySSA) {
UseMemorySSA = false;
@@ -1394,6 +1417,13 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
return Error::success();
}
+ if (Name == "machine-function") {
+ MachineFunctionPassManager MFPM;
+ if (auto Err = parseMachinePassPipeline(MFPM, InnerPipeline))
+ return Err;
+ MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
+ return Error::success();
+ }
if (auto Params = parseFunctionPipelineName(Name)) {
if (Params->second)
return make_error<StringError>(
@@ -1874,8 +1904,8 @@ Error PassBuilder::parseMachinePass(MachineFunctionPassManager &MFPM,
}
#include "llvm/Passes/MachinePassRegistry.def"
- for (auto &C : MachinePipelineParsingCallbacks)
- if (C(Name, MFPM))
+ for (auto &C : MachineFunctionPipelineParsingCallbacks)
+ if (C(Name, MFPM, E.InnerPipeline))
return Error::success();
return make_error<StringError>(
formatv("unknown machine pass '{0}'", Name).str(),
@@ -1942,7 +1972,8 @@ Error PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
FunctionAnalysisManager &FAM,
CGSCCAnalysisManager &CGAM,
- ModuleAnalysisManager &MAM) {
+ ModuleAnalysisManager &MAM,
+ MachineFunctionAnalysisManager *MFAM) {
MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); });
MAM.registerPass([&] { return CGSCCAnalysisManagerModuleProxy(CGAM); });
CGAM.registerPass([&] { return ModuleAnalysisManagerCGSCCProxy(MAM); });
@@ -1950,6 +1981,14 @@ void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); });
FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); });
LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); });
+ if (MFAM) {
+ MAM.registerPass(
+ [&] { return MachineFunctionAnalysisManagerModuleProxy(*MFAM); });
+ MFAM->registerPass(
+ [&] { return ModuleAnalysisManagerMachineFunctionProxy(MAM); });
+ MFAM->registerPass(
+ [&] { return FunctionAnalysisManagerMachineFunctionProxy(FAM); });
+ }
}
Error PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
@@ -1991,6 +2030,9 @@ Error PassBuilder::parsePassPipeline(ModulePassManager &MPM,
UseMemorySSA)) {
Pipeline = {{"function", {{UseMemorySSA ? "loop-mssa" : "loop",
std::move(*Pipeline)}}}};
+ } else if (isMachineFunctionPassName(
+ FirstName, MachineFunctionPipelineParsingCallbacks)) {
+ Pipeline = {{"machine-function", std::move(*Pipeline)}};
} else {
for (auto &C : TopLevelPipelineParsingCallbacks)
if (C(MPM, *Pipeline))
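A minimal sketch of driving the new wiring from C++, not part of the patch: build the usual analysis managers plus a MachineFunctionAnalysisManager, cross-register the new proxies, and parse a pipeline that nests machine passes under machine-function(...). The inner pass name is a placeholder for anything listed in MachinePassRegistry.def, and registration of the machine-function analyses themselves is elided here.

#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

void parseMachinePipelineSketch() {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  MachineFunctionAnalysisManager MFAM;

  PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  // Wires up the MachineFunctionAnalysisManager proxies added above.
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM);

  ModulePassManager MPM;
  // "<machine-pass>" is a placeholder pass name.
  if (auto Err = PB.parsePassPipeline(MPM, "machine-function(<machine-pass>)"))
    errs() << toString(std::move(Err)) << "\n";
}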
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 142bd50..17b55b6 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -247,7 +247,7 @@ static cl::opt<bool>
static cl::opt<bool> EnableJumpTableToSwitch(
"enable-jump-table-to-switch",
- cl::desc("Enable JumpTableToSwitch pass (default = off)"));
+ cl::desc("Enable JumpTableToSwitch pass (default = on)"), cl::init(true));
// This option is used in simplifying testing SampleFDO optimizations for
// profile loading.
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index c62582a..a99856d 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -712,7 +712,7 @@ bool AArch64Arm64ECCallLowering::processFunction(
// name (emitting the definition) can grab it from the metadata.
//
// FIXME: Handle functions with weak linkage?
- if (F.hasExternalLinkage() || F.hasWeakLinkage() || F.hasLinkOnceLinkage()) {
+ if (!F.hasLocalLinkage() || F.hasAddressTaken()) {
if (std::optional<std::string> MangledName =
getArm64ECMangledFunctionName(F.getName().str())) {
F.setMetadata("arm64ec_unmangled_name",
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 5b5ffd7..4fa719a 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1121,7 +1121,8 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() {
TS->emitDirectiveVariantPCS(CurrentFnSym);
}
- if (TM.getTargetTriple().isWindowsArm64EC()) {
+ if (TM.getTargetTriple().isWindowsArm64EC() &&
+ !MF->getFunction().hasLocalLinkage()) {
// For ARM64EC targets, a function definition's name is mangled differently
// from the normal symbol. We emit the alias from the unmangled symbol to
// mangled symbol name here.
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 3485edb..5cc612e 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -239,11 +239,6 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
cl::desc("enable use of redzone on AArch64"),
cl::init(false), cl::Hidden);
-static cl::opt<bool>
- ReverseCSRRestoreSeq("reverse-csr-restore-seq",
- cl::desc("reverse the CSR restore sequence"),
- cl::init(false), cl::Hidden);
-
static cl::opt<bool> StackTaggingMergeSetTag(
"stack-tagging-merge-settag",
cl::desc("merge settag instruction in function epilog"), cl::init(true),
@@ -307,8 +302,6 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
return false;
if (!EnableHomogeneousPrologEpilog)
return false;
- if (ReverseCSRRestoreSeq)
- return false;
if (EnableRedZone)
return false;
@@ -3117,7 +3110,27 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
- auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
+ if (homogeneousPrologEpilog(MF, &MBB)) {
+ auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
+ .setMIFlag(MachineInstr::FrameDestroy);
+ for (auto &RPI : RegPairs) {
+ MIB.addReg(RPI.Reg1, RegState::Define);
+ MIB.addReg(RPI.Reg2, RegState::Define);
+ }
+ return true;
+ }
+
+  // For performance reasons, restore SVE registers in increasing order.
+ auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
+ auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
+ auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR);
+ std::reverse(PPRBegin, PPREnd);
+ auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
+ auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
+ auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR);
+ std::reverse(ZPRBegin, ZPREnd);
+
+ for (const RegPairInfo &RPI : RegPairs) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3191,42 +3204,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineMemOperand::MOLoad, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
-
- return MIB->getIterator();
- };
-
- // SVE objects are always restored in reverse order.
- for (const RegPairInfo &RPI : reverse(RegPairs))
- if (RPI.isScalable())
- EmitMI(RPI);
-
- if (homogeneousPrologEpilog(MF, &MBB)) {
- auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
- .setMIFlag(MachineInstr::FrameDestroy);
- for (auto &RPI : RegPairs) {
- MIB.addReg(RPI.Reg1, RegState::Define);
- MIB.addReg(RPI.Reg2, RegState::Define);
- }
- return true;
- }
-
- if (ReverseCSRRestoreSeq) {
- MachineBasicBlock::iterator First = MBB.end();
- for (const RegPairInfo &RPI : reverse(RegPairs)) {
- if (RPI.isScalable())
- continue;
- MachineBasicBlock::iterator It = EmitMI(RPI);
- if (First == MBB.end())
- First = It;
- }
- if (First != MBB.end())
- MBB.splice(MBBI, &MBB, First);
- } else {
- for (const RegPairInfo &RPI : RegPairs) {
- if (RPI.isScalable())
- continue;
- (void)EmitMI(RPI);
- }
}
return true;
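The restore path above now reverses just the contiguous PPR and ZPR runs inside RegPairs so SVE registers are restored in increasing order, instead of emitting all scalable pairs in reverse up front. A generic sketch of that find_if/find_if_not/reverse idiom, with illustrative names:

#include <algorithm>
#include <vector>

// Reverse the first contiguous run of elements matching P, in place.
template <typename T, typename Pred>
void reverseMatchingRun(std::vector<T> &V, Pred P) {
  auto Begin = std::find_if(V.begin(), V.end(), P);
  auto End = std::find_if_not(Begin, V.end(), P);
  std::reverse(Begin, End); // no-op when the run is empty
}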
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 184ebc1..3b92e95 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -541,10 +541,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
- setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ if (Subtarget->hasFPARMv8())
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+ if (Subtarget->hasFPARMv8())
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
@@ -947,9 +949,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
- setOperationAction(ISD::BITCAST, MVT::i16, Custom);
- setOperationAction(ISD::BITCAST, MVT::f16, Custom);
- setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
+ if (Subtarget->hasFPARMv8()) {
+ setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
+ }
// Indexed loads and stores are supported.
for (unsigned im = (unsigned)ISD::PRE_INC;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 436b21f..bec1348 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1308,6 +1308,8 @@ private:
bool preferScalarizeSplat(SDNode *N) const override;
unsigned getMinimumJumpTableEntries() const override;
+
+ bool softPromoteHalfType() const override { return true; }
};
namespace AArch64 {
diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
index 0ae9a69..1c577a2 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
@@ -419,10 +419,10 @@ def : InstRW<[TSV110Wr_12cyc_1MDU], (instregex "^(S|U)DIVWr$")>;
def : InstRW<[TSV110Wr_20cyc_1MDU], (instregex "^(S|U)DIVXr$")>;
def TSV110ReadMAW : SchedReadAdvance<2, [TSV110Wr_3cyc_1MDU]>;
-def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>;
def TSV110ReadMAQ : SchedReadAdvance<3, [TSV110Wr_4cyc_1MDU]>;
-def : InstRW<[TSV110Wr_4cyc_1MDU, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>;
-def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+def : InstRW<[TSV110Wr_4cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>;
+def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>;
def : InstRW<[TSV110Wr_4cyc_1MDU], (instregex "^(S|U)MULHrr$")>;
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 961dded..ef7c517 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -21,7 +21,6 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/StackSafetyAnalysis.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -520,7 +519,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
for (auto &I : SInfo.AllocasToInstrument) {
memtag::AllocaInfo &Info = I.second;
assert(Info.AI && SIB.isInterestingAlloca(*Info.AI));
- TrackingVH<Instruction> OldAI = Info.AI;
memtag::alignAndPadAlloca(Info, kTagGranuleSize);
AllocaInst *AI = Info.AI;
int Tag = NextTag;
@@ -534,7 +532,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
ConstantInt::get(IRB.getInt64Ty(), Tag)});
if (Info.AI->hasName())
TagPCall->setName(Info.AI->getName() + ".tag");
- Info.AI->replaceAllUsesWith(TagPCall);
+ // Does not replace metadata, so we don't have to handle DPValues.
+ Info.AI->replaceNonMetadataUsesWith(TagPCall);
TagPCall->setOperand(0, Info.AI);
// Calls to functions that may return twice (e.g. setjmp) confuse the
@@ -574,12 +573,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
for (auto *II : Info.LifetimeEnd)
II->eraseFromParent();
}
-
- // Fixup debug intrinsics to point to the new alloca.
- for (auto *DVI : Info.DbgVariableIntrinsics)
- DVI->replaceVariableLocationOp(OldAI, Info.AI);
- for (auto *DPV : Info.DbgVariableRecords)
- DPV->replaceVariableLocationOp(OldAI, Info.AI);
}
// If we have instrumented at least one alloca, all unrecognized lifetime
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 6655931..010e569 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2972,6 +2972,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
Op2Info);
+ case ISD::FREM:
+ // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
+ // those functions are not declared in the module.
+ if (!Ty->isVectorTy())
+ return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b9411e2..9218760 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -33,6 +33,12 @@ def rcp_sqrt_to_rsq : GICombineRule<
[{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
+def fdiv_by_sqrt_to_rsq_f16 : GICombineRule<
+ (defs root:$root),
+ (match (G_FSQRT f16:$sqrt, $x, (MIFlags FmContract)),
+ (G_FDIV f16:$dst, $y, $sqrt, (MIFlags FmContract)):$root,
+ [{ return matchFDivSqrtToRsqF16(*${root}); }]),
+ (apply [{ applyFDivSqrtToRsqF16(*${root}, ${x}.getReg()); }])>;
def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
@@ -156,7 +162,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
"AMDGPUPostLegalizerCombinerImpl",
[all_combines, gfx6gfx7_combines, gfx8_combines,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
- rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
+ rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 0d3b158..13d7510 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4824,9 +4824,8 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
return true;
}
-static const unsigned SPDenormModeBitField =
- AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
- (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+static constexpr unsigned SPDenormModeBitField =
+ AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
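For reference, the new helper is handed the field width (2) rather than width minus one, and selects the two FP32 denorm-mode bits at offset 4 of the MODE register. A hedged sketch of the encoding it is assumed to produce, with the bit positions inferred from the OFFSET_SHIFT_/WIDTH_M1_SHIFT_ constants the old expression used (id in bits [5:0], offset in [10:6], width-1 in [15:11]); the real implementation lives in the AMDGPU utility headers.

// Illustrative only, not the in-tree definition.
constexpr unsigned encodeHwregSketch(unsigned Id, unsigned Offset,
                                     unsigned Width) {
  return Id | (Offset << 6) | ((Width - 1) << 11);
}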
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index a1c34e9..82e17dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -83,6 +83,9 @@ public:
matchRcpSqrtToRsq(MachineInstr &MI,
std::function<void(MachineIRBuilder &)> &MatchInfo) const;
+ bool matchFDivSqrtToRsqF16(MachineInstr &MI) const;
+ void applyFDivSqrtToRsqF16(MachineInstr &MI, const Register &X) const;
+
// FIXME: Should be able to have 2 separate matchdatas rather than custom
// struct boilerplate.
struct CvtF32UByteMatchInfo {
@@ -334,6 +337,26 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
return false;
}
+bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
+ MachineInstr &MI) const {
+ Register Sqrt = MI.getOperand(2).getReg();
+ return MRI.hasOneNonDBGUse(Sqrt);
+}
+
+void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
+ MachineInstr &MI, const Register &X) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Y = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ uint32_t Flags = MI.getFlags();
+ Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
+ .addUse(X)
+ .setMIFlags(Flags)
+ .getReg(0);
+ B.buildFMul(Dst, RSQ, Y, Flags);
+ MI.eraseFromParent();
+}
+
bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
Register SrcReg = MI.getOperand(1).getReg();
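Under contract fast-math flags the new combine rewrites an f16 divide by a square root into a multiply by the hardware reciprocal square root, schematically y / sqrt(x) ==> rsq(x) * y, building llvm.amdgcn.rsq on x and an fmul carrying the original flags. The one-use check in matchFDivSqrtToRsqF16 keeps the transform from leaving the original G_FSQRT live alongside the new rsq when the sqrt result has other users.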
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 5b32b34..b7b471d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -7272,11 +7272,11 @@ ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
if (trySkipId("hwreg", AsmToken::LParen)) {
OperandInfoTy HwReg(OPR_ID_UNKNOWN);
- OperandInfoTy Offset(OFFSET_DEFAULT_);
- OperandInfoTy Width(WIDTH_DEFAULT_);
+ OperandInfoTy Offset(HwregOffset::Default);
+ OperandInfoTy Width(HwregSize::Default);
if (parseHwregBody(HwReg, Offset, Width) &&
validateHwreg(HwReg, Offset, Width)) {
- ImmVal = encodeHwreg(HwReg.Id, Offset.Id, Width.Id);
+ ImmVal = HwregEncoding::encode(HwReg.Id, Offset.Id, Width.Id);
} else {
return ParseStatus::Failure;
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 894607d..e1cca17 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -119,6 +119,12 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
return addOperand(Inst, DAsm->decodeSplitBarrier(Val));
}
+static DecodeStatus decodeDpp8FI(MCInst &Inst, unsigned Val, uint64_t Addr,
+ const MCDisassembler *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(Inst, DAsm->decodeDpp8FI(Val));
+}
+
#define DECODE_OPERAND(StaticDecoderName, DecoderName) \
static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm, \
uint64_t /*Addr*/, \
@@ -440,19 +446,6 @@ static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
return DecoderUInt128(Lo, Hi);
}
-// The disassembler is greedy, so we need to check FI operand value to
-// not parse a dpp if the correct literal is not set. For dpp16 the
-// autogenerated decoder checks the dpp literal
-static bool isValidDPP8(const MCInst &MI) {
- using namespace llvm::AMDGPU::DPP;
- int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi);
- assert(FiIdx != -1);
- if ((unsigned)FiIdx >= MI.getNumOperands())
- return false;
- unsigned Fi = MI.getOperand(FiIdx).getImm();
- return Fi == DPP8_FI_0 || Fi == DPP8_FI_1;
-}
-
DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes_,
uint64_t Address,
@@ -460,7 +453,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size());
Bytes = Bytes_.slice(0, MaxInstBytesNum);
- DecodeStatus Res = MCDisassembler::Fail;
+ // In case the opcode is not recognized we'll assume a Size of 4 bytes (unless
+ // there are fewer bytes left). This will be overridden on success.
+ Size = std::min((size_t)4, Bytes_.size());
+
do {
// ToDo: better to switch encoding length using some bit predicate
// but it is unknown yet, so try all we can
@@ -469,222 +465,147 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// encodings
if (isGFX11Plus() && Bytes.size() >= 12 ) {
DecoderUInt128 DecW = eat12Bytes(Bytes);
- Res =
- tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696,
- MI, DecW, Address, CS);
- if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
- break;
- MI = MCInst(); // clear
- Res =
- tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696,
- MI, DecW, Address, CS);
- if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
- break;
- MI = MCInst(); // clear
-
- const auto convertVOPDPP = [&]() {
- if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) {
- convertVOP3PDPPInst(MI);
- } else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) {
- convertVOPCDPPInst(MI); // Special VOP3 case
- } else {
- assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3);
- convertVOP3DPPInst(MI); // Regular VOP3 case
- }
- };
- Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696,
- MI, DecW, Address, CS);
- if (Res) {
- convertVOPDPP();
- break;
- }
- Res = tryDecodeInst(DecoderTableDPPGFX1296, DecoderTableDPPGFX12_FAKE1696,
- MI, DecW, Address, CS);
- if (Res) {
- convertVOPDPP();
- break;
- }
- Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS);
- if (Res)
+
+ if (tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
+ DecW, Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI,
+ DecW, Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS))
break;
}
+
// Reinitialize Bytes
Bytes = Bytes_.slice(0, MaxInstBytesNum);
if (Bytes.size() >= 8) {
const uint64_t QW = eatBytes<uint64_t>(Bytes);
- if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
- Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS);
- if (Res) {
- if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8)
- == -1)
- break;
- if (convertDPP8Inst(MI) == MCDisassembler::Success)
- break;
- MI = MCInst(); // clear
- }
- }
-
- Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS);
- if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
- break;
- MI = MCInst(); // clear
-
- Res = tryDecodeInst(DecoderTableDPP8GFX1164,
- DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS);
- if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
- break;
- MI = MCInst(); // clear
-
- Res = tryDecodeInst(DecoderTableDPP8GFX1264,
- DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS);
- if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
+ if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) &&
+ tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS))
break;
- MI = MCInst(); // clear
- Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS);
- if (Res) break;
-
- Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664,
- MI, QW, Address, CS);
- if (Res) {
- if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
- convertVOPCDPPInst(MI);
- break;
- }
-
- Res = tryDecodeInst(DecoderTableDPPGFX1264, DecoderTableDPPGFX12_FAKE1664,
- MI, QW, Address, CS);
- if (Res) {
- if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
- convertVOPCDPPInst(MI);
+ if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem) &&
+ tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS))
break;
- }
-
- if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) {
- Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS);
- if (Res)
- break;
- }
// Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
// v_mad_mixhi_f16 for FMA variants. Try to decode using this special
// table first so we print the correct name.
- if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts)) {
- Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS);
- if (Res)
- break;
- }
+ if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts) &&
+ tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS))
+ break;
- if (STI.hasFeature(AMDGPU::FeatureGFX940Insts)) {
- Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS);
- if (Res)
- break;
- }
+ if (STI.hasFeature(AMDGPU::FeatureGFX940Insts) &&
+ tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS))
+ break;
- if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
- Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS);
- if (Res)
- break;
- }
+ if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
+ tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS))
+ break;
- Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI,
- QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW,
+ Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI,
- QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
+ Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS))
break;
- Res = tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS))
break;
}
- // Reinitialize Bytes as DPP64 could have eaten too much
+ // Reinitialize Bytes
Bytes = Bytes_.slice(0, MaxInstBytesNum);
// Try decode 32-bit instruction
- if (Bytes.size() < 4) break;
- const uint32_t DW = eatBytes<uint32_t>(Bytes);
- Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS);
- if (Res) break;
+ if (Bytes.size() >= 4) {
+ const uint32_t DW = eatBytes<uint32_t>(Bytes);
- Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS);
- if (Res) break;
+ if (tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS))
+ break;
- Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS);
- if (Res) break;
+ if (tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS))
+ break;
- if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
- Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS);
- if (Res)
+ if (tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS))
break;
- }
- if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
- Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS);
- if (Res) break;
- }
+ if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
+ tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS))
+ break;
+
+ if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) &&
+ tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS))
+ break;
- Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS);
- if (Res) break;
+ if (tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS))
+ break;
- Res = tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
- Address, CS);
- if (Res) break;
+ if (tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
+ Address, CS))
+ break;
- Res = tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
- Address, CS);
+ if (tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
+ Address, CS))
+ break;
+ }
+
+ return MCDisassembler::Fail;
} while (false);
- if (Res && AMDGPU::isMAC(MI.getOpcode())) {
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DPP) {
+ if (isMacDPP(MI))
+ convertMacDPPInst(MI);
+
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
+ convertVOP3PDPPInst(MI);
+ else if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) ||
+ AMDGPU::isVOPC64DPP(MI.getOpcode()))
+ convertVOPCDPPInst(MI); // Special VOP3 case
+ else if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) !=
+ -1)
+ convertDPP8Inst(MI);
+ else if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3)
+ convertVOP3DPPInst(MI); // Regular VOP3 case
+ }
+
+ if (AMDGPU::isMAC(MI.getOpcode())) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
}
- if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
- MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) {
+ if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
+ MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
}
- if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
+ if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
!AMDGPU::hasGDS(STI)) {
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
}
- if (Res && (MCII->get(MI.getOpcode()).TSFlags &
- (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) {
+ if (MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD)) {
int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::cpol);
if (CPolPos != -1) {
@@ -700,9 +621,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- if (Res && (MCII->get(MI.getOpcode()).TSFlags &
- (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
- (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
+ if ((MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
+ (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
// GFX90A lost TFE, its place is occupied by ACC.
int TFEOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
@@ -713,8 +634,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- if (Res && (MCII->get(MI.getOpcode()).TSFlags &
- (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) {
+ if (MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) {
int SWZOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
if (SWZOpIdx != -1) {
@@ -724,7 +645,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG) {
int VAddr0Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
int RsrcIdx =
@@ -732,36 +653,32 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
if (VAddr0Idx >= 0 && NSAArgs > 0) {
unsigned NSAWords = (NSAArgs + 3) / 4;
- if (Bytes.size() < 4 * NSAWords) {
- Res = MCDisassembler::Fail;
- } else {
- for (unsigned i = 0; i < NSAArgs; ++i) {
- const unsigned VAddrIdx = VAddr0Idx + 1 + i;
- auto VAddrRCID =
- MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
- MI.insert(MI.begin() + VAddrIdx,
- createRegOperand(VAddrRCID, Bytes[i]));
- }
- Bytes = Bytes.slice(4 * NSAWords);
+ if (Bytes.size() < 4 * NSAWords)
+ return MCDisassembler::Fail;
+ for (unsigned i = 0; i < NSAArgs; ++i) {
+ const unsigned VAddrIdx = VAddr0Idx + 1 + i;
+ auto VAddrRCID =
+ MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
+ MI.insert(MI.begin() + VAddrIdx, createRegOperand(VAddrRCID, Bytes[i]));
}
+ Bytes = Bytes.slice(4 * NSAWords);
}
- if (Res)
- Res = convertMIMGInst(MI);
+ convertMIMGInst(MI);
}
- if (Res && (MCII->get(MI.getOpcode()).TSFlags &
- (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE)))
- Res = convertMIMGInst(MI);
+ if (MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE))
+ convertMIMGInst(MI);
- if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP))
- Res = convertEXPInst(MI);
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP)
+ convertEXPInst(MI);
- if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP))
- Res = convertVINTERPInst(MI);
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP)
+ convertVINTERPInst(MI);
- if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA))
- Res = convertSDWAInst(MI);
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA)
+ convertSDWAInst(MI);
int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdst_in);
@@ -782,27 +699,23 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
int ImmLitIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm);
bool IsSOPK = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SOPK;
- if (Res && ImmLitIdx != -1 && !IsSOPK)
- Res = convertFMAanyK(MI, ImmLitIdx);
+ if (ImmLitIdx != -1 && !IsSOPK)
+ convertFMAanyK(MI, ImmLitIdx);
- // if the opcode was not recognized we'll assume a Size of 4 bytes
- // (unless there are fewer bytes left)
- Size = Res ? (MaxInstBytesNum - Bytes.size())
- : std::min((size_t)4, Bytes_.size());
- return Res;
+ Size = MaxInstBytesNum - Bytes.size();
+ return MCDisassembler::Success;
}
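Editor's sketch (not part of the patch): the getInstruction() hunks above drop the threaded Res status entirely. Each decoder table is tried with a plain if/break, a word that matches no table returns MCDisassembler::Fail right away, and the later operand fix-ups no longer guard on Res. A minimal standalone model of that control-flow shape, with hypothetical table and helper names rather than the real disassembler API:

#include <cstdint>
#include <initializer_list>

enum class DecodeStatus { Fail, Success };
struct Inst { unsigned Opcode = 0; };

// Stand-in for tryDecodeInst() against one per-subtarget decoder table.
static bool tryTable(unsigned Table, uint64_t Bits, Inst &MI) {
  if (Table == 2) {            // pretend only table 2 matches
    MI.Opcode = unsigned(Bits & 0xff);
    return true;
  }
  return false;
}

// The conversion helpers become void: once decoding succeeded they only
// rewrite operands and cannot fail, so no status needs to be threaded back.
static void fixUpOperands(Inst &MI) { (void)MI; }

static DecodeStatus decode(uint64_t Bits, Inst &MI) {
  bool Matched = false;
  for (unsigned Table : {0u, 1u, 2u, 3u}) {
    if (tryTable(Table, Bits, MI)) {
      Matched = true;
      break;
    }
  }
  if (!Matched)
    return DecodeStatus::Fail;  // early out replaces the old "if (Res)" chain
  fixUpOperands(MI);            // post-processing no longer checks Res
  return DecodeStatus::Success;
}

int main() {
  Inst MI;
  return decode(0x7e, MI) == DecodeStatus::Success ? 0 : 1;
}

The same shape is what lets the convert* helpers below return void instead of a DecodeStatus.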
-DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
if (STI.hasFeature(AMDGPU::FeatureGFX11Insts)) {
// The MCInst still has these fields even though they are no longer encoded
// in the GFX11 instruction.
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm);
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr);
}
- return MCDisassembler::Success;
}
-DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 ||
MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 ||
MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 ||
@@ -815,10 +728,9 @@ DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
// instruction.
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
}
- return MCDisassembler::Success;
}
-DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
STI.hasFeature(AMDGPU::FeatureGFX10)) {
if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst))
@@ -835,7 +747,6 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod);
}
}
- return MCDisassembler::Success;
}
struct VOPModifiers {
@@ -939,56 +850,40 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
AMDGPU::OpName::src2_modifiers);
}
-// We must check FI == literal to reject not genuine dpp8 insts, and we must
-// first add optional MI operands to check FI
-DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
+void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
- if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
- convertVOP3PDPPInst(MI);
- } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
- AMDGPU::isVOPC64DPP(Opc)) {
- convertVOPCDPPInst(MI);
- } else {
- if (isMacDPP(MI))
- convertMacDPPInst(MI);
+ int VDstInIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
+ if (VDstInIdx != -1)
+ insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
- int VDstInIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
- if (VDstInIdx != -1)
- insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
+ if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
+ MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
+ insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
- if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
- MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
- insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
+ unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
+ convertTrue16OpSel(MI);
+ auto Mods = collectVOPModifiers(MI);
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
+ AMDGPU::OpName::op_sel);
+ } else {
+ // Insert dummy unused src modifiers.
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src0_modifiers);
- unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
- AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
- convertTrue16OpSel(MI);
- auto Mods = collectVOPModifiers(MI);
- insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
- AMDGPU::OpName::op_sel);
- } else {
- // Insert dummy unused src modifiers.
- if (MI.getNumOperands() < DescNumOps &&
- AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
- insertNamedMCOperand(MI, MCOperand::createImm(0),
- AMDGPU::OpName::src0_modifiers);
-
- if (MI.getNumOperands() < DescNumOps &&
- AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
- insertNamedMCOperand(MI, MCOperand::createImm(0),
- AMDGPU::OpName::src1_modifiers);
- }
+ AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src1_modifiers);
}
- return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
}
-DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
- if (isMacDPP(MI))
- convertMacDPPInst(MI);
-
+void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
convertTrue16OpSel(MI);
int VDstInIdx =
@@ -1008,13 +903,12 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
AMDGPU::OpName::op_sel);
}
- return MCDisassembler::Success;
}
// Note that before gfx10, the MIMG encoding provided no information about
// VADDR size. Consequently, decoded instructions always show address as if it
// has 1 dword, which may not actually be the case.
-DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
auto TSFlags = MCII->get(MI.getOpcode()).TSFlags;
int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
@@ -1043,7 +937,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
if (BaseOpcode->BVH) {
// Add A16 operand for intersect_ray instructions
addOperand(MI, MCOperand::createImm(BaseOpcode->A16));
- return MCDisassembler::Success;
+ return;
}
bool IsAtomic = (VDstIdx != -1);
@@ -1078,7 +972,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
if (!STI.hasFeature(AMDGPU::FeaturePartialNSAEncoding)) {
// The NSA encoding does not contain enough operands for the
// combination of base opcode / dimension. Should this be an error?
- return MCDisassembler::Success;
+ return;
}
IsPartialNSA = true;
}
@@ -1097,12 +991,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
DstSize += 1;
if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
- return MCDisassembler::Success;
+ return;
int NewOpcode =
AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize);
if (NewOpcode == -1)
- return MCDisassembler::Success;
+ return;
// Widen the register to the correct number of enabled channels.
unsigned NewVdata = AMDGPU::NoRegister;
@@ -1119,7 +1013,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
if (NewVdata == AMDGPU::NoRegister) {
// It's possible to encode this such that the low register + enabled
// components exceeds the register count.
- return MCDisassembler::Success;
+ return;
}
}
@@ -1137,7 +1031,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0,
&MRI.getRegClass(AddrRCID));
if (!NewVAddrSA)
- return MCDisassembler::Success;
+ return;
}
MI.setOpcode(NewOpcode);
@@ -1158,14 +1052,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
MI.erase(MI.begin() + VAddr0Idx + AddrSize,
MI.begin() + VAddr0Idx + Info->VAddrDwords);
}
-
- return MCDisassembler::Success;
}
// Opsel and neg bits are used both in src_modifiers and in standalone operands.
// The auto-generated decoder only adds them to src_modifiers, so manually add
// the bits to the other operands.
-DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
auto Mods = collectVOPModifiers(MI, true);
@@ -1190,12 +1082,10 @@ DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_hi))
insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi),
AMDGPU::OpName::neg_hi);
-
- return MCDisassembler::Success;
}
// Create dummy old operand and insert optional operands
-DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
+void AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
@@ -1212,11 +1102,9 @@ DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src1_modifiers);
- return MCDisassembler::Success;
}
-DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
- int ImmLitIdx) const {
+void AMDGPUDisassembler::convertFMAanyK(MCInst &MI, int ImmLitIdx) const {
assert(HasLiteral && "Should have decoded a literal");
const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
unsigned DescNumOps = Desc.getNumOperands();
@@ -1232,7 +1120,6 @@ DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
IsDeferredOp)
Op.setImm(Literal);
}
- return MCDisassembler::Success;
}
const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
@@ -1831,6 +1718,12 @@ MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const {
return decodeSrcOp(OPW32, Val);
}
+MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const {
+ if (Val != AMDGPU::DPP::DPP8_FI_0 && Val != AMDGPU::DPP::DPP8_FI_1)
+ return MCOperand();
+ return MCOperand::createImm(Val);
+}
+
bool AMDGPUDisassembler::isVI() const {
return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
}
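Editor's sketch (not part of the patch): decodeDpp8FI() above filters the FI operand at decode time, handing back an invalid MCOperand for anything other than the two legal dpp8 encodings, which is why convertDPP8Inst() no longer needs its isValidDPP8()/SoftFail check. A rough model of the idea; the 0xE9/0xEA values and the optional-based signalling are illustrative assumptions, not the real MC plumbing:

#include <optional>

// Assumed encodings for the two legal dpp8 FI source operand values.
constexpr unsigned Dpp8Fi0 = 0xE9;
constexpr unsigned Dpp8Fi1 = 0xEA;

// Returning std::nullopt plays the role of the invalid MCOperand: the
// decoder treats it as "this table entry does not match", so non-dpp8
// encodings are filtered out during decoding rather than after it.
std::optional<unsigned> decodeDpp8FI(unsigned Val) {
  if (Val != Dpp8Fi0 && Val != Dpp8Fi1)
    return std::nullopt;
  return Val;
}

int main() { return decodeDpp8FI(0xE9).has_value() ? 0 : 1; }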
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 3142b8a..2e1b6fb 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -194,15 +194,15 @@ public:
DecodeStatus decodeCOMPUTE_PGM_RSRC3(uint32_t FourByteBuffer,
raw_string_ostream &KdStream) const;
- DecodeStatus convertEXPInst(MCInst &MI) const;
- DecodeStatus convertVINTERPInst(MCInst &MI) const;
- DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
- DecodeStatus convertSDWAInst(MCInst &MI) const;
- DecodeStatus convertDPP8Inst(MCInst &MI) const;
- DecodeStatus convertMIMGInst(MCInst &MI) const;
- DecodeStatus convertVOP3DPPInst(MCInst &MI) const;
- DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
- DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
+ void convertEXPInst(MCInst &MI) const;
+ void convertVINTERPInst(MCInst &MI) const;
+ void convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
+ void convertSDWAInst(MCInst &MI) const;
+ void convertDPP8Inst(MCInst &MI) const;
+ void convertMIMGInst(MCInst &MI) const;
+ void convertVOP3DPPInst(MCInst &MI) const;
+ void convertVOP3PDPPInst(MCInst &MI) const;
+ void convertVOPCDPPInst(MCInst &MI) const;
void convertMacDPPInst(MCInst &MI) const;
void convertTrue16OpSel(MCInst &MI) const;
@@ -261,6 +261,7 @@ public:
MCOperand decodeBoolReg(unsigned Val) const;
MCOperand decodeSplitBarrier(unsigned Val) const;
+ MCOperand decodeDpp8FI(unsigned Val) const;
int getTTmpIdx(unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a727134..00fa93c 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -177,7 +177,7 @@ static bool isLdsDma(const MachineInstr &MI) {
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);
- return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
+ return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}
ScheduleHazardRecognizer::HazardType
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index a45fea6..a32be1e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1778,13 +1778,9 @@ void AMDGPUInstPrinter::printSDelayALU(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- unsigned Id;
- unsigned Offset;
- unsigned Width;
-
using namespace llvm::AMDGPU::Hwreg;
unsigned Val = MI->getOperand(OpNo).getImm();
- decodeHwreg(Val, Id, Offset, Width);
+ auto [Id, Offset, Width] = HwregEncoding::decode(Val);
StringRef HwRegName = getHwreg(Id, STI);
O << "hwreg(";
@@ -1793,9 +1789,8 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
} else {
O << Id;
}
- if (Width != WIDTH_DEFAULT_ || Offset != OFFSET_DEFAULT_) {
+ if (Width != HwregSize::Default || Offset != HwregOffset::Default)
O << ", " << Offset << ", " << Width;
- }
O << ')';
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 98310c3..0b516bf 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -549,33 +549,12 @@ enum Id { // HwRegCode, (6) [5:0]
ID_SQ_PERF_SNAPSHOT_DATA1 = 22,
ID_SQ_PERF_SNAPSHOT_PC_LO = 23,
ID_SQ_PERF_SNAPSHOT_PC_HI = 24,
-
- ID_SHIFT_ = 0,
- ID_WIDTH_ = 6,
- ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
};
enum Offset : unsigned { // Offset, (5) [10:6]
- OFFSET_DEFAULT_ = 0,
- OFFSET_SHIFT_ = 6,
- OFFSET_WIDTH_ = 5,
- OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
-
OFFSET_MEM_VIOL = 8,
};
-enum WidthMinusOne : unsigned { // WidthMinusOne, (5) [15:11]
- WIDTH_M1_DEFAULT_ = 31,
- WIDTH_M1_SHIFT_ = 11,
- WIDTH_M1_WIDTH_ = 5,
- WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_),
-};
-
-// Some values from WidthMinusOne mapped into Width domain.
-enum Width : unsigned {
- WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1,
-};
-
enum ModeRegisterMasks : uint32_t {
FP_ROUND_MASK = 0xf << 0, // Bits 0..3
FP_DENORM_MASK = 0xf << 4, // Bits 4..7
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index d02aee7..4f106bf 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -478,14 +478,13 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addImm(0);
Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
- addReg(FlatScrInitLo).
- addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
- (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
- addReg(FlatScrInitHi).
- addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
- (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
+ using namespace AMDGPU::Hwreg;
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
+ .addReg(FlatScrInitLo)
+ .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
+ .addReg(FlatScrInitHi)
+ .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 257dff6..d8f528d8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3960,7 +3960,7 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
assert(Op.getValueType() == MVT::i32);
uint32_t BothRoundHwReg =
- AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4);
+ AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
SDValue IntrinID =
@@ -4195,8 +4195,8 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
MachineBasicBlock::iterator I = LoopBB->end();
- const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
- AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
+ const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
+ AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
// Clear TRAP_STS.MEM_VIOL
BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
@@ -4999,18 +4999,16 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
// Otherwise there was overflow and the result is hi2:0. In both cases the
// result should represent the actual time at some point during the sequence
// of three getregs.
+ using namespace AMDGPU::Hwreg;
Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
- .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
- 0, 32));
+ .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
- .addImm(
- AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES, 0, 32));
+ .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
- .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
- 0, 32));
+ .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
.addReg(RegHi1)
.addReg(RegHi2);
@@ -5207,8 +5205,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
    // FIXME: This could be expressed as predicates on the immediate, but
    // tablegen doesn't allow a side-effect-free instruction in the output of a
    // side-effecting pattern.
- unsigned ID, Offset, Width;
- AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
+ auto [ID, Offset, Width] =
+ AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
if (ID != AMDGPU::Hwreg::ID_MODE)
return BB;
@@ -10495,9 +10493,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
DenominatorScaled, Flags);
- const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
- (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
- (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+ using namespace AMDGPU::Hwreg;
+ const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
const MachineFunction &MF = DAG.getMachineFunction();
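Editor's note on the LowerFDIV32 hunk just above: the removed expression packed a width-minus-one of 1 at WIDTH_M1_SHIFT_, and HwregEncoding::encode() stores Width - 1 internally, so encode(ID_MODE, 4, 2) yields the same immediate. A small constexpr check of that arithmetic, using the shift amounts from the enums deleted from SIDefines.h earlier in this diff and an assumed stand-in value for ID_MODE:

#include <cstdint>

constexpr unsigned OffsetShift = 6;   // was AMDGPU::Hwreg::OFFSET_SHIFT_
constexpr unsigned WidthM1Shift = 11; // was AMDGPU::Hwreg::WIDTH_M1_SHIFT_
constexpr unsigned IdMode = 1;        // assumed value of AMDGPU::Hwreg::ID_MODE

// The literal expression removed from LowerFDIV32.
constexpr uint64_t OldDenorm32Reg =
    IdMode | (4u << OffsetShift) | (1u << WidthM1Shift);

// What HwregEncoding::encode(ID_MODE, 4, 2) expands to: the size field is
// stored as Width - 1, so Width = 2 reproduces the old (1 << WIDTH_M1_SHIFT_).
constexpr uint64_t encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
  return Id | (uint64_t(Offset) << OffsetShift) |
         (uint64_t(Width - 1) << WidthM1Shift);
}

static_assert(OldDenorm32Reg == encodeHwreg(IdMode, 4, 2), "same immediate");

int main() { return 0; }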
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6ecb1c8..a6184c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -480,6 +480,10 @@ public:
// WaitEventType to corresponding counter values in InstCounterType.
virtual const unsigned *getWaitEventMask() const = 0;
+ // Returns a new waitcnt with all counters except VScnt set to 0. If
+  // IncludeVSCnt is true, VScnt is set to 0; otherwise it is set to ~0u.
+ virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
+
virtual ~WaitcntGenerator() = default;
};
@@ -516,6 +520,8 @@ public:
return WaitEventMaskForInstPreGFX12;
}
+
+ virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
@@ -549,6 +555,8 @@ public:
return WaitEventMaskForInstGFX12Plus;
}
+
+ virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
class SIInsertWaitcnts : public MachineFunctionPass {
@@ -1304,6 +1312,16 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
return Modified;
}
+AMDGPU::Waitcnt
+WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+ return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
+}
+
+AMDGPU::Waitcnt
+WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+ return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
+}
+
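Editor's sketch (not part of the patch): these two overrides replace the static Waitcnt::allZero()/allZeroExceptVsCnt() helpers, which are deleted from AMDGPUBaseInfo.h further down, with a per-generation virtual hook, so callers stop passing hasExtendedWaitCounts()/hasVscnt() around. A minimal model of the pattern with made-up field names rather than the real Waitcnt layout:

#include <cstdio>

struct Waitcnt { unsigned LoadCnt, ExpCnt, DsCnt, StoreCnt; };

struct WaitcntGenerator {
  // Each subtarget family decides which counters an "all zero" wait touches.
  virtual Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
  virtual ~WaitcntGenerator() = default;
};

struct PreGFX12Generator final : WaitcntGenerator {
  bool HasVscnt = true; // stands in for ST->hasVscnt()
  Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override {
    // ~0u means "leave this counter alone", matching the patch above.
    return {0, 0, 0, IncludeVSCnt && HasVscnt ? 0u : ~0u};
  }
};

int main() {
  PreGFX12Generator WCG;
  Waitcnt W = WCG.getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
  std::printf("StoreCnt = 0x%x\n", W.StoreCnt); // 0xffffffff: not waited on
  return 0;
}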
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
/// were added by previous passes. Currently this pass conservatively
@@ -1613,8 +1631,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
- Wait = Wait.combined(
- AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
+ Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
}
// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
// stores. In this case it can be useful to send a message to explicitly
@@ -1834,8 +1851,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
- Wait = Wait.combined(
- AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt()));
+ Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
@@ -1851,7 +1867,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.simplifyWaitcnt(Wait);
if (ForceEmitZeroWaitcnts)
- Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts());
+ Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
if (ForceEmitWaitcnt[LOAD_CNT])
Wait.LoadCnt = 0;
@@ -2089,7 +2105,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
if (callWaitsOnFunctionReturn(Inst)) {
// Act as a wait on everything
ScoreBrackets->applyWaitcnt(
- AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
+ WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
ScoreBrackets->setStateOnFunctionEntryOrReturn();
} else {
      // May need to wait for anything.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 97c7237..34cdb09 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -987,8 +987,8 @@ def SDWAVopcDst : BoolRC {
}
class NamedIntOperand<ValueType Type, string Prefix, bit Optional = 1,
- string ConvertMethod = "nullptr">
- : CustomOperand<Type, Optional, NAME> {
+ string name = NAME, string ConvertMethod = "nullptr">
+ : CustomOperand<Type, Optional, name> {
let ParserMethod =
"[this](OperandVector &Operands) -> ParseStatus { "#
"return parseIntWithPrefix(\""#Prefix#"\", Operands, "#
@@ -1090,9 +1090,12 @@ let DefaultValue = "0xf" in {
def DppRowMask : NamedIntOperand<i32, "row_mask">;
def DppBankMask : NamedIntOperand<i32, "bank_mask">;
}
-def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl", 1,
+def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl", 1, "DppBoundCtrl",
"[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }">;
-def DppFI : NamedIntOperand<i32, "fi">;
+
+let DecoderMethod = "decodeDpp8FI" in
+def Dpp8FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
+def Dpp16FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
def blgp : CustomOperand<i32, 1, "BLGP">;
def CBSZ : NamedIntOperand<i32, "cbsz">;
@@ -1823,7 +1826,7 @@ class getInsDPP16 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperan
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
- (ins DppFI:$fi));
+ (ins Dpp16FI:$fi));
}
class getInsDPP8 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC,
@@ -1831,7 +1834,7 @@ class getInsDPP8 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
- (ins dpp8:$dpp8, DppFI:$fi));
+ (ins dpp8:$dpp8, Dpp8FI:$fi));
}
class getInsVOP3DPPBase<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld> {
@@ -1851,12 +1854,12 @@ class getInsVOP3DPP<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit Has
class getInsVOP3DPP16<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
dag ret = !con(getInsVOP3DPP<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
- (ins DppFI:$fi));
+ (ins Dpp16FI:$fi));
}
class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
- (ins dpp8:$dpp8, DppFI:$fi));
+ (ins dpp8:$dpp8, Dpp8FI:$fi));
}
// Ins for SDWA
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index e62ad02..c01b126 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -225,11 +225,10 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
unsigned Offset = llvm::countr_zero<unsigned>(InstrMode.Mask);
unsigned Width = llvm::countr_one<unsigned>(InstrMode.Mask >> Offset);
unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
+ using namespace AMDGPU::Hwreg;
BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32))
.addImm(Value)
- .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
- (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
- (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
+ .addImm(HwregEncoding::encode(ID_MODE, Offset, Width));
++NumSetregInserted;
Changed = true;
InstrMode.Mask &= ~(((1 << Width) - 1) << Offset);
@@ -276,15 +275,11 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
// as we assume it has been inserted by a higher authority (this is
// likely to be a very rare occurrence).
unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
- if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) !=
- AMDGPU::Hwreg::ID_MODE)
+ using namespace AMDGPU::Hwreg;
+ auto [Id, Offset, Width] = HwregEncoding::decode(Dst);
+ if (Id != ID_MODE)
continue;
- unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >>
- AMDGPU::Hwreg::WIDTH_M1_SHIFT_) +
- 1;
- unsigned Offset =
- (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_;
unsigned Mask = maskTrailingOnes<unsigned>(Width) << Offset;
// If an InsertionPoint is set we will insert a setreg there.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index dacdf7b..ce91e05 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1698,22 +1698,14 @@ int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) {
return (Idx < 0) ? Idx : Opr[Idx].Encoding;
}
-bool isValidHwreg(int64_t Id) {
- return 0 <= Id && isUInt<ID_WIDTH_>(Id);
-}
+bool isValidHwreg(int64_t Id) { return 0 <= Id && isUInt<HwregId::Width>(Id); }
bool isValidHwregOffset(int64_t Offset) {
- return 0 <= Offset && isUInt<OFFSET_WIDTH_>(Offset);
+ return 0 <= Offset && isUInt<HwregOffset::Width>(Offset);
}
bool isValidHwregWidth(int64_t Width) {
- return 0 <= (Width - 1) && isUInt<WIDTH_M1_WIDTH_>(Width - 1);
-}
-
-uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
- return (Id << ID_SHIFT_) |
- (Offset << OFFSET_SHIFT_) |
- ((Width - 1) << WIDTH_M1_SHIFT_);
+ return 0 <= (Width - 1) && isUInt<HwregSize::Width>(Width - 1);
}
StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
@@ -1721,12 +1713,6 @@ StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
return (Idx < 0) ? "" : Opr[Idx].Name;
}
-void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) {
- Id = (Val & ID_MASK_) >> ID_SHIFT_;
- Offset = (Val & OFFSET_MASK_) >> OFFSET_SHIFT_;
- Width = ((Val & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
-}
-
} // namespace Hwreg
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f35e7744..6826cd2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -322,6 +322,35 @@ getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs,
} // end namespace IsaInfo
+// Represents a field in an encoded value.
+template <unsigned HighBit, unsigned LowBit, unsigned D = 0>
+struct EncodingField {
+ static_assert(HighBit >= LowBit, "Invalid bit range!");
+ static constexpr unsigned Offset = LowBit;
+ static constexpr unsigned Width = HighBit - LowBit + 1;
+
+ using ValueType = unsigned;
+ static constexpr ValueType Default = D;
+
+ ValueType Value;
+ constexpr EncodingField(ValueType Value) : Value(Value) {}
+
+ constexpr uint64_t encode() const { return Value; }
+ static ValueType decode(uint64_t Encoded) { return Encoded; }
+};
+
+// A helper for encoding and decoding multiple fields.
+template <typename... Fields> struct EncodingFields {
+ static constexpr uint64_t encode(Fields... Values) {
+ return ((Values.encode() << Values.Offset) | ...);
+ }
+
+ static std::tuple<typename Fields::ValueType...> decode(uint64_t Encoded) {
+ return {Fields::decode((Encoded >> Fields::Offset) &
+ maxUIntN(Fields::Width))...};
+ }
+};
+
LLVM_READONLY
int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
@@ -870,15 +899,6 @@ struct Waitcnt {
: LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {}
- static Waitcnt allZero(bool Extended, bool HasStorecnt) {
- return Extended ? Waitcnt(0, 0, 0, 0, 0, 0, 0)
- : Waitcnt(0, 0, 0, HasStorecnt ? 0 : ~0u);
- }
-
- static Waitcnt allZeroExceptVsCnt(bool Extended) {
- return Extended ? Waitcnt(0, 0, 0, ~0u, 0, 0, 0) : Waitcnt(0, 0, 0, ~0u);
- }
-
bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
bool hasWaitExceptStoreCnt() const {
@@ -1030,6 +1050,17 @@ unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);
namespace Hwreg {
+using HwregId = EncodingField<5, 0>;
+using HwregOffset = EncodingField<10, 6>;
+
+struct HwregSize : EncodingField<15, 11, 32> {
+ using EncodingField::EncodingField;
+ constexpr uint64_t encode() const { return Value - 1; }
+ static ValueType decode(uint64_t Encoded) { return Encoded + 1; }
+};
+
+using HwregEncoding = EncodingFields<HwregId, HwregOffset, HwregSize>;
+
LLVM_READONLY
int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI);
@@ -1043,13 +1074,8 @@ LLVM_READNONE
bool isValidHwregWidth(int64_t Width);
LLVM_READNONE
-uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width);
-
-LLVM_READNONE
StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI);
-void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width);
-
} // namespace Hwreg
namespace DepCtr {
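Editor's sketch (not part of the patch): taken together, the AMDGPUBaseInfo.h hunks give hwreg immediates a declarative layout. Each EncodingField carries its own bit range, HwregSize overrides encode/decode to store width-minus-one, and HwregEncoding::encode/decode replace the deleted shift-and-mask helpers. The following self-contained version mirrors that machinery; llvm::maxUIntN is replaced by a plain mask and the hwreg(id=1, offset=4, width=2) value is only an example:

#include <cstdint>
#include <cstdio>
#include <tuple>

template <unsigned HighBit, unsigned LowBit, unsigned D = 0>
struct EncodingField {
  static_assert(HighBit >= LowBit, "Invalid bit range!");
  static constexpr unsigned Offset = LowBit;
  static constexpr unsigned Width = HighBit - LowBit + 1;

  using ValueType = unsigned;
  static constexpr ValueType Default = D;

  ValueType Value;
  constexpr EncodingField(ValueType Value) : Value(Value) {}

  constexpr uint64_t encode() const { return Value; }
  static ValueType decode(uint64_t Encoded) { return Encoded; }
};

template <typename... Fields> struct EncodingFields {
  static constexpr uint64_t encode(Fields... Values) {
    return ((Values.encode() << Values.Offset) | ...);
  }
  static std::tuple<typename Fields::ValueType...> decode(uint64_t Encoded) {
    // (1 << Width) - 1 stands in for llvm::maxUIntN(Width).
    return {Fields::decode((Encoded >> Fields::Offset) &
                           ((1ull << Fields::Width) - 1))...};
  }
};

using HwregId = EncodingField<5, 0>;
using HwregOffset = EncodingField<10, 6>;
struct HwregSize : EncodingField<15, 11, 32> {
  using EncodingField::EncodingField;
  // Stored as width-minus-one, exposed as the real width.
  constexpr uint64_t encode() const { return Value - 1; }
  static ValueType decode(uint64_t Encoded) { return Encoded + 1; }
};
using HwregEncoding = EncodingFields<HwregId, HwregOffset, HwregSize>;

int main() {
  uint64_t Imm = HwregEncoding::encode(1, 4, 2); // hwreg(id=1, offset=4, width=2)
  auto [Id, Off, Width] = HwregEncoding::decode(Imm);
  std::printf("imm=0x%llx id=%u offset=%u width=%u\n",
              (unsigned long long)Imm, Id, Off, Width);
  return 0;
}

With these bit positions the example prints imm=0x901 id=1 offset=4 width=2, the same packing the old ID/OFFSET/WIDTH_M1 shift-and-mask enums produced.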
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 99f8e8e..f5424cf 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -380,9 +380,9 @@ class VOP_MOVREL<RegisterOperand Src1RC> : VOPProfile<[untyped, i32, untyped, un
let OutsDPP = (outs Src0RC32:$vdst);
let InsDPP16 = (ins Src0RC32:$old, Src0RC32:$src0,
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
- DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi);
+ DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
let AsmDPP16 = getAsmDPP16<1, 1, 0>.ret;
- let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, DppFI:$fi);
+ let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, Dpp8FI:$fi);
let AsmDPP8 = getAsmDPP8<1, 1, 0>.ret;
let OutsVOP3DPP = (outs Src0RC64:$vdst);
@@ -749,7 +749,7 @@ class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, int subtarget, VOPProfile p = p
class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> :
VOP1_DPP16 <op, ps, Gen.Subtarget, p> {
let AssemblerPredicate = Gen.AssemblerPredicate;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
}
class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
@@ -770,7 +770,7 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> :
VOP1_DPP8<op, ps, p> {
let AssemblerPredicate = Gen.AssemblerPredicate;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
}
//===----------------------------------------------------------------------===//
@@ -816,7 +816,7 @@ multiclass VOP1_Real_dpp_with_name<GFXGen Gen, bits<9> op, string opName,
string asmName> {
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP16,
- DecoderNamespace = "DPP" # Gen.DecoderNamespace #
+ DecoderNamespace = Gen.DecoderNamespace #
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
defm NAME : VOP1_Real_dpp<Gen, op, opName>;
}
@@ -831,7 +831,7 @@ multiclass VOP1_Real_dpp8_with_name<GFXGen Gen, bits<9> op, string opName,
string asmName> {
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP8,
- DecoderNamespace = "DPP8" # Gen.DecoderNamespace #
+ DecoderNamespace = Gen.DecoderNamespace #
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
defm NAME : VOP1_Real_dpp8<Gen, op, opName>;
}
@@ -994,9 +994,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
multiclass VOP1_Real_dpp8_gfx10<bits<9> op> {
if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then
- def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
- let DecoderNamespace = "DPP8";
- }
+ def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
}
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
@@ -1192,16 +1190,14 @@ class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
let Inst{31-25} = 0x3f; //encoding
}
-multiclass VOP1Only_Real_vi <bits<10> op> {
- let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in {
+let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in {
+ multiclass VOP1Only_Real_vi <bits<10> op> {
def _vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
}
-}
-multiclass VOP1_Real_e32e64_vi <bits<10> op> {
- let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in {
+ multiclass VOP1_Real_e32e64_vi <bits<10> op> {
def _e32_vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
@@ -1389,44 +1385,41 @@ def : GCNPat <
// GFX9
//===----------------------------------------------------------------------===//
-multiclass VOP1_Real_gfx9 <bits<10> op> {
- let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
+let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
+ multiclass VOP1_Real_gfx9 <bits<10> op> {
defm NAME : VOP1_Real_e32e64_vi <op>;
- }
-
- if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
- def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
- VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
-
- if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
- def _dpp_gfx9 :
- VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
- VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
-
-}
-multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> {
- let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
- defm NAME : VOP1_Real_e32e64_vi <op>;
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
+ def _dpp_gfx9 :
+ VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+ VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
}
- if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
- def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
- VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
- let Inst{42-40} = 6;
- }
+ multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> {
+ defm NAME : VOP1_Real_e32e64_vi <op>;
- if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
- def _dpp_gfx9 :
- VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
- VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+ let Inst{42-40} = 6;
+ }
+
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
+ def _dpp_gfx9 :
+ VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+ VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+ }
}
defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
-let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in
+let AssemblerPredicate = isGFX940Plus in
defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>;
let OtherPredicates = [HasFP8ConversionInsts] in {
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 4437d5f..13fe79b 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -430,7 +430,7 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
- let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+ let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
let InsVOP3Base = getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP, RegisterOperand<VGPR_32>, 3,
0, HasModifiers, HasModifiers, HasOMod,
Src0ModVOP3DPP, Src1ModVOP3DPP, Src2Mod, HasOpSel>.ret;
@@ -447,7 +447,7 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
- dpp8:$dpp8, DppFI:$fi);
+ dpp8:$dpp8, Dpp8FI:$fi);
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
@@ -500,7 +500,7 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> {
let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument
- dpp8:$dpp8, DppFI:$fi);
+ dpp8:$dpp8, Dpp8FI:$fi);
let Src2Mod = FP32InputMods; // dummy unused modifiers
let Src2RC64 = VGPRSrc_32; // stub argument
}
@@ -552,11 +552,11 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], /*EnableClamp=*/
Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
- let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+ let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
let InsDPP8 = (ins DstRCDPP:$old,
Src0DPP:$src0,
Src1DPP:$src1,
- dpp8:$dpp8, DppFI:$fi);
+ dpp8:$dpp8, Dpp8FI:$fi);
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
let OutsVOP3DPP = Outs64;
@@ -594,11 +594,11 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableClamp=*/1>
Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
- let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+ let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
let InsDPP8 = (ins DstRCDPP:$old,
Src0DPP:$src0,
Src1DPP:$src1,
- dpp8:$dpp8, DppFI:$fi);
+ dpp8:$dpp8, Dpp8FI:$fi);
let HasExt = 1;
let HasExtDPP = 1;
@@ -645,11 +645,11 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
FPVRegInputMods:$src1_modifiers, Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
- let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+ let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
let InsDPP8 = (ins DstRCDPP:$old,
FPVRegInputMods:$src0_modifiers, Src0DPP:$src0,
FPVRegInputMods:$src1_modifiers, Src1DPP:$src1,
- dpp8:$dpp8, DppFI:$fi);
+ dpp8:$dpp8, Dpp8FI:$fi);
let Src0ModVOP3DPP = FPVRegInputMods;
let Src1ModVOP3DPP = FPVRegInputMods;
@@ -1273,7 +1273,7 @@ class VOP2_DPP16_Gen<bits<6> op, VOP2_DPP_Pseudo ps, GFXGen Gen,
VOP2_DPP16<op, ps, Gen.Subtarget, opName, p> {
let AssemblerPredicate = Gen.AssemblerPredicate;
let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []);
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace#
+ let DecoderNamespace = Gen.DecoderNamespace#
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
}
@@ -1302,7 +1302,7 @@ class VOP2_DPP8_Gen<bits<6> op, VOP2_Pseudo ps, GFXGen Gen,
VOP2_DPP8<op, ps, p> {
let AssemblerPredicate = Gen.AssemblerPredicate;
let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []);
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace#
+ let DecoderNamespace = Gen.DecoderNamespace#
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
}
@@ -1748,9 +1748,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
multiclass VOP2_Real_dpp8_gfx10<bits<6> op> {
if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then
- def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
- let DecoderNamespace = "DPP8";
- }
+ def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
}
//===------------------------- VOP2 (with name) -------------------------===//
@@ -1797,7 +1795,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP8;
- let DecoderNamespace = "DPP8";
}
}
@@ -1876,7 +1873,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
- let DecoderNamespace = "DPP8";
}
if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then
def _dpp8_w32_gfx10 :
@@ -2231,7 +2227,7 @@ multiclass VOP2_SDWA9_Real <bits<6> op> {
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
-let AssemblerPredicate = isGFX8Only in {
+let AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" in {
multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName> {
def _e32_vi :
@@ -2239,14 +2235,12 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "GFX8";
}
def _e64_vi :
VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.VI>,
VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "GFX8";
}
if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA then
def _sdwa_vi :
@@ -2263,9 +2257,10 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
let AsmString = AsmName # ps.AsmOperands;
}
}
-}
-let AssemblerPredicate = isGFX9Only in {
+} // End AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8"
+
+let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
def _e32_gfx9 :
@@ -2273,14 +2268,12 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "GFX9";
}
def _e64_gfx9 :
VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.GFX9>,
VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "GFX9";
}
if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx9 :
@@ -2295,21 +2288,16 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "GFX9";
}
}
multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
def _e32_gfx9 :
VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX9>,
- VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>{
- let DecoderNamespace = "GFX9";
- }
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
def _e64_gfx9 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>,
- VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
- let DecoderNamespace = "GFX9";
- }
+ VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
@@ -2318,12 +2306,10 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
- VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
- let DecoderNamespace = "GFX9";
- }
+ VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
-} // AssemblerPredicate = isGFX9Only
+} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 396ae9c..7198a40 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -532,11 +532,11 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
VGPR_32:$vdst_in, op_sel0:$op_sel,
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
- DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi);
+ DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
let InsVOP3DPP8 = (ins VGPR_32:$old,
FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
- VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi);
+ VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi);
let HasClamp = 0;
let HasExtVOP3DPP = 1;
@@ -553,12 +553,12 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
FP32InputMods:$src2_modifiers, VGPR_32:$src2,
op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
- DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi);
+ DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
let InsVOP3DPP8 = (ins VGPR_32:$old,
FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
FP32InputMods:$src2_modifiers, VGPR_32:$src2,
- op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi);
+ op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi);
let HasClamp = 0;
let HasSrc2 = 0;
let HasSrc2Mods = 1;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 74f451b..ac3c8f9 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -461,13 +461,13 @@ def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>,
let InsVOP3DPP8 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
- neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, DppFI:$fi);
+ neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, Dpp8FI:$fi);
let InsVOP3DPP16 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp_ctrl:$dpp_ctrl,
DppRowMask:$row_mask, DppBankMask:$bank_mask,
- DppBoundCtrl:$bound_ctrl, DppFI:$fi);
+ DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi);
}
multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> {
@@ -1353,6 +1353,7 @@ class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget,
let AssemblerPredicate = HasDPP16;
let SubtargetPredicate = HasDPP16;
let OtherPredicates = ps.OtherPredicates;
+ let IsPacked = ps.IsPacked;
}
class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName>
@@ -1362,6 +1363,7 @@ class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName>
let SchedRW = ps.SchedRW;
let Uses = ps.Uses;
let OtherPredicates = ps.OtherPredicates;
+ let IsPacked = ps.IsPacked;
}
//===----------------------------------------------------------------------===//
@@ -1486,7 +1488,7 @@ multiclass VOP3P_Real_dpp<GFXGen Gen, bits<7> op, string backing_ps_name = NAME,
: VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"),
Gen.Subtarget> {
let AsmString = asmName #ps.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
@@ -1496,7 +1498,7 @@ multiclass VOP3P_Real_dpp8<GFXGen Gen, bits<7> op, string backing_ps_name = NAME
defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
def _dpp8#Gen.Suffix : VOP3P_DPP8_Base<op, ps> {
let AsmString = asmName #ps.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
@@ -1613,7 +1615,7 @@ multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string
multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"),
VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> {
- let SubtargetPredicate = isGFX940Plus,
+ let AssemblerPredicate = isGFX940Plus,
DecoderNamespace = "GFX940",
AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in {
def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>,
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index fe52a0e..e5e8244 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -222,6 +222,8 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst,
let AsmVariantName = AMDGPUAsmVariants.Default;
let SubtargetPredicate = AssemblerPredicate;
+
+ string DecoderNamespace; // dummy
}
multiclass VOPCInstAliases <string old_name, string Arch, string real_name = old_name, string mnemonic_from = real_name> {
@@ -766,7 +768,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType
let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
let AsmDPP16 = AsmDPP#"$fi";
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
- let InsDPP16 = !con(InsDPP, (ins DppFI:$fi));
+ let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
// DPP8 forbids modifiers and can inherit from VOPC_Profile
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
@@ -1331,196 +1333,176 @@ class VOPC64_DPP8_NoDst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
//===----------------------------------------------------------------------===//
multiclass VOPC_Real_Base<GFXGen Gen, bits<9> op> {
- let AssemblerPredicate = Gen.AssemblerPredicate in {
+ let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in {
defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_e32");
defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_e64");
- let DecoderNamespace = Gen.DecoderNamespace in {
- def _e32#Gen.Suffix : VOPC_Real<ps32, Gen.Subtarget>,
- VOPCe<op{7-0}>;
- def _e64#Gen.Suffix : VOP3_Real<ps64, Gen.Subtarget>,
- VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
- // Encoding used for VOPC instructions encoded as VOP3 differs from
- // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
- bits<8> sdst;
- let Inst{7-0} = sdst;
- }
- } // End DecoderNamespace = Gen.DecoderNamespace
+ def _e32#Gen.Suffix : VOPC_Real<ps32, Gen.Subtarget>,
+ VOPCe<op{7-0}>;
+ def _e64#Gen.Suffix : VOP3_Real<ps64, Gen.Subtarget>,
+ VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
defm : VOPCInstAliases<NAME, !substr(Gen.Suffix,1)>;
if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e32" #"_dpp");
defvar AsmDPP = ps32.Pfl.AsmDPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget>;
- def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
- let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
- let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget>;
+ def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
+ let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
+ let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32>;
- def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
- let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
- let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32>;
+ def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
+ let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
+ let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
}
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>,
- SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
- def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
- let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
- let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>,
+ SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
+ def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
+ let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
+ let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>;
- def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
- let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
- let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>;
+ def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
+ let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
+ let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
}
- } // AssemblerPredicate = Gen.AssemblerPredicate
+ } // AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
}
multiclass VOPC_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
string asm_name, string pseudo_mnemonic = ""> {
- let AssemblerPredicate = Gen.AssemblerPredicate in {
+ let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in {
defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_e32");
defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_e64");
- let DecoderNamespace = Gen.DecoderNamespace in {
- def _e32#Gen.Suffix :
- // 32 and 64 bit forms of the instruction have _e32 and _e64
- // respectively appended to their assembly mnemonic.
- // _e64 is printed as part of the VOPDstS64orS32 operand, whereas
- // the destination-less 32bit forms add it to the asmString here.
- VOPC_Real<ps32, Gen.Subtarget, asm_name#"_e32">,
- VOPCe<op{7-0}>,
- MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic,
- pseudo_mnemonic),
- asm_name, ps32.AsmVariantName>,
- Requires<[Gen.AssemblerPredicate]>;
- def _e64#Gen.Suffix :
- VOP3_Real<ps64, Gen.Subtarget, asm_name>,
- VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>,
- MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic,
- pseudo_mnemonic),
- asm_name, ps64.AsmVariantName>,
- Requires<[Gen.AssemblerPredicate]> {
- // Encoding used for VOPC instructions encoded as VOP3 differs from
- // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
- bits<8> sdst;
- let Inst{7-0} = sdst;
- }
- } // End DecoderNamespace = Gen.DecoderNamespace
+ def _e32#Gen.Suffix :
+ // 32 and 64 bit forms of the instruction have _e32 and _e64
+ // respectively appended to their assembly mnemonic.
+ // _e64 is printed as part of the VOPDstS64orS32 operand, whereas
+ // the destination-less 32bit forms add it to the asmString here.
+ VOPC_Real<ps32, Gen.Subtarget, asm_name#"_e32">,
+ VOPCe<op{7-0}>,
+ MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic,
+ pseudo_mnemonic),
+ asm_name, ps32.AsmVariantName>,
+ Requires<[Gen.AssemblerPredicate]>;
+ def _e64#Gen.Suffix :
+ VOP3_Real<ps64, Gen.Subtarget, asm_name>,
+ VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>,
+ MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic,
+ pseudo_mnemonic),
+ asm_name, ps64.AsmVariantName>,
+ Requires<[Gen.AssemblerPredicate]> {
+ // Encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
defm : VOPCInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>;
if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e32" #"_dpp");
defvar AsmDPP = ps32.Pfl.AsmDPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
- Gen.Subtarget, asm_name>;
- def _e32_dpp_w32#Gen.Suffix
- : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
- let AsmString = asm_name # " vcc_lo, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e32_dpp_w64#Gen.Suffix
- : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
- let AsmString = asm_name # " vcc, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
+ Gen.Subtarget, asm_name>;
+ def _e32_dpp_w32#Gen.Suffix
+ : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
+ let AsmString = asm_name # " vcc_lo, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e32_dpp_w64#Gen.Suffix
+ : VOPC_DPP16<op{7-0}, psDPP, asm_name> {
+ let AsmString = asm_name # " vcc, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
- def _e32_dpp8_w32#Gen.Suffix
- : VOPC_DPP8<op{7-0}, ps32, asm_name> {
- let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e32_dpp8_w64#Gen.Suffix
- : VOPC_DPP8<op{7-0}, ps32, asm_name> {
- let AsmString = asm_name # " vcc, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
+ def _e32_dpp8_w32#Gen.Suffix
+ : VOPC_DPP8<op{7-0}, ps32, asm_name> {
+ let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e32_dpp8_w64#Gen.Suffix
+ : VOPC_DPP8<op{7-0}, ps32, asm_name> {
+ let AsmString = asm_name # " vcc, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
}
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>,
- SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
- def _e64_dpp_w32#Gen.Suffix
- : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
- let AsmString = asm_name # " vcc_lo, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp_w64#Gen.Suffix
- : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
- let AsmString = asm_name # " vcc, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>,
+ SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
+ def _e64_dpp_w32#Gen.Suffix
+ : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
+ let AsmString = asm_name # " vcc_lo, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e64_dpp_w64#Gen.Suffix
+ : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
+ let AsmString = asm_name # " vcc, " # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>;
- def _e64_dpp8_w32#Gen.Suffix
- : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
- let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp8_w64#Gen.Suffix
- : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
- let AsmString = asm_name # " vcc, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
+ def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>;
+ def _e64_dpp8_w32#Gen.Suffix
+ : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
+ let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ def _e64_dpp8_w64#Gen.Suffix
+ : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
+ let AsmString = asm_name # " vcc, " # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
}
}
- } // AssemblerPredicate = Gen.AssemblerPredicate
+ } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
}
multiclass VOPC_Real_t16<GFXGen Gen, bits<9> op, string asm_name,
@@ -1528,123 +1510,103 @@ multiclass VOPC_Real_t16<GFXGen Gen, bits<9> op, string asm_name,
VOPC_Real_with_name<Gen, op, OpName, asm_name, pseudo_mnemonic>;
multiclass VOPCX_Real<GFXGen Gen, bits<9> op> {
- let AssemblerPredicate = Gen.AssemblerPredicate in {
+ let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in {
defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_nosdst_e32");
defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_nosdst_e64");
- let DecoderNamespace = Gen.DecoderNamespace in {
- def _e32#Gen.Suffix :
- VOPC_Real<ps32, Gen.Subtarget>,
- VOPCe<op{7-0}> {
- let AsmString = !subst("_nosdst", "", ps32.PseudoInstr)
- # " " # ps32.AsmOperands;
- }
- def _e64#Gen.Suffix :
- VOP3_Real<ps64, Gen.Subtarget>,
- VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
- let Inst{7-0} = ?; // sdst
- let AsmString = !subst("_nosdst", "", ps64.Mnemonic)
- # "{_e64} " # ps64.AsmOperands;
- }
- } // End DecoderNamespace = Gen.DecoderNamespace
+ def _e32#Gen.Suffix :
+ VOPC_Real<ps32, Gen.Subtarget>,
+ VOPCe<op{7-0}> {
+ let AsmString = !subst("_nosdst", "", ps32.PseudoInstr)
+ # " " # ps32.AsmOperands;
+ }
+ def _e64#Gen.Suffix :
+ VOP3_Real<ps64, Gen.Subtarget>,
+ VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
+ let Inst{7-0} = ?; // sdst
+ let AsmString = !subst("_nosdst", "", ps64.Mnemonic)
+ # "{_e64} " # ps64.AsmOperands;
+ }
defm : VOPCXInstAliases<NAME, !substr(Gen.Suffix, 1)>;
if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e32" #"_dpp");
defvar AsmDPP = ps32.Pfl.AsmDPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e32_dpp#Gen.Suffix
- : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget> {
- let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP;
- }
+ def _e32_dpp#Gen.Suffix
+ : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget> {
+ let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP;
}
defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
- let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8;
- }
+ def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
+ let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8;
}
}
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e64" #"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e64_dpp#Gen.Suffix
- : VOPC64_DPP16_NoDst<{0, op}, psDPP>,
- SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
- let AsmString = !subst("_nosdst", "", psDPP.OpName)
- # "{_e64_dpp} " # AsmDPP;
- }
+ def _e64_dpp#Gen.Suffix
+ : VOPC64_DPP16_NoDst<{0, op}, psDPP>,
+ SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
+ let AsmString = !subst("_nosdst", "", psDPP.OpName)
+ # "{_e64_dpp} " # AsmDPP;
}
defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> {
- let AsmString = !subst("_nosdst", "", ps64.OpName)
- # "{_e64_dpp} " # AsmDPP8;
- }
+ def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> {
+ let AsmString = !subst("_nosdst", "", ps64.OpName)
+ # "{_e64_dpp} " # AsmDPP8;
}
}
- } // AssemblerPredicate = Gen.AssemblerPredicate
+ } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
}
multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
string asm_name, string pseudo_mnemonic = ""> {
- let AssemblerPredicate = Gen.AssemblerPredicate in {
+ let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in {
defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_nosdst_e32");
defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_nosdst_e64");
- let DecoderNamespace = Gen.DecoderNamespace in {
- def _e32#Gen.Suffix
- : VOPC_Real<ps32, Gen.Subtarget, asm_name>,
- MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic),
- pseudo_mnemonic),
- asm_name, ps32.AsmVariantName>,
- Requires<[Gen.AssemblerPredicate]>,
- VOPCe<op{7-0}> {
- let AsmString = asm_name # "{_e32} " # ps32.AsmOperands;
- }
- def _e64#Gen.Suffix
- : VOP3_Real<ps64, Gen.Subtarget, asm_name>,
- MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic),
- pseudo_mnemonic),
- asm_name, ps64.AsmVariantName>,
- Requires<[Gen.AssemblerPredicate]>,
- VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
- let Inst{7-0} = ? ; // sdst
- let AsmString = asm_name # "{_e64} " # ps64.AsmOperands;
- }
- } // End DecoderNamespace = Gen.DecoderNamespace
+ def _e32#Gen.Suffix
+ : VOPC_Real<ps32, Gen.Subtarget, asm_name>,
+ MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic),
+ pseudo_mnemonic),
+ asm_name, ps32.AsmVariantName>,
+ Requires<[Gen.AssemblerPredicate]>,
+ VOPCe<op{7-0}> {
+ let AsmString = asm_name # "{_e32} " # ps32.AsmOperands;
+ }
+ def _e64#Gen.Suffix
+ : VOP3_Real<ps64, Gen.Subtarget, asm_name>,
+ MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic),
+ pseudo_mnemonic),
+ asm_name, ps64.AsmVariantName>,
+ Requires<[Gen.AssemblerPredicate]>,
+ VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
+ let Inst{7-0} = ? ; // sdst
+ let AsmString = asm_name # "{_e64} " # ps64.AsmOperands;
+ }
defm : VOPCXInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>;
if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e32"#"_dpp");
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
- Gen.Subtarget, asm_name>;
- }
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
- }
+ def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
+ Gen.Subtarget, asm_name>;
+ def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
}
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e64"#"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
- def _e64_dpp#Gen.Suffix
- : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>,
- SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
- let AsmString = asm_name # "{_e64_dpp} " # AsmDPP;
- }
+ def _e64_dpp#Gen.Suffix
+ : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>,
+ SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
+ let AsmString = asm_name # "{_e64_dpp} " # AsmDPP;
}
defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
- def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> {
- let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8;
- }
+ def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> {
+ let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8;
}
}
- } // AssemblerPredicate = Gen.AssemblerPredicate
+ } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
}
multiclass VOPCX_Real_t16<GFXGen Gen, bits<9> op, string asm_name,
@@ -1873,21 +1835,19 @@ defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11_gfx12<0x0ff>;
// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Only in {
+let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
multiclass VOPC_Real_gfx10<bits<9> op> {
- let DecoderNamespace = "GFX10" in {
- def _e32_gfx10 :
- VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>,
- VOPCe<op{7-0}>;
- def _e64_gfx10 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
- VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
- // Encoding used for VOPC instructions encoded as VOP3 differs from
- // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
- bits<8> sdst;
- let Inst{7-0} = sdst;
- }
- } // End DecoderNamespace = "GFX10"
+ def _e32_gfx10 :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>,
+ VOPCe<op{7-0}>;
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx10 :
@@ -1898,22 +1858,20 @@ let AssemblerPredicate = isGFX10Only in {
}
multiclass VOPCX_Real_gfx10<bits<9> op> {
- let DecoderNamespace = "GFX10" in {
- def _e32_gfx10 :
- VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>,
- VOPCe<op{7-0}> {
- let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr)
- # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands;
- }
-
- def _e64_gfx10 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>,
- VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> {
- let Inst{7-0} = ?; // sdst
- let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
- # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
- }
- } // End DecoderNamespace = "GFX10"
+ def _e32_gfx10 :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>,
+ VOPCe<op{7-0}> {
+ let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr)
+ # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands;
+ }
+
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>,
+ VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> {
+ let Inst{7-0} = ?; // sdst
+ let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
+ # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
+ }
if !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx10 :
@@ -1925,7 +1883,7 @@ let AssemblerPredicate = isGFX10Only in {
defm : VOPCXInstAliases<NAME, "gfx10">;
}
-} // End AssemblerPredicate = isGFX10Only
+} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>;
defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>;
@@ -1990,25 +1948,23 @@ defm V_CMPX_TRU_F16 : VOPCX_Real_gfx10<0x0ff>;
// GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX6GFX7 in {
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
multiclass VOPC_Real_gfx6_gfx7<bits<9> op> {
- let DecoderNamespace = "GFX6GFX7" in {
- def _e32_gfx6_gfx7 :
- VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
- VOPCe<op{7-0}>;
- def _e64_gfx6_gfx7 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
- // Encoding used for VOPC instructions encoded as VOP3 differs from
- // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
- bits<8> sdst;
- let Inst{7-0} = sdst;
- }
- } // End DecoderNamespace = "GFX6GFX7"
+ def _e32_gfx6_gfx7 :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOPCe<op{7-0}>;
+ def _e64_gfx6_gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
defm : VOPCInstAliases<NAME, "gfx6_gfx7">;
}
-} // End AssemblerPredicate = isGFX6GFX7
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
multiclass VOPC_Real_gfx6_gfx7_gfx10<bits<9> op> :
VOPC_Real_gfx6_gfx7<op>, VOPC_Real_gfx10<op>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 801afab..80d7d96 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -818,6 +818,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
let VALU = 1;
let DPP = 1;
let Size = 8;
+ let IsPacked = P.IsPacked;
let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP);
@@ -835,7 +836,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
- let DecoderNamespace = "DPP";
+ let DecoderNamespace = "GFX8";
VOPProfile Pfl = P;
}
@@ -906,7 +907,7 @@ class VOP_DPP_Base <string OpName, VOPProfile P,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
- let DecoderNamespace = "DPP";
+ let DecoderNamespace = "GFX8";
}
class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
@@ -1350,7 +1351,7 @@ class VOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen,
VOP3_DPP16 <op, ps, Gen.Subtarget, opName> {
let AssemblerPredicate = Gen.AssemblerPredicate;
let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate);
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace#
+ let DecoderNamespace = Gen.DecoderNamespace#
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
}
@@ -1463,7 +1464,7 @@ multiclass VOP3_Real_dpp_with_name<GFXGen Gen, bits<10> op, string opName,
multiclass VOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> {
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
@@ -1473,7 +1474,7 @@ multiclass VOP3Dot_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME>
def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> {
let Inst{11} = ?;
let Inst{12} = ?;
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
@@ -1482,7 +1483,7 @@ multiclass VOP3_Real_dpp8_with_name<GFXGen Gen, bits<10> op, string opName,
string asmName> {
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
let AsmString = asmName # ps.Pfl.AsmVOP3DPP8,
- DecoderNamespace = "DPP8"#Gen.DecoderNamespace#
+ DecoderNamespace = Gen.DecoderNamespace#
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16"),
True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts,
NoTrue16Predicate) in {
@@ -1505,7 +1506,7 @@ multiclass VOP3be_Real_dpp<GFXGen Gen, bits<10> op, string opName,
defvar dpp_ps = !cast<VOP_DPP_Pseudo>(opName #"_e64" #"_dpp");
def _e64_dpp#Gen.Suffix : Base_VOP3b_DPP16<op, dpp_ps, asmName>,
SIMCInstr<dpp_ps.PseudoInstr, Gen.Subtarget> {
- let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
@@ -1514,7 +1515,7 @@ multiclass VOP3be_Real_dpp8<GFXGen Gen, bits<10> op, string opName,
string asmName> {
defvar ps = !cast<VOP3_Pseudo>(opName #"_e64");
def _e64_dpp8#Gen.Suffix : VOP3b_DPP8_Base<op, ps, asmName> {
- let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let DecoderNamespace = Gen.DecoderNamespace;
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index c5199aab..00a29f8 100644
--- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -25,42 +25,6 @@
using namespace llvm;
using namespace LegalizeActions;
-/// FIXME: The following static functions are SizeChangeStrategy functions
-/// that are meant to temporarily mimic the behaviour of the old legalization
-/// based on doubling/halving non-legal types as closely as possible. This is
-/// not entirly possible as only legalizing the types that are exactly a power
-/// of 2 times the size of the legal types would require specifying all those
-/// sizes explicitly.
-/// In practice, not specifying those isn't a problem, and the below functions
-/// should disappear quickly as we add support for legalizing non-power-of-2
-/// sized types further.
-static void addAndInterleaveWithUnsupported(
- LegacyLegalizerInfo::SizeAndActionsVec &result,
- const LegacyLegalizerInfo::SizeAndActionsVec &v) {
- for (unsigned i = 0; i < v.size(); ++i) {
- result.push_back(v[i]);
- if (i + 1 < v[i].first && i + 1 < v.size() &&
- v[i + 1].first != v[i].first + 1)
- result.push_back({v[i].first + 1, LegacyLegalizeActions::Unsupported});
- }
-}
-
-static LegacyLegalizerInfo::SizeAndActionsVec
-widen_8_16(const LegacyLegalizerInfo::SizeAndActionsVec &v) {
- assert(v.size() >= 1);
- assert(v[0].first > 17);
- LegacyLegalizerInfo::SizeAndActionsVec result = {
- {1, LegacyLegalizeActions::Unsupported},
- {8, LegacyLegalizeActions::WidenScalar},
- {9, LegacyLegalizeActions::Unsupported},
- {16, LegacyLegalizeActions::WidenScalar},
- {17, LegacyLegalizeActions::Unsupported}};
- addAndInterleaveWithUnsupported(result, v);
- auto Largest = result.back().first;
- result.push_back({Largest + 1, LegacyLegalizeActions::Unsupported});
- return result;
-}
-
static bool AEABI(const ARMSubtarget &ST) {
return ST.isTargetAEABI() || ST.isTargetGNUAEABI() || ST.isTargetMuslAEABI();
}
@@ -118,15 +82,14 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
.libcallFor({s32})
.clampScalar(0, s32, s32);
- for (unsigned Op : {G_SREM, G_UREM}) {
- LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16);
- if (HasHWDivide)
- LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Lower);
- else if (AEABI(ST))
- LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Custom);
- else
- LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Libcall);
- }
+ auto &REMBuilder =
+ getActionDefinitionsBuilder({G_SREM, G_UREM}).minScalar(0, s32);
+ if (HasHWDivide)
+ REMBuilder.lowerFor({s32});
+ else if (AEABI(ST))
+ REMBuilder.customFor({s32});
+ else
+ REMBuilder.libcallFor({s32});
getActionDefinitionsBuilder(G_INTTOPTR)
.legalFor({{p0, s32}})
@@ -202,8 +165,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
LoadStoreBuilder.maxScalar(0, s32);
- for (auto Ty : {s32, s64})
- LegacyInfo.setAction({G_FNEG, Ty}, LegacyLegalizeActions::Lower);
+ getActionDefinitionsBuilder(G_FNEG).lowerFor({s32, s64});
getActionDefinitionsBuilder(G_FCONSTANT).customFor({s32, s64});
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 5215813..8a3454c 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -35,30 +35,18 @@ def BinaryUintCategory : DXILOpCategory<"Binary uint">;
def UnaryFloatCategory : DXILOpCategory<"Unary float">;
def ComputeIDCategory : DXILOpCategory<"Compute/Mesh/Amplification shader">;
-// Following are the scalar types supported by DXIL operations and are synonymous
-// to llvm_*_ty defined for readability and ease of use in the context of this file.
-
-def voidTy : LLVMType<isVoid>;
-
-// Floating point types
-def f16Ty : LLVMType<f16>;
-def f32Ty : LLVMType<f32>;
-def f64Ty : LLVMType<f64>;
-
-// Integer types
-def i1Ty : LLVMType<i1>;
-def i8Ty : LLVMType<i8>;
-def i16Ty : LLVMType<i16>;
-def i32Ty : LLVMType<i32>;
-def i64Ty : LLVMType<i64>;
+// Represented as a pointer type of any address space, with the option to change
+// to a qualified pointer type with a specific address space.
+def dxil_handle_ty : LLVMAnyPointerType;
+def dxil_cbuffer_ty : LLVMAnyPointerType;
+def dxil_resource_ty : LLVMAnyPointerType;
// The parameter description for a DXIL operation
-class DXILOpParameter<int pos, string type, string name, string doc,
+class DXILOpParameter<int pos, LLVMType type, string name, string doc,
bit isConstant = 0, string enumName = "",
int maxValue = 0> {
int Pos = pos; // Position in parameter list
- string Type = type; // LLVM type name, $o for overload, $r for resource
- // type, $cb for legacy cbuffer, $u4 for u4 struct
+ LLVMType ParamType = type; // Parameter type
string Name = name; // Short, unique parameter name
string Doc = doc; // Description of this parameter
bit IsConstant = isConstant; // Whether this parameter requires a constant value in the IR
@@ -108,55 +96,55 @@ class DXILOperation<string name, int opCode, DXILOpClass opClass, DXILOpCategory
class LLVMIntrinsic<Intrinsic llvm_intrinsic_> { Intrinsic llvm_intrinsic = llvm_intrinsic_; }
def Sin : DXILOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.",
- [f16Ty,f32Ty], ReadNone,
+ [llvm_half_ty, llvm_float_ty], ReadNone,
[
- DXILOpParameter<0, "$o", "", "operation result">,
- DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DXILOpParameter<2, "$o", "value", "input value">
+ DXILOpParameter<0, llvm_anyfloat_ty, "", "operation result">,
+ DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+ DXILOpParameter<2, llvm_anyfloat_ty, "value", "input value">
],
["floats"]>,
LLVMIntrinsic<int_sin>;
-def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b",
- [i16Ty,i32Ty,i64Ty], ReadNone,
+def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b",
+ [llvm_i16_ty, llvm_i32_ty, llvm_i64_ty], ReadNone,
[
- DXILOpParameter<0, "$o", "", "operation result">,
- DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DXILOpParameter<2, "$o", "a", "input value">,
- DXILOpParameter<3, "$o", "b", "input value">
+ DXILOpParameter<0, llvm_anyint_ty, "", "operation result">,
+ DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+ DXILOpParameter<2, llvm_anyint_ty, "a", "input value">,
+ DXILOpParameter<3, llvm_anyint_ty, "b", "input value">
],
["uints"]>,
LLVMIntrinsic<int_umax>;
-def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [i32Ty], ReadNone,
+def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [llvm_i32_ty], ReadNone,
[
- DXILOpParameter<0, "i32", "", "thread ID component">,
- DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DXILOpParameter<2, "i32", "component", "component to read (x,y,z)">
+ DXILOpParameter<0, llvm_i32_ty, "", "thread ID component">,
+ DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+ DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)">
]>,
LLVMIntrinsic<int_dx_thread_id>;
-def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [i32Ty], ReadNone,
+def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [llvm_i32_ty], ReadNone,
[
- DXILOpParameter<0, "i32", "", "group ID component">,
- DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DXILOpParameter<2, "i32", "component", "component to read">
+ DXILOpParameter<0, llvm_i32_ty, "", "group ID component">,
+ DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+ DXILOpParameter<2, llvm_i32_ty, "component", "component to read">
]>,
LLVMIntrinsic<int_dx_group_id>;
-def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory,
- "reads the thread ID within the group (SV_GroupThreadID)", [i32Ty], ReadNone,
+def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory,
+ "reads the thread ID within the group (SV_GroupThreadID)", [llvm_i32_ty], ReadNone,
[
- DXILOpParameter<0, "i32", "", "thread ID in group component">,
- DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DXILOpParameter<2, "i32", "component", "component to read (x,y,z)">
+ DXILOpParameter<0, llvm_i32_ty, "", "thread ID in group component">,
+ DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
+ DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)">
]>,
LLVMIntrinsic<int_dx_thread_id_in_group>;
-def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory,
- "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [i32Ty], ReadNone,
+def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory,
+ "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [llvm_i32_ty], ReadNone,
[
- DXILOpParameter<0, "i32", "", "result">,
- DXILOpParameter<1, "i32", "opcode", "DXIL opcode">
+ DXILOpParameter<0, llvm_i32_ty, "", "result">,
+ DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">
]>,
LLVMIntrinsic<int_dx_flattened_thread_id_in_group>;
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index 76f99b4..2870f0b 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -33,6 +33,7 @@ add_llvm_target(HexagonCodeGen
HexagonFrameLowering.cpp
HexagonGenExtract.cpp
HexagonGenInsert.cpp
+ HexagonGenMemAbsolute.cpp
HexagonGenMux.cpp
HexagonGenPredicate.cpp
HexagonHardwareLoops.cpp
@@ -50,6 +51,7 @@ add_llvm_target(HexagonCodeGen
HexagonOptAddrMode.cpp
HexagonOptimizeSZextends.cpp
HexagonPeephole.cpp
+ HexagonPostIncOpt.cpp
HexagonRDFOpt.cpp
HexagonRegisterInfo.cpp
HexagonSelectionDAGInfo.cpp
@@ -60,6 +62,7 @@ add_llvm_target(HexagonCodeGen
HexagonTargetMachine.cpp
HexagonTargetObjectFile.cpp
HexagonTargetTransformInfo.cpp
+ HexagonTfrCleanup.cpp
HexagonVectorCombine.cpp
HexagonVectorLoopCarriedReuse.cpp
HexagonVectorPrint.cpp
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 6024d9f..3b8234c 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -1957,7 +1957,8 @@ bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) {
return false;
const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg);
RegHalf H;
- if (!matchHalf(0, RC, 0, H))
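+  // When the source operand is the high subregister of a 64-bit pair, start
+  // the half match at bit 32 of the register cell; otherwise start at bit 0.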
+ unsigned B = (RS.Sub == Hexagon::isub_hi) ? 32 : 0;
+ if (!matchHalf(0, RC, B, H))
return false;
if (H.Low)
return false;
diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
new file mode 100644
index 0000000..afd4963
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
@@ -0,0 +1,274 @@
+//===--- HexagonGenMemAbsolute.cpp - Generate Load/Store Set Absolute ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This pass traverses all the basic blocks in a function and converts an
+// indexed load/store with offset "0" to an absolute-set load/store
+// instruction, as long as the use of the register in the new instruction
+// dominates the rest of the uses and there are more than 2 uses.
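+//
+// A rough sketch of the rewrite (registers and addresses are illustrative):
+//   r1 = ##some_addr              ; CONST32 / A2_tfrsi
+//   r2 = memw(r1 + #0)            ; indexed load with offset 0
+//   ...                           ; further uses of r1
+// becomes
+//   r2 = memw(r1 = ##some_addr)   ; absolute-set load, which also defines r1
+//   ...                           ; further uses of r1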
+
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "hexagon-abs"
+
+using namespace llvm;
+
+STATISTIC(HexagonNumLoadAbsConversions,
+ "Number of Load instructions converted to absolute-set form");
+STATISTIC(HexagonNumStoreAbsConversions,
+ "Number of Store instructions converted to absolute-set form");
+
+namespace llvm {
+FunctionPass *createHexagonGenMemAbsolute();
+void initializeHexagonGenMemAbsolutePass(PassRegistry &Registry);
+} // namespace llvm
+
+namespace {
+
+class HexagonGenMemAbsolute : public MachineFunctionPass {
+ const HexagonInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+ const TargetRegisterInfo *TRI;
+
+public:
+ static char ID;
+ HexagonGenMemAbsolute() : MachineFunctionPass(ID), TII(0), MRI(0), TRI(0) {
+ initializeHexagonGenMemAbsolutePass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Hexagon Generate Load/Store Set Absolute Address Instruction";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+private:
+ static bool isValidIndexedLoad(int &Opcode, int &NewOpcode);
+ static bool isValidIndexedStore(int &Opcode, int &NewOpcode);
+};
+} // namespace
+
+char HexagonGenMemAbsolute::ID = 0;
+
+INITIALIZE_PASS(HexagonGenMemAbsolute, "hexagon-gen-load-absolute",
+ "Hexagon Generate Load/Store Set Absolute Address Instruction",
+ false, false)
+
+bool HexagonGenMemAbsolute::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(Fn.getFunction()))
+ return false;
+
+ TII = Fn.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ MRI = &Fn.getRegInfo();
+ TRI = Fn.getRegInfo().getTargetRegisterInfo();
+
+ MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+
+ // Loop over all of the basic blocks
+ for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+ MBBb != MBBe; ++MBBb) {
+ MachineBasicBlock *MBB = &*MBBb;
+ // Traverse the basic block
+ for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
+ ++MII) {
+ MachineInstr *MI = &*MII;
+ int Opc = MI->getOpcode();
+ if (Opc != Hexagon::CONST32 && Opc != Hexagon::A2_tfrsi)
+ continue;
+
+ const MachineOperand &MO = MI->getOperand(0);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+
+ unsigned DstReg = MO.getReg();
+ if (MRI->use_nodbg_empty(DstReg))
+ continue;
+
+ typedef MachineRegisterInfo::use_nodbg_iterator use_iterator;
+ use_iterator NextUseMI = MRI->use_nodbg_begin(DstReg);
+
+ MachineInstr *NextMI = NextUseMI->getParent();
+ int NextOpc = NextMI->getOpcode();
+ int NewOpc;
+ bool IsLoad = isValidIndexedLoad(NextOpc, NewOpc);
+
+ if (!IsLoad && !isValidIndexedStore(NextOpc, NewOpc))
+ continue;
+
+ // Base and Offset positions for load and store instructions
+ // Load R(dest), R(base), Imm -> R(dest) = mem(R(base) + Imm)
+      // Store R(base), Imm, R(src) -> mem(R(base) + Imm) = R(src)
+ unsigned BaseRegPos, ImmPos, RegPos;
+ if (!TII->getBaseAndOffsetPosition(*NextMI, BaseRegPos, ImmPos))
+ continue;
+ RegPos = IsLoad ? 0 : 2;
+
+ bool IsGlobal = MI->getOperand(1).isGlobal();
+ if (!MI->getOperand(1).isImm() && !IsGlobal)
+ continue;
+
+ const MachineOperand *BaseOp = nullptr;
+ int64_t Offset;
+ bool Scalable;
+ TII->getMemOperandWithOffset(*NextMI, BaseOp, Offset, Scalable, TRI);
+
+ // Ensure BaseOp is non-null and register type.
+ if (!BaseOp || !BaseOp->isReg())
+ continue;
+
+ if (Scalable)
+ continue;
+
+ unsigned BaseReg = BaseOp->getReg();
+ if ((DstReg != BaseReg) || (Offset != 0))
+ continue;
+
+ const MachineOperand &MO0 = NextMI->getOperand(RegPos);
+
+ if (!MO0.isReg())
+ continue;
+
+ unsigned LoadStoreReg = MO0.getReg();
+
+      // Store: Bail out if the src and base are the same (def and use on the
+      // same register).
+ if (LoadStoreReg == BaseReg)
+ continue;
+
+ // Insert the absolute-set instruction "I" only if the use of the
+ // BaseReg in "I" dominates the rest of the uses of BaseReg and if
+ // there are more than 2 uses of this BaseReg.
+ bool Dominates = true;
+ unsigned Counter = 0;
+ for (use_iterator I = NextUseMI, E = MRI->use_nodbg_end(); I != E; ++I) {
+ Counter++;
+ if (!MDT.dominates(NextMI, I->getParent()))
+ Dominates = false;
+ }
+
+ if ((!Dominates) || (Counter < 3))
+ continue;
+
+      // If we reach here, we have met all the conditions required for
+      // replacement with the absolute-set instruction.
+ LLVM_DEBUG({
+ dbgs() << "Found a pair of instructions for absolute-set "
+ << (IsLoad ? "load" : "store") << "\n";
+ dbgs() << *MI;
+ dbgs() << *NextMI;
+ });
+ MachineBasicBlock *ParentBlock = NextMI->getParent();
+ MachineInstrBuilder MIB;
+ if (IsLoad) { // Insert absolute-set load instruction
+ ++HexagonNumLoadAbsConversions;
+ MIB = BuildMI(*ParentBlock, NextMI, NextMI->getDebugLoc(),
+ TII->get(NewOpc), LoadStoreReg)
+ .addReg(DstReg, RegState::Define);
+ } else { // Insert absolute-set store instruction
+ ++HexagonNumStoreAbsConversions;
+ MIB = BuildMI(*ParentBlock, NextMI, NextMI->getDebugLoc(),
+ TII->get(NewOpc), DstReg);
+ }
+
+ MachineOperand ImmOperand = MI->getOperand(1);
+ if (IsGlobal)
+ MIB.addGlobalAddress(ImmOperand.getGlobal(), ImmOperand.getOffset(),
+ ImmOperand.getTargetFlags());
+ else
+ MIB.addImm(ImmOperand.getImm());
+
+ if (IsLoad)
+ MIB->getOperand(0).setSubReg(MO0.getSubReg());
+ else
+ MIB.addReg(LoadStoreReg, 0, MO0.getSubReg());
+
+ LLVM_DEBUG(dbgs() << "Replaced with " << *MIB << "\n");
+ // Erase the instructions that got replaced.
+ MII = MBB->erase(MI);
+ --MII;
+ NextMI->getParent()->erase(NextMI);
+ }
+ }
+
+ return true;
+}
+
+bool HexagonGenMemAbsolute::isValidIndexedLoad(int &Opc, int &NewOpc) {
+
+ bool Result = true;
+ switch (Opc) {
+ case Hexagon::L2_loadrb_io:
+ NewOpc = Hexagon::L4_loadrb_ap;
+ break;
+ case Hexagon::L2_loadrh_io:
+ NewOpc = Hexagon::L4_loadrh_ap;
+ break;
+ case Hexagon::L2_loadri_io:
+ NewOpc = Hexagon::L4_loadri_ap;
+ break;
+ case Hexagon::L2_loadrd_io:
+ NewOpc = Hexagon::L4_loadrd_ap;
+ break;
+ case Hexagon::L2_loadruh_io:
+ NewOpc = Hexagon::L4_loadruh_ap;
+ break;
+ case Hexagon::L2_loadrub_io:
+ NewOpc = Hexagon::L4_loadrub_ap;
+ break;
+ default:
+ Result = false;
+ }
+
+ return Result;
+}
+
+bool HexagonGenMemAbsolute::isValidIndexedStore(int &Opc, int &NewOpc) {
+
+ bool Result = true;
+ switch (Opc) {
+ case Hexagon::S2_storerd_io:
+ NewOpc = Hexagon::S4_storerd_ap;
+ break;
+ case Hexagon::S2_storeri_io:
+ NewOpc = Hexagon::S4_storeri_ap;
+ break;
+ case Hexagon::S2_storerh_io:
+ NewOpc = Hexagon::S4_storerh_ap;
+ break;
+ case Hexagon::S2_storerb_io:
+ NewOpc = Hexagon::S4_storerb_ap;
+ break;
+ default:
+ Result = false;
+ }
+
+ return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonGenMemAbsolute() {
+ return new HexagonGenMemAbsolute();
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 619c7dc..91cc930 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1655,6 +1655,13 @@ bool HexagonInstrInfo::isPostIncrement(const MachineInstr &MI) const {
return getAddrMode(MI) == HexagonII::PostInc;
}
+bool HexagonInstrInfo::isPostIncWithImmOffset(const MachineInstr &MI) const {
+ unsigned BasePos, OffsetPos;
+ if (!getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
+ return false;
+ return isPostIncrement(MI) && MI.getOperand(OffsetPos).isImm();
+}
+
// Returns true if an instruction is predicated irrespective of the predicate
// sense. For example, all of the following will return true.
// if (p0) R1 = add(R2, R3)
@@ -2436,6 +2443,55 @@ bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const {
Opcode == Hexagon::J2_loop1rext;
}
+bool HexagonInstrInfo::isCircBufferInstr(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case Hexagon::L2_loadalignb_pci:
+ case Hexagon::L2_loadalignb_pcr:
+ case Hexagon::L2_loadalignh_pci:
+ case Hexagon::L2_loadalignh_pcr:
+ case Hexagon::L2_loadbsw2_pci:
+ case Hexagon::L2_loadbsw2_pcr:
+ case Hexagon::L2_loadbsw4_pci:
+ case Hexagon::L2_loadbsw4_pcr:
+ case Hexagon::L2_loadbzw2_pci:
+ case Hexagon::L2_loadbzw2_pcr:
+ case Hexagon::L2_loadbzw4_pci:
+ case Hexagon::L2_loadbzw4_pcr:
+ case Hexagon::L2_loadrb_pci:
+ case Hexagon::L2_loadrb_pcr:
+ case Hexagon::L2_loadrd_pci:
+ case Hexagon::L2_loadrd_pcr:
+ case Hexagon::L2_loadrh_pci:
+ case Hexagon::L2_loadrh_pcr:
+ case Hexagon::L2_loadri_pci:
+ case Hexagon::L2_loadri_pcr:
+ case Hexagon::L2_loadrub_pci:
+ case Hexagon::L2_loadrub_pcr:
+ case Hexagon::L2_loadruh_pci:
+ case Hexagon::L2_loadruh_pcr:
+ case Hexagon::S2_storerbnew_pci:
+ case Hexagon::S2_storerbnew_pcr:
+ case Hexagon::S2_storerb_pci:
+ case Hexagon::S2_storerb_pcr:
+ case Hexagon::S2_storerd_pci:
+ case Hexagon::S2_storerd_pcr:
+ case Hexagon::S2_storerf_pci:
+ case Hexagon::S2_storerf_pcr:
+ case Hexagon::S2_storerhnew_pci:
+ case Hexagon::S2_storerhnew_pcr:
+ case Hexagon::S2_storerh_pci:
+ case Hexagon::S2_storerh_pcr:
+ case Hexagon::S2_storerinew_pci:
+ case Hexagon::S2_storerinew_pcr:
+ case Hexagon::S2_storeri_pci:
+ case Hexagon::S2_storeri_pcr:
+ return true;
+ }
+ return false;
+}
+
bool HexagonInstrInfo::isMemOp(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default: return false;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index e496995..65783c5 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -434,6 +434,8 @@ public:
bool predCanBeUsedAsDotNew(const MachineInstr &MI, Register PredReg) const;
bool PredOpcodeHasJMP_c(unsigned Opcode) const;
bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const;
+ bool isPostIncWithImmOffset(const MachineInstr &MI) const;
+ bool isCircBufferInstr(const MachineInstr &MI) const;
unsigned getAddrMode(const MachineInstr &MI) const;
MachineOperand *getBaseAndOffset(const MachineInstr &MI, int64_t &Offset,
diff --git a/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp
new file mode 100644
index 0000000..4c845f2
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp
@@ -0,0 +1,689 @@
+//===-- HexagonPostIncOpt.cpp - Hexagon Post Increment Optimization Pass --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Convert post-inc addressing mode into base-offset addressing mode.
+// Ex:
+// original loop:
+// v1 = phi(v0, v3)
+// v2,v3 = post_load v1, 4
+
+// Often, the unroller creates the following form of post-increments:
+// v1 = phi(v0, v3')
+// v2,v3 = post_load v1, 4
+// v2',v3'= post_load v3, 4
+
+// This can be optimized in two ways
+
+// 1.
+// v1 = phi(v0, v3')
+// v2,v3' = post_load v1, 8
+// v2' = load v3', -4
+//
+// 2.
+// v1 = phi(v0, v3')
+// v2,v3' = post_load v1, 8
+// v2' = load v1, 4
+//
+// Option 2 is favored as we can packetize two memory operations in a single
+// packet. However, this is not always favorable due to memory dependences,
+// and in cases where we form a bigger chain of post-increment ops it will
+// create more spills, as we cannot execute post-increment ops without
+// executing the base-offset instructions.
+//===----------------------------------------------------------------------===//
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-postincopt"
+
+static cl::opt<unsigned> PostIncChainThreshold(
+ "post-inc-chain-threshold", cl::Hidden, cl::init(4),
+ cl::desc("Limit the number of post-inc instructions in a chain."));
+
+static cl::opt<bool> PreferPostIncStore(
+ "prefer-post-inc-store", cl::Hidden, cl::init(true),
+ cl::desc("Prefer post-inc store in a list of loads and stores."));
+
+namespace llvm {
+void initializeHexagonPostIncOptPass(PassRegistry &);
+FunctionPass *createHexagonPostIncOpt();
+} // namespace llvm
+
+namespace {
+
+class HexagonPostIncOpt : public MachineFunctionPass {
+ MachineLoopInfo *MLI = nullptr;
+ const HexagonInstrInfo *HII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+ const HexagonSubtarget *HST = nullptr;
+
+public:
+ static char ID;
+
+ HexagonPostIncOpt() : MachineFunctionPass(ID) {
+ initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return "Hexagon Post-Inc-Opt Pass"; }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+private:
+ bool translatePostIncsInLoop(MachineBasicBlock &MBB);
+ void replacePostIncWithBaseOffset(MachineBasicBlock &MBB) const;
+ void replacePostIncWithBaseOffset(MachineInstr &MI) const;
+ bool isPostIncInsn(MachineInstr &MI) const;
+ void foldAdds(MachineBasicBlock &MBB) const;
+ void updateBaseAndOffset(MachineInstr &MI, MachineInstr &AddMI) const;
+ void removeDeadInstructions(MachineBasicBlock &MBB) const;
+
+ void generatePostInc(MachineBasicBlock &MBB);
+ bool canReplaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const;
+ void replaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const;
+
+ bool isValidOffset(const MachineInstr &MI, int64_t Offset) const;
+ bool isValidPostIncValue(const MachineInstr &MI, int IncVal) const;
+};
+
+class HexagonPostIncOptSchedDAG : public ScheduleDAGInstrs {
+ HexagonPostIncOpt &Pass;
+
+public:
+ HexagonPostIncOptSchedDAG(HexagonPostIncOpt &P, MachineFunction &MF,
+ MachineLoopInfo *MLI)
+ : ScheduleDAGInstrs(MF, MLI, false), Pass(P){};
+ void schedule() override;
+ ScheduleDAGTopologicalSort &getTopo() { return Topo; };
+};
+
+} // End anonymous namespace.
+
+char HexagonPostIncOpt::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HexagonPostIncOpt, DEBUG_TYPE,
+ "Hexagon Post-Inc-Opt Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(HexagonPostIncOpt, DEBUG_TYPE, "Hexagon Post-Inc-Opt Pass",
+ false, false)
+
+/// Return true if MIA dominates MIB.
+static bool dominates(MachineInstr *MIA, MachineInstr *MIB) {
+ if (MIA->getParent() != MIB->getParent())
+ return false; // Don't know since machine dominator tree is out of date.
+
+ MachineBasicBlock *MBB = MIA->getParent();
+ MachineBasicBlock::iterator I = MBB->instr_begin();
+ // Iterate over the basic block until MIA or MIB is found.
+ for (; &*I != MIA && &*I != MIB; ++I)
+ ;
+
+ // MIA dominates MIB if MIA is found first.
+ return &*I == MIA;
+}
+
+// Return the Phi register value that comes from the loop block.
+static unsigned getLoopPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) {
+ for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2)
+ if (Phi->getOperand(i + 1).getMBB() == LoopBB)
+ return Phi->getOperand(i).getReg();
+ return UINT_MAX;
+}
+
+static bool isAddWithImmValue(const MachineInstr &MI) {
+ // FIXME: For now, only deal with adds that have strict immediate values.
+  // Some A2_addi instructions can be of the form:
+ // %338:intregs = A2_addi %7:intregs, @_ZL7phs_tbl + 16
+ return MI.getOpcode() == Hexagon::A2_addi && MI.getOperand(2).isImm();
+}
+
+// Compute the number of 'real' instructions in the basic block, ignoring
+// debug instructions and terminators.
+static unsigned getBasicBlockSize(MachineBasicBlock &MBB) {
+ unsigned size = 0;
+ for (auto &I : make_range(MBB.begin(), MBB.getFirstTerminator()))
+ if (!I.isDebugInstr())
+ size++;
+ return size;
+}
+
+// Set up the post-increment schedule DAG.
+static void initPISchedDAG(HexagonPostIncOptSchedDAG &PIDAG,
+ MachineBasicBlock &MBB) {
+ PIDAG.startBlock(&MBB);
+ PIDAG.enterRegion(&MBB, MBB.begin(), MBB.getFirstTerminator(),
+ getBasicBlockSize(MBB));
+ // Build the graph.
+ PIDAG.schedule();
+  // exitRegion() is an empty function in the base class, so it is safe to
+  // call it here.
+ PIDAG.exitRegion();
+}
+
+// Check if post-increment candidate has any memory dependence on any
+// instruction in the chain.
+static bool hasMemoryDependency(SUnit *PostIncSU,
+ SmallVector<MachineInstr *, 4> &UseList) {
+
+  // FIXME: Fine-tune the order dependence. Probably only memory-related
+  // OrderKinds need to be considered.
+ for (auto &Dep : PostIncSU->Succs)
+ if (Dep.getKind() == SDep::Order)
+ if (std::find(UseList.begin(), UseList.end(),
+ Dep.getSUnit()->getInstr()) != UseList.end())
+ return true;
+
+ return false;
+}
+
+// Fold an add-with-immediate into either an add, a load, or a store.
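+// A small illustrative sketch (registers and offsets are made up):
+//   r1 = add(r0, #8)
+//   r2 = memw(r1 + #4)    becomes    r2 = memw(r0 + #12)
+//   r3 = add(r1, #16)     becomes    r3 = add(r0, #24)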
+void HexagonPostIncOpt::foldAdds(MachineBasicBlock &MBB) const {
+ LLVM_DEBUG(dbgs() << "#Fold add instructions in this block.\n");
+ for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) {
+ if (!isAddWithImmValue(MI))
+ continue;
+ unsigned DefReg = MI.getOperand(0).getReg();
+ unsigned AddReg = MI.getOperand(1).getReg();
+ int64_t AddImm = MI.getOperand(2).getImm();
+
+ SmallVector<MachineInstr *, 4> UseList;
+ // Gather the uses of add instruction's def reg.
+ for (auto &MO : make_range(MRI->use_begin(DefReg), MRI->use_end())) {
+ MachineInstr *UseMI = MO.getParent();
+      // Only deal with the instructions that belong to this block.
+      // If a use crosses this block, the post-increment generation logic
+      // will not be able to transform it to post-inc due to dominance.
+ if (UseMI->getParent() == &MBB)
+ UseList.push_back(UseMI);
+ }
+
+ if (UseList.empty())
+ continue;
+
+ LLVM_DEBUG({
+ dbgs() << "Current instruction considered for folding \n";
+ MI.dump();
+ });
+
+ for (auto UseMI : UseList) {
+ if (isAddWithImmValue(*UseMI)) {
+ int64_t NewImm = AddImm + UseMI->getOperand(2).getImm();
+        // Fold if the new immediate is within the valid range.
+ if (HII->isValidOffset(UseMI->getOpcode(), NewImm, TRI, false)) {
+ LLVM_DEBUG({
+ UseMI->dump();
+ dbgs() << "\t is folded in to \n";
+ });
+ UseMI->getOperand(1).setReg(AddReg);
+ UseMI->getOperand(2).setImm(NewImm);
+ LLVM_DEBUG(UseMI->dump());
+ }
+ } else if (HII->isBaseImmOffset(*UseMI)) {
+ LLVM_DEBUG({
+ UseMI->dump();
+ dbgs() << "\t is folded in to \n";
+ });
+ updateBaseAndOffset(*UseMI, MI);
+ LLVM_DEBUG(UseMI->dump());
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ }
+ removeDeadInstructions(MBB);
+ LLVM_DEBUG(dbgs() << "#End of the fold instructions logic.\n");
+}
+
+void HexagonPostIncOpt::updateBaseAndOffset(MachineInstr &MI,
+ MachineInstr &AddMI) const {
+ assert(HII->isBaseImmOffset(MI));
+ unsigned BasePos, OffsetPos;
+ if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
+ return;
+
+ MachineOperand &OffsetOp = MI.getOperand(OffsetPos);
+ MachineOperand &BaseOp = MI.getOperand(BasePos);
+
+ if (BaseOp.getReg() != AddMI.getOperand(0).getReg())
+ return;
+
+ unsigned IncBase = AddMI.getOperand(1).getReg();
+ int64_t IncValue = AddMI.getOperand(2).getImm();
+
+ int64_t NewOffset = OffsetOp.getImm() + IncValue;
+ if (!isValidOffset(MI, NewOffset))
+ return;
+
+ OffsetOp.setImm(NewOffset);
+ BaseOp.setReg(IncBase);
+}
+
+void HexagonPostIncOpt::removeDeadInstructions(MachineBasicBlock &MBB) const {
+ // For MBB, check that the value defined by each instruction is used.
+ // If not, delete it.
+ for (MachineBasicBlock::reverse_instr_iterator MI = MBB.instr_rbegin(),
+ ME = MBB.instr_rend();
+ MI != ME;) {
+ // From DeadMachineInstructionElim. Don't delete inline assembly.
+ if (MI->isInlineAsm()) {
+ ++MI;
+ continue;
+ }
+ bool SawStore = false;
+ // Check if it's safe to remove the instruction due to side effects.
+ if (!MI->isSafeToMove(nullptr, SawStore)) {
+ ++MI;
+ continue;
+ }
+ unsigned Uses = 0;
+ for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
+ MOE = MI->operands_end();
+ MOI != MOE; ++MOI) {
+ if (!MOI->isReg() || !MOI->isDef())
+ continue;
+ unsigned reg = MOI->getReg();
+ // Assume physical registers are used.
+ if (Register::isPhysicalRegister(reg)) {
+ Uses++;
+ continue;
+ }
+ if (MRI->use_begin(reg) != MRI->use_end())
+ Uses++;
+ }
+ if (!Uses) {
+ MI++->eraseFromParent();
+ continue;
+ }
+ ++MI;
+ }
+}
+
+bool HexagonPostIncOpt::isPostIncInsn(MachineInstr &MI) const {
+ // Predicated post-increments are not yet handled (ISel does not generate
+ // them yet). Circular buffer instructions should not be handled.
+ return (HII->isPostIncWithImmOffset(MI) && !HII->isPredicated(MI) &&
+ !HII->isCircBufferInstr(MI));
+}
+
+/// For instructions with a base and offset, return true if the new Offset
+/// is a valid value with the correct alignment.
+bool HexagonPostIncOpt::isValidOffset(const MachineInstr &MI,
+ int64_t Offset) const {
+ if (!HII->isValidOffset(MI.getOpcode(), Offset, TRI, false))
+ return false;
+ unsigned AlignMask = HII->getMemAccessSize(MI) - 1;
+ return (Offset & AlignMask) == 0;
+}
+
+bool HexagonPostIncOpt::isValidPostIncValue(const MachineInstr &MI,
+ int IncVal) const {
+ unsigned AlignMask = HII->getMemAccessSize(MI) - 1;
+ if ((IncVal & AlignMask) != 0)
+ return false;
+
+ // Total number of bits in the instruction used to encode the Inc value.
+ unsigned IncBits = 4;
+ // For HVX instructions, this field is 3 bits.
+ if (HexagonII::isCVI(MI.getDesc()))
+ IncBits = 3;
+
+ IncBits += Log2_32(HII->getMemAccessSize(MI));
+ if (HII->getMemAccessSize(MI) > 8)
+ IncBits = 16;
+
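+ // For example (illustrative only): a 4-byte scalar access gives
+ // IncBits = 4 + 2 = 6, i.e. a signed range of [-32, 31]; together with the
+ // alignment check above, the accepted increments are -32..28 in steps of 4.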
+ int MinValidVal = -1U << (IncBits - 1);
+ int MaxValidVal = ~(-1U << (IncBits - 1));
+ return (IncVal >= MinValidVal && IncVal <= MaxValidVal);
+}
+
+void HexagonPostIncOptSchedDAG::schedule() {
+ AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
+ buildSchedGraph(AA);
+}
+
+// Replace post-increment operations with their base+offset counterparts.
+void HexagonPostIncOpt::replacePostIncWithBaseOffset(
+ MachineBasicBlock &MBB) const {
+ LLVM_DEBUG(dbgs() << "#Replacing post-increment instructions with "
+ "base+offset counterparts.\n");
+
+ SmallVector<MachineInstr *, 4> MIList;
+ for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) {
+ // Check for eligible post-inc candidates.
+ if (!isPostIncInsn(MI))
+ continue;
+ MIList.push_back(&MI);
+ }
+
+ for (auto MI : MIList)
+ replacePostIncWithBaseOffset(*MI);
+
+ LLVM_DEBUG(dbgs() << "#Done with replacing post-increment instructions.\n");
+}
+
+void HexagonPostIncOpt::replacePostIncWithBaseOffset(MachineInstr &MI) const {
+ short NewOpcode = HII->changeAddrMode_pi_io(MI.getOpcode());
+ if (NewOpcode < 0)
+ return;
+
+ unsigned BasePos = 0, OffsetPos = 0;
+ if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
+ return;
+ const MachineOperand &PostIncOffset = MI.getOperand(OffsetPos);
+ const MachineOperand &PostIncBase = MI.getOperand(BasePos);
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand *PostIncDest;
+ MachineInstrBuilder MIB;
+ if (MI.mayLoad()) {
+ PostIncDest = &MI.getOperand(1);
+ const MachineOperand &LDValue = MI.getOperand(0);
+ MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode));
+ MIB.add(LDValue).add(PostIncBase).addImm(0);
+ } else {
+ PostIncDest = &MI.getOperand(0);
+ const MachineOperand &STValue = MI.getOperand(3);
+ MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode));
+ MIB.add(PostIncBase).addImm(0).add(STValue);
+ }
+
+ // Transfer memoperands.
+ MIB->cloneMemRefs(*MBB.getParent(), MI);
+
+ // Create an add instruction for the post-inc addition of offset.
+ MachineInstrBuilder MIBA = BuildMI(MBB, MI, DL, HII->get(Hexagon::A2_addi));
+ MIBA.add(*PostIncDest).add(PostIncBase).add(PostIncOffset);
+
+ LLVM_DEBUG({
+ dbgs() << "\n";
+ MI.dump();
+ dbgs() << "\tis tranformed to \n";
+ MIB->dump();
+ MIBA->dump();
+ dbgs() << "\n\n";
+ });
+
+ MI.eraseFromParent();
+}
+
+void HexagonPostIncOpt::generatePostInc(MachineBasicBlock &MBB) {
+ LLVM_DEBUG(dbgs() << "# Generate Post-inc and update uses if needed.\n");
+ MachineBasicBlock::iterator MII = MBB.getFirstNonPHI();
+ MachineBasicBlock::iterator MIE = MBB.instr_begin();
+ bool isOK = true;
+ while (MII != MIE) {
+ MachineInstr *Phi = &*std::prev(MII);
+ MII = std::prev(MII);
+ unsigned LoopVal = getLoopPhiReg(Phi, &MBB);
+ if (LoopVal == UINT_MAX)
+ continue;
+ MachineInstr *LoopInst = MRI->getVRegDef(LoopVal);
+ if (!isAddWithImmValue(*LoopInst))
+ continue;
+
+ if (LoopInst->getOpcode() != Hexagon::A2_addi)
+ continue;
+
+ unsigned AddReg = LoopInst->getOperand(1).getReg();
+ int64_t AddImm = LoopInst->getOperand(2).getImm();
+ SmallVector<MachineInstr *, 4> UseList;
+ MachineInstr *PostIncCandidate = nullptr;
+
+ // Find the probable candidates for Post-increment instruction.
+ SmallVector<MachineInstr *, 4> CandList;
+ for (auto &MO : make_range(MRI->use_begin(AddReg), MRI->use_end())) {
+ MachineInstr *UseMI = MO.getParent();
+
+ if (UseMI == LoopInst)
+ continue;
+
+ if (!dominates(UseMI, LoopInst)) {
+ isOK = false;
+ break;
+ }
+ const MachineOperand *BaseOp = nullptr;
+ int64_t Offset;
+ bool OffsetIsScalable;
+ if (!HII->isBaseImmOffset(*UseMI) ||
+ !HII->getMemOperandWithOffset(*UseMI, BaseOp, Offset,
+ OffsetIsScalable, TRI)) {
+ isOK = false;
+ break;
+ }
+ int64_t NewOffset = Offset - AddImm;
+ if (!isValidOffset(*UseMI, NewOffset) || !BaseOp->isReg() ||
+ BaseOp->getReg() != AddReg) {
+ isOK = false;
+ break;
+ }
+ if (OffsetIsScalable) {
+ isOK = false;
+ break;
+ }
+ if (Offset == 0) {
+ // If there are stores in the chain, make sure they are at the beginning
+ // of the list. E.g., LD, LD, ST, ST will end up as LD, LD, PostInc_ST,
+ // ST.
+ if (UseMI->mayStore() && PreferPostIncStore)
+ CandList.insert(CandList.begin(), UseMI);
+ else
+ CandList.push_back(UseMI);
+ continue;
+ }
+ UseList.push_back(UseMI);
+ }
+
+ if (!isOK)
+ continue;
+
+ for (auto MI : CandList) {
+ if (!PostIncCandidate)
+ PostIncCandidate = MI;
+ // Push the rest of the list to be updated later.
+ else
+ UseList.push_back(MI);
+ }
+
+ // If a candidate is found, replace it with the post-inc instruction.
+ // Also, adjust offset for other uses as needed.
+ if (!PostIncCandidate || !canReplaceWithPostInc(PostIncCandidate, LoopInst))
+ continue;
+
+ // Logic to determine what the base register should be.
+ // There are two choices:
+ // 1. New address register after we updated the post-increment candidate.
+ // v2,v3 = post_load v1, 4
+ // v3 is the choice here.
+ // 2. The base register we used in post-increment candidate.
+ // v2,v3 = post_load v1, 4
+ // v1 is the choice here.
+ // Use v3 if there is a memory dependence between post-inc instruction and
+ // any other instruction in the chain.
+ // FIXME: We can do some complex DAG analysis based off height and depth and
+ // selectively update other instructions in the chain. Use v3 if there are
+ // more instructions in the chain, otherwise we will end up increasing the
+ // height of the DAG resulting in more spills. By default we have a
+ // threshold controlled by the option "post-inc-chain-threshold" which is
+ // set to 4. v1 is preferred as we can packetize two memory operations in a
+ // single packet in scalar core. But it heavily depends on the structure of
+ // DAG.
+ bool UpdateBaseToNew = false;
+
+ // Do not bother to build a DAG and analyze if the Use list is empty.
+ if (!UseList.empty()) {
+ MachineFunction *MF = MBB.getParent();
+ // Setup the Post-inc schedule DAG.
+ HexagonPostIncOptSchedDAG PIDAG(*this, *MF, MLI);
+ initPISchedDAG(PIDAG, MBB);
+ SUnit *SU = PIDAG.getSUnit(PostIncCandidate);
+ if (hasMemoryDependency(SU, UseList) ||
+ UseList.size() >= PostIncChainThreshold)
+ UpdateBaseToNew = true;
+ }
+
+ if (UpdateBaseToNew) {
+ LLVM_DEBUG(dbgs() << "The heuristic determines to update the uses of the "
+ "base register of post-increment\n");
+ for (auto UseMI : UseList) {
+ if (!dominates(PostIncCandidate, UseMI))
+ continue;
+ unsigned BasePos, OffsetPos;
+ if (HII->getBaseAndOffsetPosition(*UseMI, BasePos, OffsetPos)) {
+ // New offset has already been validated; no need to do it again.
+ LLVM_DEBUG({
+ UseMI->dump();
+ dbgs() << "\t is transformed to \n";
+ });
+ int64_t NewOffset = UseMI->getOperand(OffsetPos).getImm() - AddImm;
+ UseMI->getOperand(OffsetPos).setImm(NewOffset);
+ UseMI->getOperand(BasePos).setReg(LoopVal);
+ LLVM_DEBUG(UseMI->dump());
+ }
+ }
+ }
+ replaceWithPostInc(PostIncCandidate, LoopInst);
+ }
+ LLVM_DEBUG(dbgs() << "# End of generation of Post-inc.\n");
+}
+
+bool HexagonPostIncOpt::canReplaceWithPostInc(MachineInstr *MI,
+ MachineInstr *AddMI) const {
+ if (HII->changeAddrMode_io_pi(MI->getOpcode()) < 0)
+ return false;
+ assert(AddMI->getOpcode() == Hexagon::A2_addi);
+ return isValidPostIncValue(*MI, AddMI->getOperand(2).getImm());
+}
+
+void HexagonPostIncOpt::replaceWithPostInc(MachineInstr *MI,
+ MachineInstr *AddMI) const {
+ short NewOpcode = HII->changeAddrMode_io_pi(MI->getOpcode());
+ assert(NewOpcode >= 0 &&
+ "Couldn't change base offset to post-increment form");
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ const MachineOperand &IncDest = AddMI->getOperand(0);
+ const MachineOperand &IncBase = AddMI->getOperand(1);
+ const MachineOperand &IncValue = AddMI->getOperand(2);
+ MachineInstrBuilder MIB;
+ LLVM_DEBUG({
+ dbgs() << "\n\n";
+ MI->dump();
+ dbgs() << "\t is tranformed to post-inc form of \n";
+ });
+
+ if (MI->mayLoad()) {
+ const MachineOperand &LDValue = MI->getOperand(0);
+ MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode));
+ MIB.add(LDValue).add(IncDest).add(IncBase).add(IncValue);
+ } else {
+ const MachineOperand &STValue = MI->getOperand(2);
+ MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode));
+ MIB.add(IncDest).add(IncBase).add(IncValue).add(STValue);
+ }
+
+ // Transfer memoperands.
+ MIB->cloneMemRefs(*MBB.getParent(), *MI);
+
+ LLVM_DEBUG({
+ MIB->dump();
+ dbgs() << "As a result this add instruction is erased.\n";
+ AddMI->dump();
+ });
+
+ MI->eraseFromParent();
+ AddMI->eraseFromParent();
+}
+
+bool HexagonPostIncOpt::translatePostIncsInLoop(MachineBasicBlock &MBB) {
+ // Algorithm:
+ // 1. Replace all the post-inc instructions with a base+offset instruction
+ // and an add instruction in this block.
+ // 2. Fold all the adds into their respective uses.
+ // 3. Generate post-increment instructions and update the uses of the base
+ // register if needed based on constraints.
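+ //
+ // Conceptually (operand syntax simplified, not exact MIR):
+ //   v2, v3 = post_load v1, 4
+ // is first split into
+ //   v2 = load v1, 0
+ //   v3 = add v1, 4
+ // The add is then folded into its users where legal, and a post-increment
+ // form is re-generated where the constraints allow it.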
+
+ replacePostIncWithBaseOffset(MBB);
+ foldAdds(MBB);
+ generatePostInc(MBB);
+ return true;
+}
+
+bool HexagonPostIncOpt::runOnMachineFunction(MachineFunction &MF) {
+
+ // Skip pass if requested.
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ // Get Target Information.
+ MLI = &getAnalysis<MachineLoopInfo>();
+ HST = &MF.getSubtarget<HexagonSubtarget>();
+ TRI = HST->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ HII = HST->getInstrInfo();
+
+ // Skip this pass for TinyCore.
+ // Tiny Core allows partial post-increment operations - this constraint can
+ // be imposed inside the pass. In a chain of post-increments, the first can
+ // be a post-increment; the rest can be adjusted to base+offset (these are
+ // inexpensive in most cases).
+ if (HST->isTinyCore())
+ return false;
+
+ LLVM_DEBUG({
+ dbgs() << "Begin: Hexagon Post-Inc-Opt Pass.\n";
+ dbgs() << "Function: " << MF.getName() << "\n";
+ });
+ bool Change = false;
+ std::vector<MachineBasicBlock *> MLBB;
+ for (auto &BB : MF) {
+ // Check if this Basic Block belongs to any loop.
+ auto *LI = MLI->getLoopFor(&BB);
+ // We only deal with innermost loops that have a single block.
+ if (LI && LI->getBlocks().size() == 1) {
+ MachineBasicBlock *MBB = LI->getHeader();
+ // Do not traverse blocks that are already visited.
+ if (std::find(MLBB.begin(), MLBB.end(), MBB) != MLBB.end())
+ continue;
+
+ MLBB.push_back(MBB);
+
+ LLVM_DEBUG(dbgs() << "\n\t Basic Block: " << MBB->getName() << "\n");
+ Change |= translatePostIncsInLoop(*MBB);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "End: Hexagon Post-Inc-Opt Pass\n");
+ return Change;
+}
+
+FunctionPass *llvm::createHexagonPostIncOpt() {
+ return new HexagonPostIncOpt();
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 7d4b420..a5ebd64 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -65,6 +65,10 @@ static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets",
cl::init(true), cl::Hidden,
cl::desc("Early expansion of MUX"));
+static cl::opt<bool> EnableTfrCleanup("hexagon-tfr-cleanup", cl::init(true),
+ cl::Hidden,
+ cl::desc("Cleanup of TFRs/COPYs"));
+
static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden,
cl::desc("Enable early if-conversion"));
@@ -92,6 +96,10 @@ static cl::opt<bool>
static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden,
cl::desc("Disable splitting double registers"));
+static cl::opt<bool>
+ EnableGenMemAbs("hexagon-mem-abs", cl::init(true), cl::Hidden,
+ cl::desc("Generate absolute set instructions"));
+
static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true),
cl::Hidden, cl::desc("Bit simplification"));
@@ -121,6 +129,10 @@ static cl::opt<bool> EnableInstSimplify("hexagon-instsimplify", cl::Hidden,
cl::init(true),
cl::desc("Enable instsimplify"));
+static cl::opt<bool> DisableHexagonPostIncOpt(
+ "hexagon-postinc-opt", cl::Hidden,
+ cl::desc("Disable Hexagon post-increment optimization"));
+
/// HexagonTargetMachineModule - Note that this is used on hosts that
/// cannot link in a library unless there are references into the
/// library. In particular, it seems that it is not possible to get
@@ -145,20 +157,24 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
namespace llvm {
extern char &HexagonExpandCondsetsID;
+ extern char &HexagonTfrCleanupID;
void initializeHexagonBitSimplifyPass(PassRegistry&);
void initializeHexagonConstExtendersPass(PassRegistry&);
void initializeHexagonConstPropagationPass(PassRegistry&);
void initializeHexagonCopyToCombinePass(PassRegistry&);
void initializeHexagonEarlyIfConversionPass(PassRegistry&);
void initializeHexagonExpandCondsetsPass(PassRegistry&);
+ void initializeHexagonGenMemAbsolutePass(PassRegistry &);
void initializeHexagonGenMuxPass(PassRegistry&);
void initializeHexagonHardwareLoopsPass(PassRegistry&);
void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
void initializeHexagonNewValueJumpPass(PassRegistry&);
void initializeHexagonOptAddrModePass(PassRegistry&);
void initializeHexagonPacketizerPass(PassRegistry&);
+ void initializeHexagonPostIncOptPass(PassRegistry &);
void initializeHexagonRDFOptPass(PassRegistry&);
void initializeHexagonSplitDoubleRegsPass(PassRegistry&);
+ void initializeHexagonTfrCleanupPass(PassRegistry &);
void initializeHexagonVExtractPass(PassRegistry &);
void initializeHexagonVectorCombineLegacyPass(PassRegistry&);
void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &);
@@ -177,6 +193,7 @@ namespace llvm {
FunctionPass *createHexagonFixupHwLoops();
FunctionPass *createHexagonGenExtract();
FunctionPass *createHexagonGenInsert();
+ FunctionPass *createHexagonGenMemAbsolute();
FunctionPass *createHexagonGenMux();
FunctionPass *createHexagonGenPredicate();
FunctionPass *createHexagonHardwareLoops();
@@ -188,10 +205,12 @@ namespace llvm {
FunctionPass *createHexagonOptimizeSZextends();
FunctionPass *createHexagonPacketizer(bool Minimal);
FunctionPass *createHexagonPeephole();
+ FunctionPass *createHexagonPostIncOpt();
FunctionPass *createHexagonRDFOpt();
FunctionPass *createHexagonSplitConst32AndConst64();
FunctionPass *createHexagonSplitDoubleRegs();
FunctionPass *createHexagonStoreWidening();
+ FunctionPass *createHexagonTfrCleanup();
FunctionPass *createHexagonVectorCombineLegacyPass();
FunctionPass *createHexagonVectorPrint();
FunctionPass *createHexagonVExtract();
@@ -211,12 +230,14 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() {
initializeHexagonConstPropagationPass(PR);
initializeHexagonCopyToCombinePass(PR);
initializeHexagonEarlyIfConversionPass(PR);
+ initializeHexagonGenMemAbsolutePass(PR);
initializeHexagonGenMuxPass(PR);
initializeHexagonHardwareLoopsPass(PR);
initializeHexagonLoopIdiomRecognizeLegacyPassPass(PR);
initializeHexagonNewValueJumpPass(PR);
initializeHexagonOptAddrModePass(PR);
initializeHexagonPacketizerPass(PR);
+ initializeHexagonPostIncOptPass(PR);
initializeHexagonRDFOptPass(PR);
initializeHexagonSplitDoubleRegsPass(PR);
initializeHexagonVectorCombineLegacyPass(PR);
@@ -244,6 +265,8 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
(HexagonNoOpt ? CodeGenOptLevel::None : OL)),
TLOF(std::make_unique<HexagonTargetObjectFile>()) {
initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
+ initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry());
+ initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry());
initAsmInfo();
}
@@ -411,11 +434,20 @@ void HexagonPassConfig::addPreRegAlloc() {
addPass(createHexagonConstExtenders());
if (EnableExpandCondsets)
insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID);
+ if (EnableTfrCleanup)
+ insertPass(&VirtRegRewriterID, &HexagonTfrCleanupID);
if (!DisableStoreWidening)
addPass(createHexagonStoreWidening());
+ if (EnableGenMemAbs)
+ addPass(createHexagonGenMemAbsolute());
if (!DisableHardwareLoops)
addPass(createHexagonHardwareLoops());
}
+
+ if (TM->getOptLevel() >= CodeGenOptLevel::Aggressive)
+ if (!DisableHexagonPostIncOpt)
+ addPass(createHexagonPostIncOpt());
+
if (TM->getOptLevel() >= CodeGenOptLevel::Default)
addPass(&MachinePipelinerID);
}
diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
new file mode 100644
index 0000000..a4b359a
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
@@ -0,0 +1,324 @@
+//===------- HexagonTfrCleanup.cpp - Hexagon Transfer Cleanup Pass -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This pass addresses a situation that appears after register allocation
+// every now and then, namely a register copy from a source that was defined
+// as an immediate value in the same block (usually just before the copy).
+//
+// Here is an example of actual code emitted that shows this problem:
+//
+// .LBB0_5:
+// {
+// r5 = zxtb(r8)
+// r6 = or(r6, ##12345)
+// }
+// {
+// r3 = xor(r1, r2)
+// r1 = #0 <-- r1 set to #0
+// }
+// {
+// r7 = r1 <-- r7 set to r1
+// r0 = zxtb(r3)
+// }
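+//
+// Since r1 is known to hold the constant #0 at the copy, the copy "r7 = r1"
+// can instead be emitted as an immediate transfer "r7 = #0"; rewriting such
+// copies (and dropping redundant self-copies) is what this pass does.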
+
+#define DEBUG_TYPE "tfr-cleanup"
+#include "HexagonTargetMachine.h"
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace llvm {
+FunctionPass *createHexagonTfrCleanup();
+void initializeHexagonTfrCleanupPass(PassRegistry &);
+} // namespace llvm
+
+namespace {
+class HexagonTfrCleanup : public MachineFunctionPass {
+public:
+ static char ID;
+ HexagonTfrCleanup() : MachineFunctionPass(ID), HII(0), TRI(0) {
+ PassRegistry &R = *PassRegistry::getPassRegistry();
+ initializeHexagonTfrCleanupPass(R);
+ }
+ StringRef getPassName() const override { return "Hexagon TFR Cleanup"; }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ const HexagonInstrInfo *HII;
+ const TargetRegisterInfo *TRI;
+
+ typedef DenseMap<unsigned, uint64_t> ImmediateMap;
+
+ bool isIntReg(unsigned Reg, bool &Is32);
+ void setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap);
+ bool getReg(unsigned Reg, uint64_t &Val, ImmediateMap &IMap);
+ bool updateImmMap(MachineInstr *MI, ImmediateMap &IMap);
+ bool rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap, SlotIndexes *Indexes);
+ bool eraseIfRedundant(MachineInstr *MI, SlotIndexes *Indexes);
+};
+} // namespace
+
+char HexagonTfrCleanup::ID = 0;
+
+namespace llvm {
+char &HexagonTfrCleanupID = HexagonTfrCleanup::ID;
+}
+
+bool HexagonTfrCleanup::isIntReg(unsigned Reg, bool &Is32) {
+ Is32 = Hexagon::IntRegsRegClass.contains(Reg);
+ return Is32 || Hexagon::DoubleRegsRegClass.contains(Reg);
+}
+
+// Assign the given value V32 to the specified register R32 in the map. Only
+// 32-bit registers are valid arguments.
+void HexagonTfrCleanup::setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap) {
+ ImmediateMap::iterator F = IMap.find(R32);
+ if (F == IMap.end())
+ IMap.insert(std::make_pair(R32, V32));
+ else
+ F->second = V32;
+}
+
+// Retrieve the value of the provided register Reg and store it into Val.
+// Return "true" if a value was found, "false" otherwise.
+bool HexagonTfrCleanup::getReg(unsigned Reg, uint64_t &Val,
+ ImmediateMap &IMap) {
+ bool Is32;
+ if (!isIntReg(Reg, Is32))
+ return false;
+
+ if (Is32) {
+ ImmediateMap::iterator F = IMap.find(Reg);
+ if (F == IMap.end())
+ return false;
+ Val = F->second;
+ return true;
+ }
+
+ // For 64-bit registers, compose the value from the values of its
+ // subregisters.
+ unsigned SubL = TRI->getSubReg(Reg, Hexagon::isub_lo);
+ unsigned SubH = TRI->getSubReg(Reg, Hexagon::isub_hi);
+ ImmediateMap::iterator FL = IMap.find(SubL), FH = IMap.find(SubH);
+ if (FL == IMap.end() || FH == IMap.end())
+ return false;
+ Val = (FH->second << 32) | FL->second;
+ return true;
+}
+
+// Process an instruction and record the relevant information in the
+// immediate map.
+bool HexagonTfrCleanup::updateImmMap(MachineInstr *MI, ImmediateMap &IMap) {
+ using namespace Hexagon;
+
+ if (MI->isCall()) {
+ IMap.clear();
+ return true;
+ }
+
+ // If this is an instruction that loads a constant into a register,
+ // record this information in IMap.
+ unsigned Opc = MI->getOpcode();
+ if (Opc == A2_tfrsi || Opc == A2_tfrpi) {
+ unsigned DefR = MI->getOperand(0).getReg();
+ bool Is32;
+ if (!isIntReg(DefR, Is32))
+ return false;
+ if (!MI->getOperand(1).isImm()) {
+ if (!Is32) {
+ IMap.erase(TRI->getSubReg(DefR, isub_lo));
+ IMap.erase(TRI->getSubReg(DefR, isub_hi));
+ } else {
+ IMap.erase(DefR);
+ }
+ return false;
+ }
+ uint64_t Val = MI->getOperand(1).getImm();
+ // If it's a 64-bit register, break it up into subregisters.
+ if (!Is32) {
+ uint32_t VH = (Val >> 32), VL = (Val & 0xFFFFFFFFU);
+ setReg(TRI->getSubReg(DefR, isub_lo), VL, IMap);
+ setReg(TRI->getSubReg(DefR, isub_hi), VH, IMap);
+ } else {
+ setReg(DefR, Val, IMap);
+ }
+ return true;
+ }
+
+ // Not an A2_tfr[sp]i. Invalidate all modified registers in IMap.
+ for (MachineInstr::mop_iterator Mo = MI->operands_begin(),
+ E = MI->operands_end();
+ Mo != E; ++Mo) {
+ if (Mo->isRegMask()) {
+ IMap.clear();
+ return true;
+ }
+ if (!Mo->isReg() || !Mo->isDef())
+ continue;
+ unsigned R = Mo->getReg();
+ for (MCRegAliasIterator AR(R, TRI, true); AR.isValid(); ++AR) {
+ ImmediateMap::iterator F = IMap.find(*AR);
+ if (F != IMap.end())
+ IMap.erase(F);
+ }
+ }
+ return true;
+}
+
+// Rewrite the instruction as A2_tfrsi/A2_tfrpi if it is a copy of a source
+// that has a known constant value.
+bool HexagonTfrCleanup::rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap,
+ SlotIndexes *Indexes) {
+ using namespace Hexagon;
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case A2_tfr:
+ case A2_tfrp:
+ case COPY:
+ break;
+ default:
+ return false;
+ }
+
+ unsigned DstR = MI->getOperand(0).getReg();
+ unsigned SrcR = MI->getOperand(1).getReg();
+ bool Tmp, Is32;
+ if (!isIntReg(DstR, Is32) || !isIntReg(SrcR, Tmp))
+ return false;
+ assert(Tmp == Is32 && "Register size mismatch");
+ uint64_t Val;
+ bool Found = getReg(SrcR, Val, IMap);
+ if (!Found)
+ return false;
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ int64_t SVal = Is32 ? int32_t(Val) : Val;
+ auto &HST = B.getParent()->getSubtarget<HexagonSubtarget>();
+ MachineInstr *NewMI;
+ if (Is32)
+ NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrsi), DstR).addImm(SVal);
+ else if (isInt<8>(SVal))
+ NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrpi), DstR).addImm(SVal);
+ else if (isInt<8>(SVal >> 32) && isInt<8>(int32_t(Val & 0xFFFFFFFFLL)))
+ NewMI = BuildMI(B, MI, DL, HII->get(A2_combineii), DstR)
+ .addImm(int32_t(SVal >> 32))
+ .addImm(int32_t(Val & 0xFFFFFFFFLL));
+ else if (HST.isTinyCore())
+ // Disable generating CONST64 since it requires load resource.
+ return false;
+ else
+ NewMI = BuildMI(B, MI, DL, HII->get(CONST64), DstR).addImm(Val);
+
+ // Replace the MI to reuse the same slot index
+ if (Indexes)
+ Indexes->replaceMachineInstrInMaps(*MI, *NewMI);
+ MI->eraseFromParent();
+ return true;
+}
+
+// Remove the instruction if it is a self-assignment.
+bool HexagonTfrCleanup::eraseIfRedundant(MachineInstr *MI,
+ SlotIndexes *Indexes) {
+ unsigned Opc = MI->getOpcode();
+ unsigned DefR, SrcR;
+ bool IsUndef = false;
+ switch (Opc) {
+ case Hexagon::A2_tfr:
+ // Rd = Rd
+ DefR = MI->getOperand(0).getReg();
+ SrcR = MI->getOperand(1).getReg();
+ IsUndef = MI->getOperand(1).isUndef();
+ break;
+ case Hexagon::A2_tfrt:
+ case Hexagon::A2_tfrf:
+ // if ([!]Pu) Rd = Rd
+ DefR = MI->getOperand(0).getReg();
+ SrcR = MI->getOperand(2).getReg();
+ IsUndef = MI->getOperand(2).isUndef();
+ break;
+ default:
+ return false;
+ }
+ if (DefR != SrcR)
+ return false;
+ if (IsUndef) {
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ auto DefI = BuildMI(B, MI, DL, HII->get(TargetOpcode::IMPLICIT_DEF), DefR);
+ for (auto &Op : MI->operands())
+ if (Op.isReg() && Op.isDef() && Op.isImplicit())
+ DefI->addOperand(Op);
+ }
+
+ if (Indexes)
+ Indexes->removeMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ return true;
+}
+
+bool HexagonTfrCleanup::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ // Map: 32-bit register -> immediate value.
+ // 64-bit registers are stored through their subregisters.
+ ImmediateMap IMap;
+ SlotIndexes *Indexes = this->getAnalysisIfAvailable<SlotIndexes>();
+
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ HII = HST.getInstrInfo();
+ TRI = HST.getRegisterInfo();
+
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ MachineBasicBlock &B = *I;
+ MachineBasicBlock::iterator J, F, NextJ;
+ IMap.clear();
+ bool Inserted = false, Erased = false;
+ for (J = B.begin(), F = B.end(); J != F; J = NextJ) {
+ NextJ = std::next(J);
+ MachineInstr *MI = &*J;
+ bool E = eraseIfRedundant(MI, Indexes);
+ Erased |= E;
+ if (E)
+ continue;
+ Inserted |= rewriteIfImm(MI, IMap, Indexes);
+ MachineBasicBlock::iterator NewJ = std::prev(NextJ);
+ updateImmMap(&*NewJ, IMap);
+ }
+ bool BlockC = Inserted | Erased;
+ Changed |= BlockC;
+ if (BlockC && Indexes)
+ Indexes->repairIndexesInRange(&B, B.begin(), B.end());
+ }
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+INITIALIZE_PASS(HexagonTfrCleanup, "tfr-cleanup", "Hexagon TFR Cleanup", false,
+ false)
+
+FunctionPass *llvm::createHexagonTfrCleanup() {
+ return new HexagonTfrCleanup();
+}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index ca98269..9840412 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -18,6 +18,7 @@
#include "HexagonDepITypes.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/MC/MCInstrDesc.h"
namespace llvm {
@@ -48,7 +49,7 @@ namespace HexagonII {
// MCInstrDesc TSFlags
// *** Must match HexagonInstrFormat*.td ***
- enum {
+ enum HexagonTSFlagsVal {
// This 7-bit field describes the insn type.
TypePos = 0,
TypeMask = 0x7f,
@@ -173,6 +174,11 @@ namespace HexagonII {
hasUnaryRestrictionMask = 0x1,
};
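+ // Extract a TSFlags field given its bit position and mask, as defined by
+ // the enums above.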
+ inline unsigned getTSFlags(const MCInstrDesc &MID, HexagonTSFlagsVal Pos,
+ unsigned Mask) {
+ return (MID.TSFlags >> Pos) & Mask;
+ }
+
// *** The code above must match HexagonInstrFormat*.td *** //
// Hexagon specific MO operand flag mask.
@@ -275,6 +281,10 @@ namespace HexagonII {
INST_ICLASS_ALU32_3 = 0xf0000000
};
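+ // Return true if this is an HVX (CVI) instruction, as indicated by its
+ // isCVI TSFlags bit.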
+ inline bool isCVI(const MCInstrDesc &MID) {
+ return getTSFlags(MID, isCVIPos, isCVIMask) != 0;
+ }
+
LLVM_ATTRIBUTE_UNUSED
static unsigned getMemAccessSizeInBytes(MemAccessSize S) {
switch (S) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ded2f25..3ff8994 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2135,6 +2135,21 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
+ if (Opcode == NVPTX::StoreRetvalI8) {
+ // Fine tune the opcode depending on the size of the operand.
+ // This helps to avoid creating redundant COPY instructions in
+ // InstrEmitter::AddRegisterOperand().
+ switch (Ops[0].getSimpleValueType().SimpleTy) {
+ default:
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreRetvalI8TruncI32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::StoreRetvalI8TruncI64;
+ break;
+ }
+ }
break;
case 2:
Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
@@ -2211,6 +2226,21 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
NVPTX::StoreParamI8, NVPTX::StoreParamI16,
NVPTX::StoreParamI32, NVPTX::StoreParamI64,
NVPTX::StoreParamF32, NVPTX::StoreParamF64);
+ if (Opcode == NVPTX::StoreParamI8) {
+ // Fine tune the opcode depending on the size of the operand.
+ // This helps to avoid creating redundant COPY instructions in
+ // InstrEmitter::AddRegisterOperand().
+ switch (Ops[0].getSimpleValueType().SimpleTy) {
+ default:
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreParamI8TruncI32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::StoreParamI8TruncI64;
+ break;
+ }
+ }
break;
case 2:
Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7d2fe78..66a1010 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -47,6 +47,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
@@ -59,6 +60,7 @@
#include <cmath>
#include <cstdint>
#include <iterator>
+#include <optional>
#include <sstream>
#include <string>
#include <utility>
@@ -1529,6 +1531,105 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
return DL.getABITypeAlign(Ty);
}
+static bool adjustElementType(EVT &ElementType) {
+ switch (ElementType.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::f16:
+ case MVT::bf16:
+ ElementType = MVT::i16;
+ return true;
+ case MVT::f32:
+ case MVT::v2f16:
+ case MVT::v2bf16:
+ ElementType = MVT::i32;
+ return true;
+ case MVT::f64:
+ ElementType = MVT::i64;
+ return true;
+ }
+}
+
+// Use byte-store when the param address of the argument value is unaligned.
+// This may happen when the argument is a field of a packed structure.
+//
+// This is called in LowerCall() when passing the param values.
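+//
+// For example, an f32 argument at a 1-byte-aligned param offset is bitcast to
+// i32 and emitted as four consecutive st.param.b8 truncating stores of
+// (value >> 8*i) for i = 0..3.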
+static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
+ uint64_t Offset, EVT ElementType,
+ SDValue StVal, SDValue &InGlue,
+ unsigned ArgID, const SDLoc &dl) {
+ // Bit logic only works on integer types
+ if (adjustElementType(ElementType))
+ StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
+
+ // Store each byte
+ SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+ // Shift the byte to the last byte position
+ SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
+ DAG.getConstant(i * 8, dl, MVT::i32));
+ SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
+ DAG.getConstant(Offset + i, dl, MVT::i32),
+ ShiftVal, InGlue};
+ // Truncating store of only the last byte, using st.param.b8.
+ // The register type can be larger than b8.
+ Chain = DAG.getMemIntrinsicNode(
+ NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
+ MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
+ InGlue = Chain.getValue(1);
+ }
+ return Chain;
+}
+
+// Use byte-load when the param address of the returned value is unaligned.
+// This may happen when the returned value is a field of a packed structure.
+static SDValue
+LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
+ EVT ElementType, SDValue &InGlue,
+ SmallVectorImpl<SDValue> &TempProxyRegOps,
+ const SDLoc &dl) {
+ // Bit logic only works on integer types
+ EVT MergedType = ElementType;
+ adjustElementType(MergedType);
+
+ // Load each byte and construct the whole value. Initialize it to 0.
+ SDValue RetVal = DAG.getConstant(0, dl, MergedType);
+ // LoadParamMemI8 loads into i16 register only
+ SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
+ for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+ SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(Offset + i, dl, MVT::i32),
+ InGlue};
+ // This will be selected to LoadParamMemI8
+ SDValue LdVal =
+ DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
+ MVT::i8, MachinePointerInfo(), Align(1));
+ SDValue TmpLdVal = LdVal.getValue(0);
+ Chain = LdVal.getValue(1);
+ InGlue = LdVal.getValue(2);
+
+ TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
+ TmpLdVal.getSimpleValueType(), TmpLdVal);
+ TempProxyRegOps.push_back(TmpLdVal);
+
+ SDValue CMask = DAG.getConstant(255, dl, MergedType);
+ SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
+ // Need to extend the i16 register to the whole width.
+ TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
+ // Mask off the high bits. Leave only the lower 8 bits.
+ // Do this because we are using loadparam.b8.
+ TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
+ // Shift and merge
+ TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
+ RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
+ }
+ if (ElementType != MergedType)
+ RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
+
+ return RetVal;
+}
+
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
@@ -1680,17 +1781,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (NeedAlign)
PartAlign = commonAlignment(ArgAlign, CurOffset);
- // New store.
- if (VectorInfo[j] & PVF_FIRST) {
- assert(StoreOperands.empty() && "Unfinished preceding store.");
- StoreOperands.push_back(Chain);
- StoreOperands.push_back(
- DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
- StoreOperands.push_back(DAG.getConstant(
- IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
- dl, MVT::i32));
- }
-
SDValue StVal = OutVals[OIdx];
MVT PromotedVT;
@@ -1723,6 +1813,35 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
}
+ // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
+ // scalar store. In such cases, fall back to byte stores.
+ if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
+ PartAlign.value() <
+ DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
+ assert(StoreOperands.empty() && "Unfinished preceding store.");
+ Chain = LowerUnalignedStoreParam(
+ DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
+ StVal, InGlue, ParamCount, dl);
+
+ // LowerUnalignedStoreParam took care of inserting the necessary nodes
+ // into the SDAG, so just move on to the next element.
+ if (!IsByVal)
+ ++OIdx;
+ continue;
+ }
+
+ // New store.
+ if (VectorInfo[j] & PVF_FIRST) {
+ assert(StoreOperands.empty() && "Unfinished preceding store.");
+ StoreOperands.push_back(Chain);
+ StoreOperands.push_back(
+ DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
+
+ StoreOperands.push_back(DAG.getConstant(
+ IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
+ dl, MVT::i32));
+ }
+
// Record the value to store.
StoreOperands.push_back(StVal);
@@ -1923,6 +2042,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 16> ProxyRegOps;
SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
+ // An item of the vector is filled if the element does not need a ProxyReg
+ // operation on it and should be added to InVals as is. ProxyRegOps and
+ // ProxyRegTruncates contain empty/none items at the same index.
+ SmallVector<SDValue, 16> RetElts;
+ // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
+ // to use the values of `LoadParam`s; they are replaced later, when
+ // `CALLSEQ_END` is added.
+ SmallVector<SDValue, 16> TempProxyRegOps;
// Generate loads from param memory/moves from registers for result
if (Ins.size() > 0) {
@@ -1966,6 +2093,22 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
EltType = MVT::i16;
}
+ // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
+ // scalar load. In such cases, fall back to byte loads.
+ if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
+ EltAlign < DL.getABITypeAlign(
+ TheLoadType.getTypeForEVT(*DAG.getContext()))) {
+ assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
+ SDValue Ret = LowerUnalignedLoadRetParam(
+ DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
+ ProxyRegOps.push_back(SDValue());
+ ProxyRegTruncates.push_back(std::optional<MVT>());
+ RetElts.resize(i);
+ RetElts.push_back(Ret);
+
+ continue;
+ }
+
// Record index of the very first element of the vector.
if (VectorInfo[i] & PVF_FIRST) {
assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
@@ -2028,6 +2171,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// will not get lost. Otherwise, during libcalls expansion, the nodes can become
// dangling.
for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
+ if (i < RetElts.size() && RetElts[i]) {
+ InVals.push_back(RetElts[i]);
+ continue;
+ }
+
SDValue Ret = DAG.getNode(
NVPTXISD::ProxyReg, dl,
DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
@@ -2044,6 +2192,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InVals.push_back(Ret);
}
+ for (SDValue &T : TempProxyRegOps) {
+ SDValue Repl = DAG.getNode(
+ NVPTXISD::ProxyReg, dl,
+ DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
+ {Chain, T.getOperand(0), InGlue});
+ DAG.ReplaceAllUsesWith(T, Repl);
+ DAG.RemoveDeadNode(T.getNode());
+
+ Chain = Repl.getValue(1);
+ InGlue = Repl.getValue(2);
+ }
+
// set isTailCall to false for now, until we figure out how to express
// tail call optimization in PTX
isTailCall = false;
@@ -3045,9 +3205,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
Value *srcValue = Constant::getNullValue(PointerType::get(
EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
+
+ const MaybeAlign PartAlign = [&]() -> MaybeAlign {
+ if (aggregateIsPacked)
+ return Align(1);
+ if (NumElts != 1)
+ return std::nullopt;
+ Align PartAlign =
+ (Offsets[parti] == 0 && PAL.getParamAlignment(i))
+ ? PAL.getParamAlignment(i).value()
+ : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
+ return commonAlignment(PartAlign, Offsets[parti]);
+ }();
SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
- MachinePointerInfo(srcValue),
- MaybeAlign(aggregateIsPacked ? 1 : 0),
+ MachinePointerInfo(srcValue), PartAlign,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
if (P.getNode())
@@ -3113,6 +3284,33 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
return Chain;
}
+// Use byte-store when the param address of the return value is unaligned.
+// This may happen when the return value is a field of a packed structure.
+static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
+ uint64_t Offset, EVT ElementType,
+ SDValue RetVal, const SDLoc &dl) {
+ // Bit logic only works on integer types
+ if (adjustElementType(ElementType))
+ RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
+
+ // Store each byte
+ for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+ // Shift the byte to the last byte position
+ SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
+ DAG.getConstant(i * 8, dl, MVT::i32));
+ SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
+ ShiftVal};
+ // Truncating store of only the last byte, using st.param.b8.
+ // The register type can be larger than b8.
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+ DAG.getVTList(MVT::Other), StoreOperands,
+ MVT::i8, MachinePointerInfo(), std::nullopt,
+ MachineMemOperand::MOStore);
+ }
+ return Chain;
+}
+
SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -3162,13 +3360,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<SDValue, 6> StoreOperands;
for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
- // New load/store. Record chain and offset operands.
- if (VectorInfo[i] & PVF_FIRST) {
- assert(StoreOperands.empty() && "Orphaned operand list.");
- StoreOperands.push_back(Chain);
- StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
- }
-
SDValue OutVal = OutVals[i];
SDValue RetVal = PromotedOutVals[i];
@@ -3182,6 +3373,32 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
}
+ // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
+ // for a scalar store. In such cases, fall back to byte stores.
+ if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
+ EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
+ Align ElementTypeAlign =
+ DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
+ Align ElementAlign =
+ commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
+ if (ElementAlign < ElementTypeAlign) {
+ assert(StoreOperands.empty() && "Orphaned operand list.");
+ Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
+ RetVal, dl);
+
+ // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
+ // into the graph, so just move on to the next element.
+ continue;
+ }
+ }
+
+ // New load/store. Record chain and offset operands.
+ if (VectorInfo[i] & PVF_FIRST) {
+ assert(StoreOperands.empty() && "Orphaned operand list.");
+ StoreOperands.push_back(Chain);
+ StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
+ }
+
// Record the value to return.
StoreOperands.push_back(RetVal);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 55a1955..b3517ce 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2738,6 +2738,8 @@ def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">;
def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">;
def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">;
+def StoreParamI8TruncI32 : StoreParamInst<Int32Regs, ".b8">;
+def StoreParamI8TruncI64 : StoreParamInst<Int64Regs, ".b8">;
def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">;
def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
@@ -2757,6 +2759,8 @@ def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">;
+def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">;
+def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">;
def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 904f1d7..c922098 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2062,8 +2062,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering();
MVT SubVecContainerVT = SubVecVT;
// Establish the correct scalable-vector types for any fixed-length type.
- if (SubVecVT.isFixedLengthVector())
+ if (SubVecVT.isFixedLengthVector()) {
+ assert(Idx == 0 && V.isUndef());
SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
+ }
if (VT.isFixedLengthVector())
VT = TLI.getContainerForFixedLengthVector(VT);
@@ -2115,8 +2117,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering();
MVT SubVecContainerVT = VT;
// Establish the correct scalable-vector types for any fixed-length type.
- if (VT.isFixedLengthVector())
+ if (VT.isFixedLengthVector()) {
+ assert(Idx == 0);
SubVecContainerVT = TLI.getContainerForFixedLengthVector(VT);
+ }
if (InVT.isFixedLengthVector())
InVT = TLI.getContainerForFixedLengthVector(InVT);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f7275eb..540c2e7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -691,7 +691,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_FP_TO_UINT, ISD::VP_SETCC, ISD::VP_SIGN_EXTEND,
ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE, ISD::VP_SMIN,
ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX,
- ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE};
+ ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE,
+ ISD::VP_SADDSAT, ISD::VP_UADDSAT, ISD::VP_SSUBSAT,
+ ISD::VP_USUBSAT};
static const unsigned FloatingPointVPOps[] = {
ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL,
@@ -830,7 +832,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
VT, Custom);
setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT,
Custom);
- setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom);
setOperationAction({ISD::AVGFLOORU, ISD::AVGCEILU, ISD::SADDSAT,
ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT},
VT, Legal);
@@ -956,6 +957,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// between vXf16 and vXf64 must be lowered as sequences which convert via
// vXf32.
setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
+ setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom);
// Custom-lower insert/extract operations to simplify patterns.
setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT,
Custom);
@@ -3240,45 +3242,49 @@ static std::optional<uint64_t> getExactInteger(const APFloat &APF,
// Note that this method will also match potentially unappealing index
// sequences, like <i32 0, i32 50939494>, however it is left to the caller to
// determine whether this is worth generating code for.
-static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
- unsigned NumElts = Op.getNumOperands();
+static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
+ unsigned EltSizeInBits) {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR");
+ if (!cast<BuildVectorSDNode>(Op)->isConstant())
+ return std::nullopt;
bool IsInteger = Op.getValueType().isInteger();
std::optional<unsigned> SeqStepDenom;
std::optional<int64_t> SeqStepNum, SeqAddend;
std::optional<std::pair<uint64_t, unsigned>> PrevElt;
- unsigned EltSizeInBits = Op.getValueType().getScalarSizeInBits();
- for (unsigned Idx = 0; Idx < NumElts; Idx++) {
- // Assume undef elements match the sequence; we just have to be careful
- // when interpolating across them.
- if (Op.getOperand(Idx).isUndef())
+ assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits());
+
+ // First extract the ops into a list of constant integer values. This may not
+ // be possible for floats if they're not all representable as integers.
+ SmallVector<std::optional<uint64_t>> Elts(Op.getNumOperands());
+ const unsigned OpSize = Op.getScalarValueSizeInBits();
+ for (auto [Idx, Elt] : enumerate(Op->op_values())) {
+ if (Elt.isUndef()) {
+ Elts[Idx] = std::nullopt;
continue;
-
- uint64_t Val;
+ }
if (IsInteger) {
- // The BUILD_VECTOR must be all constants.
- if (!isa<ConstantSDNode>(Op.getOperand(Idx)))
- return std::nullopt;
- Val = Op.getConstantOperandVal(Idx) &
- maskTrailingOnes<uint64_t>(EltSizeInBits);
+ Elts[Idx] = Elt->getAsZExtVal() & maskTrailingOnes<uint64_t>(OpSize);
} else {
- // The BUILD_VECTOR must be all constants.
- if (!isa<ConstantFPSDNode>(Op.getOperand(Idx)))
- return std::nullopt;
- if (auto ExactInteger = getExactInteger(
- cast<ConstantFPSDNode>(Op.getOperand(Idx))->getValueAPF(),
- EltSizeInBits))
- Val = *ExactInteger;
- else
+ auto ExactInteger =
+ getExactInteger(cast<ConstantFPSDNode>(Elt)->getValueAPF(), OpSize);
+ if (!ExactInteger)
return std::nullopt;
+ Elts[Idx] = *ExactInteger;
}
+ }
+
+ for (auto [Idx, Elt] : enumerate(Elts)) {
+ // Assume undef elements match the sequence; we just have to be careful
+ // when interpolating across them.
+ if (!Elt)
+ continue;
if (PrevElt) {
// Calculate the step since the last non-undef element, and ensure
// it's consistent across the entire sequence.
unsigned IdxDiff = Idx - PrevElt->second;
- int64_t ValDiff = SignExtend64(Val - PrevElt->first, EltSizeInBits);
+ int64_t ValDiff = SignExtend64(*Elt - PrevElt->first, EltSizeInBits);
// A zero-value value difference means that we're somewhere in the middle
// of a fractional step, e.g. <0,0,0*,0,1,1,1,1>. Wait until we notice a
@@ -3308,8 +3314,8 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
}
// Record this non-undef element for later.
- if (!PrevElt || PrevElt->first != Val)
- PrevElt = std::make_pair(Val, Idx);
+ if (!PrevElt || PrevElt->first != *Elt)
+ PrevElt = std::make_pair(*Elt, Idx);
}
// We need to have logged a step for this to count as a legal index sequence.
@@ -3318,21 +3324,12 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
// Loop back through the sequence and validate elements we might have skipped
// while waiting for a valid step. While doing this, log any sequence addend.
- for (unsigned Idx = 0; Idx < NumElts; Idx++) {
- if (Op.getOperand(Idx).isUndef())
+ for (auto [Idx, Elt] : enumerate(Elts)) {
+ if (!Elt)
continue;
- uint64_t Val;
- if (IsInteger) {
- Val = Op.getConstantOperandVal(Idx) &
- maskTrailingOnes<uint64_t>(EltSizeInBits);
- } else {
- Val = *getExactInteger(
- cast<ConstantFPSDNode>(Op.getOperand(Idx))->getValueAPF(),
- EltSizeInBits);
- }
uint64_t ExpectedVal =
(int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom;
- int64_t Addend = SignExtend64(Val - ExpectedVal, EltSizeInBits);
+ int64_t Addend = SignExtend64(*Elt - ExpectedVal, EltSizeInBits);
if (!SeqAddend)
SeqAddend = Addend;
else if (Addend != SeqAddend)
@@ -3598,7 +3595,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
// Try and match index sequences, which we can lower to the vid instruction
// with optional modifications. An all-undef vector is matched by
// getSplatValue, above.
- if (auto SimpleVID = isSimpleVIDSequence(Op)) {
+ if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) {
int64_t StepNumerator = SimpleVID->StepNumerator;
unsigned StepDenominator = SimpleVID->StepDenominator;
int64_t Addend = SimpleVID->Addend;
@@ -3853,11 +3850,10 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// If we're compiling for an exact VLEN value, we can split our work per
// register in the register group.
- const unsigned MinVLen = Subtarget.getRealMinVLen();
- const unsigned MaxVLen = Subtarget.getRealMaxVLen();
- if (MinVLen == MaxVLen && VT.getSizeInBits().getKnownMinValue() > MinVLen) {
+ if (const auto VLen = Subtarget.getRealVLen();
+ VLen && VT.getSizeInBits().getKnownMinValue() > *VLen) {
MVT ElemVT = VT.getVectorElementType();
- unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+ unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
@@ -4768,9 +4764,8 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
// If we don't know exact data layout, not much we can do. If this
// is already m1 or smaller, no point in splitting further.
- const unsigned MinVLen = Subtarget.getRealMinVLen();
- const unsigned MaxVLen = Subtarget.getRealMaxVLen();
- if (MinVLen != MaxVLen || VT.getSizeInBits().getFixedValue() <= MinVLen)
+ const auto VLen = Subtarget.getRealVLen();
+ if (!VLen || VT.getSizeInBits().getFixedValue() <= *VLen)
return SDValue();
// Avoid picking up bitrotate patterns which we have a linear-in-lmul
@@ -4781,7 +4776,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
return SDValue();
MVT ElemVT = VT.getVectorElementType();
- unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+ unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
SmallVector<std::pair<int, SmallVector<int>>>
@@ -5759,6 +5754,10 @@ static unsigned getRISCVVLOp(SDValue Op) {
VP_CASE(SINT_TO_FP) // VP_SINT_TO_FP
VP_CASE(UINT_TO_FP) // VP_UINT_TO_FP
VP_CASE(BITREVERSE) // VP_BITREVERSE
+ VP_CASE(SADDSAT) // VP_SADDSAT
+ VP_CASE(UADDSAT) // VP_UADDSAT
+ VP_CASE(SSUBSAT) // VP_SSUBSAT
+ VP_CASE(USUBSAT) // VP_USUBSAT
VP_CASE(BSWAP) // VP_BSWAP
VP_CASE(CTLZ) // VP_CTLZ
VP_CASE(CTTZ) // VP_CTTZ
@@ -6798,6 +6797,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::VP_UDIV:
case ISD::VP_SREM:
case ISD::VP_UREM:
+ case ISD::VP_UADDSAT:
+ case ISD::VP_USUBSAT:
+ case ISD::VP_SADDSAT:
+ case ISD::VP_SSUBSAT:
return lowerVPOp(Op, DAG);
case ISD::VP_AND:
case ISD::VP_OR:
@@ -7384,6 +7387,26 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget))
return V;
+ // (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1)
+ // (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2)
+ if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV)) {
+ const APInt &TrueVal = TrueV->getAsAPIntVal();
+ const APInt &FalseVal = FalseV->getAsAPIntVal();
+ const int TrueValCost = RISCVMatInt::getIntMatCost(
+ TrueVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
+ const int FalseValCost = RISCVMatInt::getIntMatCost(
+ FalseVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
+ bool IsCZERO_NEZ = TrueValCost <= FalseValCost;
+ SDValue LHSVal = DAG.getConstant(
+ IsCZERO_NEZ ? FalseVal - TrueVal : TrueVal - FalseVal, DL, VT);
+ SDValue RHSVal =
+ DAG.getConstant(IsCZERO_NEZ ? TrueVal : FalseVal, DL, VT);
+ SDValue CMOV =
+ DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ,
+ DL, VT, LHSVal, CondV);
+ return DAG.getNode(ISD::ADD, DL, VT, CMOV, RHSVal);
+ }
+
// (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c))
// Unless we have the short forward branch optimization.
if (!Subtarget.hasConditionalMoveFusion())
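
The constant-constant select lowering added above leans on a simple identity: czero.nez produces 0 when the condition is non-zero and its first operand otherwise, so (czero.nez (c2 - c1), c) + c1 yields c1 for a true condition and c2 for a false one, and symmetrically for czero.eqz. A minimal standalone C++ sketch of that identity, purely illustrative and not part of the patch:

    #include <cassert>
    #include <cstdint>

    // Models of the conditional-zero primitives used by the lowering:
    //   czero.nez rd, rs1, rs2  ->  rd = (rs2 != 0) ? 0 : rs1
    //   czero.eqz rd, rs1, rs2  ->  rd = (rs2 == 0) ? 0 : rs1
    static int64_t czero_nez(int64_t X, int64_t Cond) { return Cond != 0 ? 0 : X; }
    static int64_t czero_eqz(int64_t X, int64_t Cond) { return Cond == 0 ? 0 : X; }

    int main() {
      for (int64_t Cond : {0, 1})
        for (int64_t C1 : {-7, 0, 42})
          for (int64_t C2 : {-1, 3, 1000}) {
            int64_t Expected = Cond ? C1 : C2;
            // (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1)
            assert(czero_nez(C2 - C1, Cond) + C1 == Expected);
            // (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2)
            assert(czero_eqz(C1 - C2, Cond) + C2 == Expected);
          }
      return 0;
    }

The materialization-cost comparison appears to decide only which of the two constants becomes the direct addend, with the other reached through the materialized difference.
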
@@ -8313,15 +8336,13 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
// constant index, we can always perform the extract in m1 (or
// smaller) as we can determine the register corresponding to
// the index in the register group.
- const unsigned MinVLen = Subtarget.getRealMinVLen();
- const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+ const auto VLen = Subtarget.getRealVLen();
if (auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
- IdxC && MinVLen == MaxVLen &&
- VecVT.getSizeInBits().getKnownMinValue() > MinVLen) {
+ IdxC && VLen && VecVT.getSizeInBits().getKnownMinValue() > *VLen) {
MVT M1VT = getLMUL1VT(ContainerVT);
unsigned OrigIdx = IdxC->getZExtValue();
EVT ElemVT = VecVT.getVectorElementType();
- unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+ unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
unsigned RemIdx = OrigIdx % ElemsPerVReg;
unsigned SubRegIdx = OrigIdx / ElemsPerVReg;
unsigned ExtractIdx =
@@ -9782,15 +9803,14 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
if (OrigIdx == 0)
return Op;
- const unsigned MinVLen = Subtarget.getRealMinVLen();
- const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+ const auto VLen = Subtarget.getRealVLen();
// If the subvector vector is a fixed-length type and we don't know VLEN
// exactly, we cannot use subregister manipulation to simplify the codegen; we
// don't know which register of a LMUL group contains the specific subvector
// as we only know the minimum register size. Therefore we must slide the
// vector group down the full amount.
- if (SubVecVT.isFixedLengthVector() && MinVLen != MaxVLen) {
+ if (SubVecVT.isFixedLengthVector() && !VLen) {
MVT ContainerVT = VecVT;
if (VecVT.isFixedLengthVector()) {
ContainerVT = getContainerForFixedLengthVector(VecVT);
@@ -9837,8 +9857,8 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
// and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if
// we have a fixed length subvector, we need to adjust the index by 1/vscale.
if (SubVecVT.isFixedLengthVector()) {
- assert(MinVLen == MaxVLen);
- unsigned Vscale = MinVLen / RISCV::RVVBitsPerBlock;
+ assert(VLen);
+ unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
auto Decompose =
RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
VecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI);
@@ -12872,6 +12892,7 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineSubOfBoolean(N, DAG))
return V;
+ EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// fold (sub 0, (setcc x, 0, setlt)) -> (sra x, xlen - 1)
@@ -12879,7 +12900,6 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
isNullConstant(N1.getOperand(1))) {
ISD::CondCode CCVal = cast<CondCodeSDNode>(N1.getOperand(2))->get();
if (CCVal == ISD::SETLT) {
- EVT VT = N->getValueType(0);
SDLoc DL(N);
unsigned ShAmt = N0.getValueSizeInBits() - 1;
return DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0),
@@ -12887,6 +12907,29 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ // sub (zext, zext) -> sext (sub (zext, zext))
+  // where the sum of the inner and outer extend widths matches the original
+  // zext width, and the inner zexts add at least one bit. (For profitability
+  // on RVV, we use a power of two for both the inner and outer extends.)
+ if (VT.isVector() && Subtarget.getTargetLowering()->isTypeLegal(VT) &&
+ N0.getOpcode() == N1.getOpcode() && N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.hasOneUse() && N1.hasOneUse()) {
+ SDValue Src0 = N0.getOperand(0);
+ SDValue Src1 = N1.getOperand(0);
+ EVT SrcVT = Src0.getValueType();
+ if (Subtarget.getTargetLowering()->isTypeLegal(SrcVT) &&
+ SrcVT == Src1.getValueType() && SrcVT.getScalarSizeInBits() >= 8 &&
+ SrcVT.getScalarSizeInBits() < VT.getScalarSizeInBits() / 2) {
+ LLVMContext &C = *DAG.getContext();
+ EVT ElemVT = VT.getVectorElementType().getHalfSizedIntegerVT(C);
+ EVT NarrowVT = EVT::getVectorVT(C, ElemVT, VT.getVectorElementCount());
+ Src0 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src0), NarrowVT, Src0);
+ Src1 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src1), NarrowVT, Src1);
+ return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT,
+ DAG.getNode(ISD::SUB, SDLoc(N), NarrowVT, Src0, Src1));
+ }
+ }
+
// fold (sub x, (select lhs, rhs, cc, 0, y)) ->
// (select lhs, rhs, cc, x, (sub x, y))
return combineSelectAndUse(N, N1, N0, DAG, /*AllOnes*/ false, Subtarget);
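
The narrowing fold above is sound because the difference of two zero-extended w-bit values always fits in a signed value of at most w+1 bits, so it can be computed at half the destination element width and then sign-extended. A quick exhaustive check of that property for 8-bit sources and a 64-bit destination (standalone, illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int A = 0; A < 256; ++A)
        for (int B = 0; B < 256; ++B) {
          // Original form: sub (zext i8 A to i64), (zext i8 B to i64).
          int64_t Wide = (int64_t)(uint64_t)A - (int64_t)(uint64_t)B;
          // Narrowed form: sext (sub (zext i8 A to i16), (zext i8 B to i16)) to i64.
          int16_t Narrow = (int16_t)((uint16_t)A - (uint16_t)B);
          assert((int64_t)Narrow == Wide);
        }
      return 0;
    }
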
@@ -15978,7 +16021,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (Index.getOpcode() == ISD::BUILD_VECTOR &&
MGN->getExtensionType() == ISD::NON_EXTLOAD && isTypeLegal(VT)) {
- if (std::optional<VIDSequence> SimpleVID = isSimpleVIDSequence(Index);
+      // The generated sequence will use XLenVT, not the type of Index. Tell
+      // isSimpleVIDSequence this so we avoid overflow.
+ if (std::optional<VIDSequence> SimpleVID =
+ isSimpleVIDSequence(Index, Subtarget.getXLen());
SimpleVID && SimpleVID->StepDenominator == 1) {
const int64_t StepNumerator = SimpleVID->StepNumerator;
const int64_t Addend = SimpleVID->Addend;
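
Passing the XLen here matters because the sign extension inside isSimpleVIDSequence happens at the supplied element width, and the same raw value decodes differently at different widths. A tiny illustration using LLVM's SignExtend64 helper (assumes LLVM's Support headers; not part of the patch):

    #include "llvm/Support/MathExtras.h"
    #include <cassert>

    int main() {
      // Interpreted as an 8-bit quantity, 0xC8 (200) is really -56;
      // interpreted at 64 bits it stays 200.
      assert(llvm::SignExtend64(0xC8, 8) == -56);
      assert(llvm::SignExtend64(0xC8, 64) == 200);
      return 0;
    }
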
diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
index ff21fe1..af864ba 100644
--- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
@@ -143,19 +143,35 @@ static bool isCompressedReg(Register Reg) {
// Return true if MI is a load for which there exists a compressed version.
static bool isCompressibleLoad(const MachineInstr &MI) {
const RISCVSubtarget &STI = MI.getMF()->getSubtarget<RISCVSubtarget>();
- const unsigned Opcode = MI.getOpcode();
- return Opcode == RISCV::LW || (!STI.is64Bit() && Opcode == RISCV::FLW) ||
- Opcode == RISCV::LD || Opcode == RISCV::FLD;
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case RISCV::LW:
+ case RISCV::LD:
+ return STI.hasStdExtCOrZca();
+ case RISCV::FLW:
+ return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce();
+ case RISCV::FLD:
+ return STI.hasStdExtCOrZcd();
+ }
}
// Return true if MI is a store for which there exists a compressed version.
static bool isCompressibleStore(const MachineInstr &MI) {
const RISCVSubtarget &STI = MI.getMF()->getSubtarget<RISCVSubtarget>();
- const unsigned Opcode = MI.getOpcode();
- return Opcode == RISCV::SW || (!STI.is64Bit() && Opcode == RISCV::FSW) ||
- Opcode == RISCV::SD || Opcode == RISCV::FSD;
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case RISCV::SW:
+ case RISCV::SD:
+ return STI.hasStdExtCOrZca();
+ case RISCV::FSW:
+ return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce();
+ case RISCV::FSD:
+ return STI.hasStdExtCOrZcd();
+ }
}
// Find a single register and/or large offset which, if compressible, would
@@ -324,8 +340,7 @@ bool RISCVMakeCompressibleOpt::runOnMachineFunction(MachineFunction &Fn) {
const RISCVInstrInfo &TII = *STI.getInstrInfo();
// This optimization only makes sense if compressed instructions are emitted.
- // FIXME: Support Zca, Zcf, Zcd granularity.
- if (!STI.hasStdExtC())
+ if (!STI.hasStdExtCOrZca())
return false;
for (MachineBasicBlock &MBB : Fn) {
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index d15cb61..0be681d 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -88,20 +88,25 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred,
let ReleaseAtCycles = noPredReleaseCycles;
}
+ // Define SchedVars
+ def nameMX # PredSchedVar
+ : SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>;
+ def nameMX # NoPredSchedVar
+ : SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX #"_NoPred")]>;
+ // Allow multiclass to refer to SchedVars -- need to have NAME prefix.
+ defvar PredSchedVar = !cast<SchedVar>(NAME # nameMX # PredSchedVar);
+ defvar NoPredSchedVar = !cast<SchedVar>(NAME # nameMX # NoPredSchedVar);
+
// Tie behavior to predicate
- def NAME # nameMX # "_Variant" : SchedWriteVariant<[
- SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>,
- SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX # "_NoPred")]>
- ]>;
+ def NAME # nameMX # "_Variant"
+ : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>;
def : SchedAlias<
!cast<SchedReadWrite>(nameMX),
!cast<SchedReadWrite>(NAME # nameMX # "_Variant")>;
if IsWorstCase then {
- def NAME # name # "_WorstCase_Variant" : SchedWriteVariant<[
- SchedVar<Pred, [!cast<SchedWriteRes>(NAME # nameMX # "_Pred")]>,
- SchedVar<NoSchedPred, [!cast<SchedWriteRes>(NAME # nameMX # "_NoPred")]>
- ]>;
+ def NAME # name # "_WorstCase_Variant"
+ : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>;
def : SchedAlias<
!cast<SchedReadWrite>(name # "_WorstCase"),
!cast<SchedReadWrite>(NAME # name # "_WorstCase_Variant")>;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 4b60d7a..9ebf278 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -143,6 +143,10 @@ public:
#include "RISCVGenSubtargetInfo.inc"
bool hasStdExtCOrZca() const { return HasStdExtC || HasStdExtZca; }
+ bool hasStdExtCOrZcd() const { return HasStdExtC || HasStdExtZcd; }
+ bool hasStdExtCOrZcfOrZce() const {
+ return HasStdExtC || HasStdExtZcf || HasStdExtZce;
+ }
bool hasStdExtZvl() const { return ZvlLen != 0; }
bool hasStdExtFOrZfinx() const { return HasStdExtF || HasStdExtZfinx; }
bool hasStdExtDOrZdinx() const { return HasStdExtD || HasStdExtZdinx; }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index adef40e..3e20e45 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -84,7 +84,7 @@ static cl::opt<bool> EnableRISCVDeadRegisterElimination(
static cl::opt<bool>
EnableSinkFold("riscv-enable-sink-fold",
cl::desc("Enable sinking and folding of instruction copies"),
- cl::init(false), cl::Hidden);
+ cl::init(true), cl::Hidden);
static cl::opt<bool>
EnableLoopDataPrefetch("riscv-enable-loop-data-prefetch", cl::Hidden,
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index e6e3560..28a63b9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -619,7 +619,8 @@ class GroupBuiltin<string name, Op operation> {
!eq(operation, OpGroupNonUniformShuffleDown),
!eq(operation, OpGroupBroadcast),
!eq(operation, OpGroupNonUniformBroadcast),
- !eq(operation, OpGroupNonUniformBroadcastFirst));
+ !eq(operation, OpGroupNonUniformBroadcastFirst),
+ !eq(operation, OpGroupNonUniformRotateKHR));
bit HasBoolArg = !or(!and(IsAllOrAny, !eq(IsAllEqual, false)), IsBallot, IsLogical);
}
@@ -877,6 +878,10 @@ defm : DemangledGroupBuiltin<"group_non_uniform_scan_inclusive_logical_xors", Wo
defm : DemangledGroupBuiltin<"group_non_uniform_scan_exclusive_logical_xors", WorkOrSub, OpGroupNonUniformLogicalXor>;
defm : DemangledGroupBuiltin<"group_clustered_reduce_logical_xor", WorkOrSub, OpGroupNonUniformLogicalXor>;
+// cl_khr_subgroup_rotate / SPV_KHR_subgroup_rotate
+defm : DemangledGroupBuiltin<"group_rotate", OnlySub, OpGroupNonUniformRotateKHR>;
+defm : DemangledGroupBuiltin<"group_clustered_rotate", OnlySub, OpGroupNonUniformRotateKHR>;
+
// cl_khr_work_group_uniform_arithmetic / SPV_KHR_uniform_group_instructions
defm : DemangledGroupBuiltin<"group_reduce_imul", OnlyWork, OpGroupIMulKHR>;
defm : DemangledGroupBuiltin<"group_reduce_mulu", OnlyWork, OpGroupIMulKHR>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index cc438b2..10569ef 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -150,7 +150,8 @@ getKernelArgTypeQual(const Function &F, unsigned ArgIdx) {
static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
SPIRVGlobalRegistry *GR,
- MachineIRBuilder &MIRBuilder) {
+ MachineIRBuilder &MIRBuilder,
+ const SPIRVSubtarget &ST) {
// Read argument's access qualifier from metadata or default.
SPIRV::AccessQualifier::AccessQualifier ArgAccessQual =
getArgAccessQual(F, ArgIdx);
@@ -169,8 +170,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
if (MDTypeStr.ends_with("*"))
ResArgType = GR->getOrCreateSPIRVTypeByName(
MDTypeStr, MIRBuilder,
- addressSpaceToStorageClass(
- OriginalArgType->getPointerAddressSpace()));
+ addressSpaceToStorageClass(OriginalArgType->getPointerAddressSpace(),
+ ST));
else if (MDTypeStr.ends_with("_t"))
ResArgType = GR->getOrCreateSPIRVTypeByName(
"opencl." + MDTypeStr.str(), MIRBuilder,
@@ -206,6 +207,10 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
assert(GR && "Must initialize the SPIRV type registry before lowering args.");
GR->setCurrentFunc(MIRBuilder.getMF());
+ // Get access to information about available extensions
+ const SPIRVSubtarget *ST =
+ static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
+
// Assign types and names to all args, and store their types for later.
FunctionType *FTy = getOriginalFunctionType(F);
SmallVector<SPIRVType *, 4> ArgTypeVRegs;
@@ -216,7 +221,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
// TODO: handle the case of multiple registers.
if (VRegs[i].size() > 1)
return false;
- auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder);
+ auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder, *ST);
GR->assignSPIRVTypeToVReg(SpirvTy, VRegs[i][0], MIRBuilder.getMF());
ArgTypeVRegs.push_back(SpirvTy);
@@ -318,10 +323,6 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (F.hasName())
buildOpName(FuncVReg, F.getName(), MIRBuilder);
- // Get access to information about available extensions
- const auto *ST =
- static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
-
// Handle entry points and function linkage.
if (isEntryPoint(F)) {
const auto &STI = MIRBuilder.getMF().getSubtarget<SPIRVSubtarget>();
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 47fec74..a1cb630 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -709,7 +709,10 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(
// TODO: change the implementation once opaque pointers are supported
// in the SPIR-V specification.
SpvElementType = getOrCreateSPIRVIntegerType(8, MIRBuilder);
- auto SC = addressSpaceToStorageClass(PType->getAddressSpace());
+ // Get access to information about available extensions
+ const SPIRVSubtarget *ST =
+ static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
+ auto SC = addressSpaceToStorageClass(PType->getAddressSpace(), *ST);
// Null pointer means we have a loop in type definitions, make and
// return corresponding OpTypeForwardPointer.
if (SpvElementType == nullptr) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
index f317b26..d34f802 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
@@ -31,6 +31,9 @@ public:
return true;
}
+  // Prevent creation of jump tables.
+ bool areJTsAllowed(const Function *) const override { return false; }
+
// This is to prevent sexts of non-i64 vector indices which are generated
// within general IRTranslator hence type generation for it is omitted.
MVT getVectorIdxTy(const DataLayout &DL) const override {
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index 0f11bc3..7c5252e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -430,6 +430,10 @@ def OpGenericCastToPtrExplicit : Op<123, (outs ID:$r), (ins TYPE:$t, ID:$p, Stor
"$r = OpGenericCastToPtrExplicit $t $p $s">;
def OpBitcast : UnOp<"OpBitcast", 124>;
+// SPV_INTEL_usm_storage_classes
+def OpPtrCastToCrossWorkgroupINTEL : UnOp<"OpPtrCastToCrossWorkgroupINTEL", 5934>;
+def OpCrossWorkgroupCastToPtrINTEL : UnOp<"OpCrossWorkgroupCastToPtrINTEL", 5938>;
+
// 3.42.12 Composite Instructions
def OpVectorExtractDynamic: Op<77, (outs ID:$res), (ins TYPE:$type, vID:$vec, ID:$idx),
@@ -765,6 +769,11 @@ def OpGroupNonUniformLogicalAnd: OpGroupNUGroup<"LogicalAnd", 362>;
def OpGroupNonUniformLogicalOr: OpGroupNUGroup<"LogicalOr", 363>;
def OpGroupNonUniformLogicalXor: OpGroupNUGroup<"LogicalXor", 364>;
+// SPV_KHR_subgroup_rotate
+def OpGroupNonUniformRotateKHR: Op<4431, (outs ID:$res),
+ (ins TYPE:$type, ID:$scope, ID:$value, ID:$delta, variable_ops),
+ "$res = OpGroupNonUniformRotateKHR $type $scope $value $delta">;
+
// 3.49.7, Constant-Creation Instructions
// - SPV_INTEL_function_pointers
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 53d19a1..7258d3b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -828,8 +828,18 @@ static bool isGenericCastablePtr(SPIRV::StorageClass::StorageClass SC) {
}
}
+static bool isUSMStorageClass(SPIRV::StorageClass::StorageClass SC) {
+ switch (SC) {
+ case SPIRV::StorageClass::DeviceOnlyINTEL:
+ case SPIRV::StorageClass::HostOnlyINTEL:
+ return true;
+ default:
+ return false;
+ }
+}
+
// In SPIR-V address space casting can only happen to and from the Generic
-// storage class. We can also only case Workgroup, CrossWorkgroup, or Function
+// storage class. We can also only cast Workgroup, CrossWorkgroup, or Function
// pointers to and from Generic pointers. As such, we can convert e.g. from
// Workgroup to Function by going via a Generic pointer as an intermediary. All
// other combinations can only be done by a bitcast, and are probably not safe.
@@ -862,13 +872,17 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg,
SPIRV::StorageClass::StorageClass SrcSC = GR.getPointerStorageClass(SrcPtr);
SPIRV::StorageClass::StorageClass DstSC = GR.getPointerStorageClass(ResVReg);
- // Casting from an eligable pointer to Generic.
+  // Don't generate a cast between identical storage classes.
+ if (SrcSC == DstSC)
+ return true;
+
+ // Casting from an eligible pointer to Generic.
if (DstSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(SrcSC))
return selectUnOp(ResVReg, ResType, I, SPIRV::OpPtrCastToGeneric);
- // Casting from Generic to an eligable pointer.
+ // Casting from Generic to an eligible pointer.
if (SrcSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(DstSC))
return selectUnOp(ResVReg, ResType, I, SPIRV::OpGenericCastToPtr);
- // Casting between 2 eligable pointers using Generic as an intermediary.
+ // Casting between 2 eligible pointers using Generic as an intermediary.
if (isGenericCastablePtr(SrcSC) && isGenericCastablePtr(DstSC)) {
Register Tmp = MRI->createVirtualRegister(&SPIRV::IDRegClass);
SPIRVType *GenericPtrTy = GR.getOrCreateSPIRVPointerType(
@@ -886,6 +900,16 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg,
.addUse(Tmp)
.constrainAllUses(TII, TRI, RBI);
}
+
+ // Check if instructions from the SPV_INTEL_usm_storage_classes extension may
+ // be applied
+ if (isUSMStorageClass(SrcSC) && DstSC == SPIRV::StorageClass::CrossWorkgroup)
+ return selectUnOp(ResVReg, ResType, I,
+ SPIRV::OpPtrCastToCrossWorkgroupINTEL);
+ if (SrcSC == SPIRV::StorageClass::CrossWorkgroup && isUSMStorageClass(DstSC))
+ return selectUnOp(ResVReg, ResType, I,
+ SPIRV::OpCrossWorkgroupCastToPtrINTEL);
+
// TODO Should this case just be disallowed completely?
// We're casting 2 other arbitrary address spaces, so have to bitcast.
return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitcast);
@@ -1545,7 +1569,7 @@ bool SPIRVInstructionSelector::selectGlobalValue(
}
SPIRVType *ResType = GR.getOrCreateSPIRVPointerType(
PointerBaseType, I, TII,
- addressSpaceToStorageClass(GV->getAddressSpace()));
+ addressSpaceToStorageClass(GV->getAddressSpace(), STI));
std::string GlobalIdent;
if (!GV->hasName()) {
@@ -1618,7 +1642,7 @@ bool SPIRVInstructionSelector::selectGlobalValue(
unsigned AddrSpace = GV->getAddressSpace();
SPIRV::StorageClass::StorageClass Storage =
- addressSpaceToStorageClass(AddrSpace);
+ addressSpaceToStorageClass(AddrSpace, STI);
bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage &&
Storage != SPIRV::StorageClass::Function;
SPIRV::LinkageType::LinkageType LnkType =
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 011a550..4f2e7a2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -102,14 +102,16 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
const LLT p2 = LLT::pointer(2, PSize); // UniformConstant
const LLT p3 = LLT::pointer(3, PSize); // Workgroup
const LLT p4 = LLT::pointer(4, PSize); // Generic
- const LLT p5 = LLT::pointer(5, PSize); // Input
+ const LLT p5 =
+ LLT::pointer(5, PSize); // Input, SPV_INTEL_usm_storage_classes (Device)
+ const LLT p6 = LLT::pointer(6, PSize); // SPV_INTEL_usm_storage_classes (Host)
// TODO: remove copy-pasting here by using concatenation in some way.
auto allPtrsScalarsAndVectors = {
- p0, p1, p2, p3, p4, p5, s1, s8, s16,
- s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8,
- v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1,
- v8s8, v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64};
+ p0, p1, p2, p3, p4, p5, p6, s1, s8, s16,
+ s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, v3s16,
+ v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, v8s8, v8s16,
+ v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64};
auto allScalarsAndVectors = {
s1, s8, s16, s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64,
@@ -133,8 +135,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
auto allFloatAndIntScalars = allIntScalars;
- auto allPtrs = {p0, p1, p2, p3, p4, p5};
- auto allWritablePtrs = {p0, p1, p3, p4};
+ auto allPtrs = {p0, p1, p2, p3, p4, p5, p6};
+ auto allWritablePtrs = {p0, p1, p3, p4, p5, p6};
for (auto Opc : TypeFoldingSupportingOpcs)
getActionDefinitionsBuilder(Opc).custom();
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index dbda287..3be28c9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1063,12 +1063,28 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(SPIRV::Capability::ExpectAssumeKHR);
}
break;
+ case SPIRV::OpPtrCastToCrossWorkgroupINTEL:
+ case SPIRV::OpCrossWorkgroupCastToPtrINTEL:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes);
+ Reqs.addCapability(SPIRV::Capability::USMStorageClassesINTEL);
+ }
+ break;
case SPIRV::OpConstantFunctionPointerINTEL:
if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers)) {
Reqs.addExtension(SPIRV::Extension::SPV_INTEL_function_pointers);
Reqs.addCapability(SPIRV::Capability::FunctionPointersINTEL);
}
break;
+ case SPIRV::OpGroupNonUniformRotateKHR:
+ if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate))
+ report_fatal_error("OpGroupNonUniformRotateKHR instruction requires the "
+ "following SPIR-V extension: SPV_KHR_subgroup_rotate",
+ false);
+ Reqs.addExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate);
+ Reqs.addCapability(SPIRV::Capability::GroupNonUniformRotateKHR);
+ Reqs.addCapability(SPIRV::Capability::GroupNonUniform);
+ break;
case SPIRV::OpGroupIMulKHR:
case SPIRV::OpGroupFMulKHR:
case SPIRV::OpGroupBitwiseAndKHR:
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index cbc16fa..1442168 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -122,6 +122,9 @@ static void foldConstantsIntoIntrinsics(MachineFunction &MF) {
static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
MachineIRBuilder MIB) {
+ // Get access to information about available extensions
+ const SPIRVSubtarget *ST =
+ static_cast<const SPIRVSubtarget *>(&MIB.getMF().getSubtarget());
SmallVector<MachineInstr *, 10> ToErase;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
@@ -141,7 +144,7 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
getMDOperandAsType(MI.getOperand(3).getMetadata(), 0), MIB);
SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType(
BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(),
- addressSpaceToStorageClass(MI.getOperand(4).getImm()));
+ addressSpaceToStorageClass(MI.getOperand(4).getImm(), *ST));
// If the bitcast would be redundant, replace all uses with the source
// register.
@@ -250,6 +253,10 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy,
static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
MachineIRBuilder MIB) {
+ // Get access to information about available extensions
+ const SPIRVSubtarget *ST =
+ static_cast<const SPIRVSubtarget *>(&MIB.getMF().getSubtarget());
+
MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<MachineInstr *, 10> ToErase;
@@ -269,7 +276,7 @@ static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
getMDOperandAsType(MI.getOperand(2).getMetadata(), 0), MIB);
SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType(
BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(),
- addressSpaceToStorageClass(MI.getOperand(3).getImm()));
+ addressSpaceToStorageClass(MI.getOperand(3).getImm(), *ST));
MachineInstr *Def = MRI.getVRegDef(Reg);
assert(Def && "Expecting an instruction that defines the register");
insertAssignInstr(Reg, nullptr, AssignedPtrType, GR, MIB,
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index e186154..79f1614 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -49,6 +49,12 @@ cl::list<SPIRV::Extension::Extension> Extensions(
clEnumValN(SPIRV::Extension::SPV_INTEL_optnone, "SPV_INTEL_optnone",
"Adds OptNoneINTEL value for Function Control mask that "
"indicates a request to not optimize the function."),
+ clEnumValN(SPIRV::Extension::SPV_INTEL_usm_storage_classes,
+ "SPV_INTEL_usm_storage_classes",
+ "Introduces two new storage classes that are sub classes of "
+ "the CrossWorkgroup storage class "
+ "that provides additional information that can enable "
+ "optimization."),
clEnumValN(SPIRV::Extension::SPV_INTEL_subgroups, "SPV_INTEL_subgroups",
"Allows work items in a subgroup to share data without the "
"use of local memory and work group barriers, and to "
@@ -75,6 +81,10 @@ cl::list<SPIRV::Extension::Extension> Extensions(
"Allows to use the LinkOnceODR linkage type that is to let "
"a function or global variable to be merged with other functions "
"or global variables of the same name when linkage occurs."),
+ clEnumValN(SPIRV::Extension::SPV_KHR_subgroup_rotate,
+ "SPV_KHR_subgroup_rotate",
+ "Adds a new instruction that enables rotating values across "
+ "invocations within a subgroup."),
clEnumValN(SPIRV::Extension::SPV_INTEL_function_pointers,
"SPV_INTEL_function_pointers",
"Allows translation of function pointers.")));
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 4e5ac0d..b022b97 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -455,6 +455,7 @@ defm BitInstructions : CapabilityOperand<6025, 0, 0, [SPV_KHR_bit_instructions],
defm ExpectAssumeKHR : CapabilityOperand<5629, 0, 0, [SPV_KHR_expect_assume], []>;
defm FunctionPointersINTEL : CapabilityOperand<5603, 0, 0, [SPV_INTEL_function_pointers], []>;
defm IndirectReferencesINTEL : CapabilityOperand<5604, 0, 0, [SPV_INTEL_function_pointers], []>;
+defm GroupNonUniformRotateKHR : CapabilityOperand<6026, 0, 0, [SPV_KHR_subgroup_rotate], [GroupNonUniform]>;
defm AtomicFloat32AddEXT : CapabilityOperand<6033, 0, 0, [SPV_EXT_shader_atomic_float_add], []>;
defm AtomicFloat64AddEXT : CapabilityOperand<6034, 0, 0, [SPV_EXT_shader_atomic_float_add], []>;
defm AtomicFloat16AddEXT : CapabilityOperand<6095, 0, 0, [SPV_EXT_shader_atomic_float16_add], []>;
@@ -462,6 +463,7 @@ defm AtomicFloat16MinMaxEXT : CapabilityOperand<5616, 0, 0, [SPV_EXT_shader_atom
defm AtomicFloat32MinMaxEXT : CapabilityOperand<5612, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>;
defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>;
defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>;
+defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define SourceLanguage enum values and at the same time
@@ -699,6 +701,8 @@ defm IncomingRayPayloadNV : StorageClassOperand<5342, [RayTracingNV]>;
defm ShaderRecordBufferNV : StorageClassOperand<5343, [RayTracingNV]>;
defm PhysicalStorageBufferEXT : StorageClassOperand<5349, [PhysicalStorageBufferAddressesEXT]>;
defm CodeSectionINTEL : StorageClassOperand<5605, [FunctionPointersINTEL]>;
+defm DeviceOnlyINTEL : StorageClassOperand<5936, [USMStorageClassesINTEL]>;
+defm HostOnlyINTEL : StorageClassOperand<5937, [USMStorageClassesINTEL]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Dim enum values and at the same time
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 05f766d..169d7cc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -14,6 +14,7 @@
#include "MCTargetDesc/SPIRVBaseInfo.h"
#include "SPIRV.h"
#include "SPIRVInstrInfo.h"
+#include "SPIRVSubtarget.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -146,15 +147,19 @@ unsigned storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC) {
return 3;
case SPIRV::StorageClass::Generic:
return 4;
+ case SPIRV::StorageClass::DeviceOnlyINTEL:
+ return 5;
+ case SPIRV::StorageClass::HostOnlyINTEL:
+ return 6;
case SPIRV::StorageClass::Input:
return 7;
default:
- llvm_unreachable("Unable to get address space id");
+ report_fatal_error("Unable to get address space id");
}
}
SPIRV::StorageClass::StorageClass
-addressSpaceToStorageClass(unsigned AddrSpace) {
+addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI) {
switch (AddrSpace) {
case 0:
return SPIRV::StorageClass::Function;
@@ -166,10 +171,18 @@ addressSpaceToStorageClass(unsigned AddrSpace) {
return SPIRV::StorageClass::Workgroup;
case 4:
return SPIRV::StorageClass::Generic;
+ case 5:
+ return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)
+ ? SPIRV::StorageClass::DeviceOnlyINTEL
+ : SPIRV::StorageClass::CrossWorkgroup;
+ case 6:
+ return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)
+ ? SPIRV::StorageClass::HostOnlyINTEL
+ : SPIRV::StorageClass::CrossWorkgroup;
case 7:
return SPIRV::StorageClass::Input;
default:
- llvm_unreachable("Unknown address space");
+ report_fatal_error("Unknown address space");
}
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index a33dc02..1af53dc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -27,6 +27,7 @@ class MachineRegisterInfo;
class Register;
class StringRef;
class SPIRVInstrInfo;
+class SPIRVSubtarget;
// Add the given string as a series of integer operand, inserting null
// terminators and padding to make sure the operands all have 32-bit
@@ -62,7 +63,7 @@ unsigned storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC);
// Convert an LLVM IR address space to a SPIR-V storage class.
SPIRV::StorageClass::StorageClass
-addressSpaceToStorageClass(unsigned AddrSpace);
+addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI);
SPIRV::MemorySemantics::MemorySemantics
getMemSemanticsForStorageClass(SPIRV::StorageClass::StorageClass SC);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 7c47790..36f0679 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -43,6 +43,8 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-lower"
+extern cl::opt<bool> WasmEmitMultiValue;
+
WebAssemblyTargetLowering::WebAssemblyTargetLowering(
const TargetMachine &TM, const WebAssemblySubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -1288,7 +1290,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn(
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext & /*Context*/) const {
// WebAssembly can only handle returning tuples with multivalue enabled
- return Subtarget->hasMultivalue() || Outs.size() <= 1;
+ return (Subtarget->hasMultivalue() && WasmEmitMultiValue) || Outs.size() <= 1;
}
SDValue WebAssemblyTargetLowering::LowerReturn(
@@ -1296,7 +1298,8 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const {
- assert((Subtarget->hasMultivalue() || Outs.size() <= 1) &&
+ assert(((Subtarget->hasMultivalue() && WasmEmitMultiValue) ||
+ Outs.size() <= 1) &&
"MVP WebAssembly can only return up to one value");
if (!callingConvSupported(CallConv))
fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index 1e95911..b969b83 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -22,6 +22,8 @@
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
+extern cl::opt<bool> WasmEmitMultiValue;
+
WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor.
MachineFunctionInfo *WebAssemblyFunctionInfo::clone(
@@ -71,7 +73,8 @@ void llvm::computeSignatureVTs(const FunctionType *Ty,
MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits());
if (Results.size() > 1 &&
- !TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue()) {
+ (!TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue() ||
+ !WasmEmitMultiValue)) {
// WebAssembly can't lower returns of multiple values without demoting to
// sret unless multivalue is enabled (see
// WebAssemblyTargetLowering::CanLowerReturn). So replace multiple return
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 3e2e029..2a84c90 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -24,6 +24,8 @@
using namespace llvm;
+extern cl::opt<bool> WasmEmitMultiValue;
+
namespace {
enum RuntimeLibcallSignature {
@@ -694,7 +696,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(PtrTy);
break;
case i64_i64_func_f32:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -703,7 +705,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::F32);
break;
case i64_i64_func_f64:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -712,7 +714,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::F64);
break;
case i16_i16_func_i16_i16:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I32);
Rets.push_back(wasm::ValType::I32);
} else {
@@ -722,7 +724,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I32);
break;
case i32_i32_func_i32_i32:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I32);
Rets.push_back(wasm::ValType::I32);
} else {
@@ -732,7 +734,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I32);
break;
case i64_i64_func_i64_i64:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -742,7 +744,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i64_i64_i64_i64:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -754,7 +756,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i64_i64_i64_i64_iPTR:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -767,7 +769,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(PtrTy);
break;
case i64_i64_i64_i64_func_i64_i64_i64_i64:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
@@ -781,7 +783,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i64_i64_i32:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -851,7 +853,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i64_i64_i64_i64_i64_i64:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -865,7 +867,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i32:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -874,7 +876,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I32);
break;
case i64_i64_func_i64:
- if (Subtarget.hasMultivalue()) {
+ if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 42043a7..3120b6b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -54,6 +54,15 @@ static cl::opt<bool> WasmDisableFixIrreducibleControlFlowPass(
" irreducible control flow optimization pass"),
cl::init(false));
+// A temporary option to control emission of multivalue until the multivalue
+// implementation is stable enough. We currently don't emit multivalue by
+// default even if the feature section allows it.
+// TODO: Stabilize multivalue and delete this option.
+cl::opt<bool>
+ WasmEmitMultiValue("wasm-emit-multivalue", cl::Hidden,
+ cl::desc("WebAssembly: Emit multivalue in the backend"),
+ cl::init(false));
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
// Register the target.
RegisterTargetMachine<WebAssemblyTargetMachine> X(
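
With this flag in place, multivalue emission should require both the subtarget feature and the new option, so an invocation along the lines of llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -wasm-emit-multivalue foo.ll (an illustrative command line, assuming a build with the WebAssembly backend) would be needed to exercise the multivalue paths guarded above.
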
diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
index 4a11dd2..a620ba9 100644
--- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
+++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
@@ -47,10 +47,9 @@ Error X86CodeGenPassBuilder::addInstSelector(AddMachinePass &) const {
} // namespace
Error X86TargetMachine::buildCodeGenPipeline(
- ModulePassManager &MPM, MachineFunctionPassManager &MFPM,
- MachineFunctionAnalysisManager &, raw_pwrite_stream &Out,
- raw_pwrite_stream *DwoOut, CodeGenFileType FileType,
- CGPassBuilderOption Opt, PassInstrumentationCallbacks *PIC) {
+ ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
+ CodeGenFileType FileType, CGPassBuilderOption Opt,
+ PassInstrumentationCallbacks *PIC) {
auto CGPB = X86CodeGenPassBuilder(*this, Opt, PIC);
- return CGPB.buildPipeline(MPM, MFPM, Out, DwoOut, FileType);
+ return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h
index f31c971..0fd3e47 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/llvm/lib/Target/X86/X86TargetMachine.h
@@ -58,10 +58,9 @@ public:
createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
const TargetSubtargetInfo *STI) const override;
- Error buildCodeGenPipeline(ModulePassManager &, MachineFunctionPassManager &,
- MachineFunctionAnalysisManager &,
- raw_pwrite_stream &, raw_pwrite_stream *,
- CodeGenFileType, CGPassBuilderOption,
+ Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &,
+ raw_pwrite_stream *, CodeGenFileType,
+ CGPassBuilderOption,
PassInstrumentationCallbacks *) override;
bool isJIT() const { return IsJIT; }
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 4466d50..a4cc757 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1846,6 +1846,13 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["prefetchi"] = HasLeaf7Subleaf1 && ((EDX >> 14) & 1);
Features["usermsr"] = HasLeaf7Subleaf1 && ((EDX >> 15) & 1);
Features["avx10.1-256"] = HasLeaf7Subleaf1 && ((EDX >> 19) & 1);
+ bool HasAPXF = HasLeaf7Subleaf1 && ((EDX >> 21) & 1);
+ Features["egpr"] = HasAPXF;
+ Features["push2pop2"] = HasAPXF;
+ Features["ppx"] = HasAPXF;
+ Features["ndd"] = HasAPXF;
+ Features["ccmp"] = HasAPXF;
+ Features["cf"] = HasAPXF;
bool HasLeafD = MaxLevel >= 0xd &&
!getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
index fabb3c5f..5aefcbf 100644
--- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -215,15 +215,10 @@ bool HotColdSplitting::isFunctionCold(const Function &F) const {
return false;
}
-bool HotColdSplitting::isBasicBlockCold(BasicBlock *BB,
- BranchProbability ColdProbThresh,
- SmallPtrSetImpl<BasicBlock *> &ColdBlocks,
- SmallPtrSetImpl<BasicBlock *> &AnnotatedColdBlocks,
- BlockFrequencyInfo *BFI) const {
- // This block is already part of some outlining region.
- if (ColdBlocks.count(BB))
- return true;
-
+bool HotColdSplitting::isBasicBlockCold(
+ BasicBlock *BB, BranchProbability ColdProbThresh,
+ SmallPtrSetImpl<BasicBlock *> &AnnotatedColdBlocks,
+ BlockFrequencyInfo *BFI) const {
if (BFI) {
if (PSI->isColdBlock(BB, BFI))
return true;
@@ -372,18 +367,12 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
return Penalty;
}
-Function *HotColdSplitting::extractColdRegion(
- const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC,
- DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
- OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
+// Determine whether it is beneficial to split \p Region.
+bool HotColdSplitting::isSplittingBeneficial(CodeExtractor &CE,
+ const BlockSequence &Region,
+ TargetTransformInfo &TTI) {
assert(!Region.empty());
- // TODO: Pass BFI and BPI to update profile information.
- CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
- /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
- /* AllowAlloca */ false, /* AllocaBlock */ nullptr,
- /* Suffix */ "cold." + std::to_string(Count));
-
// Perform a simple cost/benefit analysis to decide whether or not to permit
// splitting.
SetVector<Value *> Inputs, Outputs, Sinks;
@@ -394,9 +383,18 @@ Function *HotColdSplitting::extractColdRegion(
LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
<< ", penalty = " << OutliningPenalty << "\n");
if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty)
- return nullptr;
+ return false;
- Function *OrigF = Region[0]->getParent();
+ return true;
+}
+
+// Split the single-entry cold region whose entry block is \p EntryPoint. \p CE
+// is the region's code extractor.
+Function *HotColdSplitting::extractColdRegion(
+ BasicBlock &EntryPoint, CodeExtractor &CE,
+ const CodeExtractorAnalysisCache &CEAC, BlockFrequencyInfo *BFI,
+ TargetTransformInfo &TTI, OptimizationRemarkEmitter &ORE) {
+ Function *OrigF = EntryPoint.getParent();
if (Function *OutF = CE.extractCodeRegion(CEAC)) {
User *U = *OutF->user_begin();
CallInst *CI = cast<CallInst>(U);
@@ -419,7 +417,7 @@ Function *HotColdSplitting::extractColdRegion(
LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
ORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
- &*Region[0]->begin())
+ &*EntryPoint.begin())
<< ore::NV("Original", OrigF) << " split cold code into "
<< ore::NV("Split", OutF);
});
@@ -428,9 +426,9 @@ Function *HotColdSplitting::extractColdRegion(
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
- &*Region[0]->begin())
+ &*EntryPoint.begin())
<< "Failed to extract region at block "
- << ore::NV("Block", Region.front());
+ << ore::NV("Block", &EntryPoint);
});
return nullptr;
}
@@ -620,16 +618,18 @@ public:
} // namespace
bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
- bool Changed = false;
-
- // The set of cold blocks.
+  // The set of cold blocks selected for outlining.
SmallPtrSet<BasicBlock *, 4> ColdBlocks;
+  // The set of cold blocks that cannot be outlined.
+ SmallPtrSet<BasicBlock *, 4> CannotBeOutlinedColdBlocks;
+
// Set of cold blocks obtained with RPOT.
SmallPtrSet<BasicBlock *, 4> AnnotatedColdBlocks;
- // The worklist of non-intersecting regions left to outline.
- SmallVector<OutliningRegion, 2> OutliningWorklist;
+ // The worklist of non-intersecting regions left to outline. The first member
+ // of the pair is the entry point into the region to be outlined.
+ SmallVector<std::pair<BasicBlock *, CodeExtractor>, 2> OutliningWorklist;
// Set up an RPO traversal. Experimentally, this performs better (outlines
// more) than a PO traversal, because we prevent region overlap by keeping
@@ -655,10 +655,18 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
if (ColdBranchProbDenom.getNumOccurrences())
ColdProbThresh = BranchProbability(1, ColdBranchProbDenom.getValue());
+ unsigned OutlinedFunctionID = 1;
// Find all cold regions.
for (BasicBlock *BB : RPOT) {
- if (!isBasicBlockCold(BB, ColdProbThresh, ColdBlocks, AnnotatedColdBlocks,
- BFI))
+ // This block is already part of some outlining region.
+ if (ColdBlocks.count(BB))
+ continue;
+
+    // This block is already part of some region that cannot be outlined.
+ if (CannotBeOutlinedColdBlocks.count(BB))
+ continue;
+
+ if (!isBasicBlockCold(BB, ColdProbThresh, AnnotatedColdBlocks, BFI))
continue;
LLVM_DEBUG({
@@ -681,50 +689,69 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
return markFunctionCold(F);
}
- // If this outlining region intersects with another, drop the new region.
- //
- // TODO: It's theoretically possible to outline more by only keeping the
- // largest region which contains a block, but the extra bookkeeping to do
- // this is tricky/expensive.
- bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
- return !ColdBlocks.insert(Block.first).second;
- });
- if (RegionsOverlap)
- continue;
+ do {
+ BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
+ LLVM_DEBUG({
+ dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
+ for (BasicBlock *BB : SubRegion)
+ BB->dump();
+ });
+
+ // TODO: Pass BFI and BPI to update profile information.
+ CodeExtractor CE(
+ SubRegion, &*DT, /* AggregateArgs */ false, /* BFI */ nullptr,
+ /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
+ /* AllowAlloca */ false, /* AllocaBlock */ nullptr,
+ /* Suffix */ "cold." + std::to_string(OutlinedFunctionID));
+
+ if (CE.isEligible() && isSplittingBeneficial(CE, SubRegion, TTI) &&
+ // If this outlining region intersects with another, drop the new
+ // region.
+ //
+ // TODO: It's theoretically possible to outline more by only keeping
+ // the largest region which contains a block, but the extra
+ // bookkeeping to do this is tricky/expensive.
+ none_of(SubRegion, [&](BasicBlock *Block) {
+ return ColdBlocks.contains(Block);
+ })) {
+ ColdBlocks.insert(SubRegion.begin(), SubRegion.end());
+
+ LLVM_DEBUG({
+ for (auto *Block : SubRegion)
+ dbgs() << " contains cold block:" << Block->getName() << "\n";
+ });
+
+ OutliningWorklist.emplace_back(
+ std::make_pair(SubRegion[0], std::move(CE)));
+ ++OutlinedFunctionID;
+ } else {
+      // This cold region cannot be outlined.
+ for (auto *Block : SubRegion)
+ if ((DT->dominates(BB, Block) && PDT->dominates(Block, BB)) ||
+ (PDT->dominates(BB, Block) && DT->dominates(Block, BB)))
+            // Skip this cold block in later iterations of the loop to save
+            // compile time.
+ CannotBeOutlinedColdBlocks.insert(Block);
+ }
+ } while (!Region.empty());
- OutliningWorklist.emplace_back(std::move(Region));
++NumColdRegionsFound;
}
}
if (OutliningWorklist.empty())
- return Changed;
+ return false;
// Outline single-entry cold regions, splitting up larger regions as needed.
- unsigned OutlinedFunctionID = 1;
// Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
CodeExtractorAnalysisCache CEAC(F);
- do {
- OutliningRegion Region = OutliningWorklist.pop_back_val();
- assert(!Region.empty() && "Empty outlining region in worklist");
- do {
- BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
- LLVM_DEBUG({
- dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
- for (BasicBlock *BB : SubRegion)
- BB->dump();
- });
-
- Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
- ORE, AC, OutlinedFunctionID);
- if (Outlined) {
- ++OutlinedFunctionID;
- Changed = true;
- }
- } while (!Region.empty());
- } while (!OutliningWorklist.empty());
+ for (auto &BCE : OutliningWorklist) {
+ Function *Outlined =
+ extractColdRegion(*BCE.first, BCE.second, CEAC, BFI, TTI, ORE);
+ assert(Outlined && "Should be outlined");
+ (void)Outlined;
+ }
- return Changed;
+ return true;
}
bool HotColdSplitting::run(Module &M) {
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 4176d56..77ca36d 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1471,7 +1471,6 @@ private:
OMPRTL_omp_get_num_threads,
OMPRTL_omp_in_parallel,
OMPRTL_omp_get_cancellation,
- OMPRTL_omp_get_thread_limit,
OMPRTL_omp_get_supported_active_levels,
OMPRTL_omp_get_level,
OMPRTL_omp_get_ancestor_thread_num,
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index ed47de2..33ed1d5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1543,11 +1543,14 @@ static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
return !losesInfo;
}
-static Type *shrinkFPConstant(ConstantFP *CFP) {
+static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) {
if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext()))
return nullptr; // No constant folding of this.
+ // See if the value can be truncated to bfloat and then reextended.
+ if (PreferBFloat && fitsInFPType(CFP, APFloat::BFloat()))
+ return Type::getBFloatTy(CFP->getContext());
// See if the value can be truncated to half and then reextended.
- if (fitsInFPType(CFP, APFloat::IEEEhalf()))
+ if (!PreferBFloat && fitsInFPType(CFP, APFloat::IEEEhalf()))
return Type::getHalfTy(CFP->getContext());
// See if the value can be truncated to float and then reextended.
if (fitsInFPType(CFP, APFloat::IEEEsingle()))
@@ -1562,7 +1565,7 @@ static Type *shrinkFPConstant(ConstantFP *CFP) {
// Determine if this is a vector of ConstantFPs and if so, return the minimal
// type we can safely truncate all elements to.
-static Type *shrinkFPConstantVector(Value *V) {
+static Type *shrinkFPConstantVector(Value *V, bool PreferBFloat) {
auto *CV = dyn_cast<Constant>(V);
auto *CVVTy = dyn_cast<FixedVectorType>(V->getType());
if (!CV || !CVVTy)
@@ -1582,7 +1585,7 @@ static Type *shrinkFPConstantVector(Value *V) {
if (!CFP)
return nullptr;
- Type *T = shrinkFPConstant(CFP);
+ Type *T = shrinkFPConstant(CFP, PreferBFloat);
if (!T)
return nullptr;
@@ -1597,7 +1600,7 @@ static Type *shrinkFPConstantVector(Value *V) {
}
/// Find the minimum FP type we can safely truncate to.
-static Type *getMinimumFPType(Value *V) {
+static Type *getMinimumFPType(Value *V, bool PreferBFloat) {
if (auto *FPExt = dyn_cast<FPExtInst>(V))
return FPExt->getOperand(0)->getType();
@@ -1605,7 +1608,7 @@ static Type *getMinimumFPType(Value *V) {
// that can accurately represent it. This allows us to turn
// (float)((double)X+2.0) into x+2.0f.
if (auto *CFP = dyn_cast<ConstantFP>(V))
- if (Type *T = shrinkFPConstant(CFP))
+ if (Type *T = shrinkFPConstant(CFP, PreferBFloat))
return T;
// We can only correctly find a minimum type for a scalable vector when it is
@@ -1617,7 +1620,7 @@ static Type *getMinimumFPType(Value *V) {
// Try to shrink a vector of FP constants. This returns nullptr on scalable
// vectors
- if (Type *T = shrinkFPConstantVector(V))
+ if (Type *T = shrinkFPConstantVector(V, PreferBFloat))
return T;
return V->getType();
@@ -1686,8 +1689,10 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
Type *Ty = FPT.getType();
auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0));
if (BO && BO->hasOneUse()) {
- Type *LHSMinType = getMinimumFPType(BO->getOperand(0));
- Type *RHSMinType = getMinimumFPType(BO->getOperand(1));
+ Type *LHSMinType =
+ getMinimumFPType(BO->getOperand(0), /*PreferBFloat=*/Ty->isBFloatTy());
+ Type *RHSMinType =
+ getMinimumFPType(BO->getOperand(1), /*PreferBFloat=*/Ty->isBFloatTy());
unsigned OpWidth = BO->getType()->getFPMantissaWidth();
unsigned LHSWidth = LHSMinType->getFPMantissaWidth();
unsigned RHSWidth = RHSMinType->getFPMantissaWidth();
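
The PreferBFloat path mirrors the existing half-precision logic: a constant may only be shrunk when the conversion is exact. A standalone sketch of that exactness test, written directly against APFloat in the same way fitsInFPType uses it (assumes LLVM headers; illustrative, not part of the patch):

    #include "llvm/ADT/APFloat.h"
    #include <cassert>

    // True if D converts to the given semantics without losing information,
    // which is the check fitsInFPType performs on a ConstantFP's value.
    static bool fitsIn(double D, const llvm::fltSemantics &Sem) {
      llvm::APFloat F(D);
      bool LosesInfo = false;
      (void)F.convert(Sem, llvm::APFloat::rmNearestTiesToEven, &LosesInfo);
      return !LosesInfo;
    }

    int main() {
      // 2.0 is exact in bfloat, so (bfloat)((double)X + 2.0) can shrink.
      assert(fitsIn(2.0, llvm::APFloat::BFloat()));
      // 0.1 is not, so no shrinking for it.
      assert(!fitsIn(0.1, llvm::APFloat::BFloat()));
      return 0;
    }
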
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 4af455c..87c8dca 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2387,6 +2387,20 @@ Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses,
return NonNull;
}
+ if (match(V, m_SExtLike(m_Value(A)))) {
+ if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder,
+ DoesConsume, Depth))
+ return Builder ? Builder->CreateSExt(AV, V->getType()) : NonNull;
+ return nullptr;
+ }
+
+ if (match(V, m_Trunc(m_Value(A)))) {
+ if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder,
+ DoesConsume, Depth))
+ return Builder ? Builder->CreateTrunc(AV, V->getType()) : NonNull;
+ return nullptr;
+ }
+
return nullptr;
}
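The two new cases lean on bitwise-not commuting with sign extension and truncation: not(sext(x)) == sext(not(x)) and not(trunc(x)) == trunc(not(x)). A small self-contained check of those identities on plain integers (illustrative values only, not part of the patch):

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (int8_t X : {(int8_t)-128, (int8_t)-1, (int8_t)0, (int8_t)42, (int8_t)127}) {
    // not(sext(x)) == sext(not(x))
    assert(~(int32_t)X == (int32_t)(int8_t)~X);
    // trunc(not(x)) == not(trunc(x)), checked on the low 8 bits of a wider value
    int32_t Wide = (int32_t)0x12345600 | (uint8_t)X;
    assert((int8_t)~Wide == (int8_t)~(int8_t)Wide);
  }
  return 0;
}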
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 393afc9..33add6d 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -348,7 +348,7 @@ private:
void instrumentGlobals();
Value *getPC(IRBuilder<> &IRB);
- Value *getSP(IRBuilder<> &IRB);
+ Value *getFP(IRBuilder<> &IRB);
Value *getFrameRecordInfo(IRBuilder<> &IRB);
void instrumentPersonalityFunctions();
@@ -1148,7 +1148,7 @@ Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) {
// Extract some entropy from the stack pointer for the tags.
// Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ
// between functions).
- Value *StackPointerLong = getSP(IRB);
+ Value *StackPointerLong = getFP(IRB);
Value *StackTag =
applyTagMask(IRB, IRB.CreateXor(StackPointerLong,
IRB.CreateLShr(StackPointerLong, 20)));
@@ -1165,7 +1165,7 @@ Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag,
}
Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB) {
- Value *StackPointerLong = getSP(IRB);
+ Value *StackPointerLong = getFP(IRB);
Value *UARTag =
applyTagMask(IRB, IRB.CreateLShr(StackPointerLong, PointerTagShift));
@@ -1232,7 +1232,7 @@ Value *HWAddressSanitizer::getPC(IRBuilder<> &IRB) {
return IRB.CreatePtrToInt(IRB.GetInsertBlock()->getParent(), IntptrTy);
}
-Value *HWAddressSanitizer::getSP(IRBuilder<> &IRB) {
+Value *HWAddressSanitizer::getFP(IRBuilder<> &IRB) {
if (!CachedSP) {
// FIXME: use addressofreturnaddress (but implement it in aarch64 backend
// first).
@@ -1251,7 +1251,7 @@ Value *HWAddressSanitizer::getSP(IRBuilder<> &IRB) {
Value *HWAddressSanitizer::getFrameRecordInfo(IRBuilder<> &IRB) {
// Prepare ring buffer data.
Value *PC = getPC(IRB);
- Value *SP = getSP(IRB);
+ Value *SP = getFP(IRB);
// Mix SP and PC.
// Assumptions:
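On the tag-seed arithmetic referenced above (bits 20..28 xor'ed with bits 0..8): shifting the frame address right by 20 and xor'ing it with itself folds the ASLR-affected bits into the low byte that later feeds the tag. A tiny numeric illustration with a made-up address and an assumed 8-bit mask (the pass applies its own tag mask):

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t FP = 0x00007ffdeadb8e40ULL; // hypothetical frame address
  uint64_t Seed = FP ^ (FP >> 20);     // bits 20..28 folded into bits 0..8
  uint8_t Tag = Seed & 0xff;           // assumed 8-bit tag mask
  printf("tag seed byte = 0x%02x\n", (unsigned)Tag);
  return 0;
}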
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index db05c63..9b6a39e 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -499,6 +499,8 @@ static Decomposition decompose(Value *V,
if (!Ty->isIntegerTy() || Ty->getIntegerBitWidth() > 64)
return V;
+ bool IsKnownNonNegative = false;
+
// Decompose \p V used with a signed predicate.
if (IsSigned) {
if (auto *CI = dyn_cast<ConstantInt>(V)) {
@@ -507,6 +509,14 @@ static Decomposition decompose(Value *V,
}
Value *Op0;
Value *Op1;
+
+ if (match(V, m_SExt(m_Value(Op0))))
+ V = Op0;
+ else if (match(V, m_NNegZExt(m_Value(Op0)))) {
+ V = Op0;
+ IsKnownNonNegative = true;
+ }
+
if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1))))
return MergeResults(Op0, Op1, IsSigned);
@@ -529,7 +539,7 @@ static Decomposition decompose(Value *V,
}
}
- return V;
+ return {V, IsKnownNonNegative};
}
if (auto *CI = dyn_cast<ConstantInt>(V)) {
@@ -539,7 +549,6 @@ static Decomposition decompose(Value *V,
}
Value *Op0;
- bool IsKnownNonNegative = false;
if (match(V, m_ZExt(m_Value(Op0)))) {
IsKnownNonNegative = true;
V = Op0;
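The new look-through is easy to state in isolation: under a signed predicate a sext is transparent, and a zext carrying the nneg flag behaves like a sext while additionally proving non-negativity. A minimal sketch with LLVM's PatternMatch (the helper name stripSignPreservingExt is hypothetical):

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Returns the narrower operand if V is a sign-preserving extension, otherwise
// V itself. Sets IsKnownNonNegative when the extension was a zext nneg.
static Value *stripSignPreservingExt(Value *V, bool &IsKnownNonNegative) {
  Value *Op;
  if (match(V, m_SExt(m_Value(Op))))
    return Op;
  if (match(V, m_NNegZExt(m_Value(Op)))) {
    IsKnownNonNegative = true;
    return Op;
  }
  return V;
}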
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 9235850..6ce9eb3 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -47,11 +47,6 @@ using namespace llvm;
#define DEBUG_TYPE "correlated-value-propagation"
-static cl::opt<bool> CanonicalizeICmpPredicatesToUnsigned(
- "canonicalize-icmp-predicates-to-unsigned", cl::init(true), cl::Hidden,
- cl::desc("Enables canonicalization of signed relational predicates to "
- "unsigned (e.g. sgt => ugt)"));
-
STATISTIC(NumPhis, "Number of phis propagated");
STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value");
STATISTIC(NumSelects, "Number of selects propagated");
@@ -90,6 +85,8 @@ STATISTIC(NumSaturating,
"Number of saturating arithmetics converted to normal arithmetics");
STATISTIC(NumNonNull, "Number of function pointer arguments marked non-null");
STATISTIC(NumMinMax, "Number of llvm.[us]{min,max} intrinsics removed");
+STATISTIC(NumSMinMax,
+ "Number of llvm.s{min,max} intrinsics simplified to unsigned");
STATISTIC(NumUDivURemsNarrowedExpanded,
"Number of bound udiv's/urem's expanded");
STATISTIC(NumZExt, "Number of non-negative deductions");
@@ -289,9 +286,6 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT,
}
static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) {
- if (!CanonicalizeICmpPredicatesToUnsigned)
- return false;
-
// Only for signed relational comparisons of scalar integers.
if (Cmp->getType()->isVectorTy() ||
!Cmp->getOperand(0)->getType()->isIntegerTy())
@@ -528,17 +522,40 @@ static bool processAbsIntrinsic(IntrinsicInst *II, LazyValueInfo *LVI) {
}
// See if this min/max intrinsic always picks its one specific operand.
+// If not, check whether it can be canonicalized into its unsigned version.
static bool processMinMaxIntrinsic(MinMaxIntrinsic *MM, LazyValueInfo *LVI) {
CmpInst::Predicate Pred = CmpInst::getNonStrictPredicate(MM->getPredicate());
- LazyValueInfo::Tristate Result = LVI->getPredicateAt(
- Pred, MM->getLHS(), MM->getRHS(), MM, /*UseBlockValue=*/true);
- if (Result == LazyValueInfo::Unknown)
- return false;
+ ConstantRange LHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(0),
+ /*UndefAllowed*/ false);
+ ConstantRange RHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(1),
+ /*UndefAllowed*/ false);
+ if (LHS_CR.icmp(Pred, RHS_CR)) {
+ ++NumMinMax;
+ MM->replaceAllUsesWith(MM->getLHS());
+ MM->eraseFromParent();
+ return true;
+ }
+ if (RHS_CR.icmp(Pred, LHS_CR)) {
+ ++NumMinMax;
+ MM->replaceAllUsesWith(MM->getRHS());
+ MM->eraseFromParent();
+ return true;
+ }
- ++NumMinMax;
- MM->replaceAllUsesWith(MM->getOperand(!Result));
- MM->eraseFromParent();
- return true;
+ if (MM->isSigned() &&
+ ConstantRange::areInsensitiveToSignednessOfICmpPredicate(LHS_CR,
+ RHS_CR)) {
+ ++NumSMinMax;
+ IRBuilder<> B(MM);
+ MM->replaceAllUsesWith(B.CreateBinaryIntrinsic(
+ MM->getIntrinsicID() == Intrinsic::smin ? Intrinsic::umin
+ : Intrinsic::umax,
+ MM->getLHS(), MM->getRHS()));
+ MM->eraseFromParent();
+ return true;
+ }
+
+ return false;
}
// Rewrite this with.overflow intrinsic as non-overflowing.
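The signed-to-unsigned min/max rewrite above is valid whenever both operand ranges lie on the same side of the sign boundary, which is what ConstantRange::areInsensitiveToSignednessOfICmpPredicate reports. A small standalone example with made-up i8 ranges:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
using namespace llvm;

int main() {
  // Two all-non-negative i8 ranges, [10, 20) and [5, 50): signed and unsigned
  // orderings agree on them, so smin over such operands can become umin.
  ConstantRange L(APInt(8, 10), APInt(8, 20));
  ConstantRange R(APInt(8, 5), APInt(8, 50));
  bool SignednessIrrelevant =
      ConstantRange::areInsensitiveToSignednessOfICmpPredicate(L, R);
  return SignednessIrrelevant ? 0 : 1; // expected to return 0
}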
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 1bf50d7..851eab0 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -1221,6 +1221,7 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
Value::use_iterator I, E, Next;
for (I = V->use_begin(), E = V->use_end(); I != E;) {
Use &U = *I;
+ User *CurUser = U.getUser();
// Some users may see the same pointer operand in multiple operands. Skip
// to the next instruction.
@@ -1231,11 +1232,10 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
// If V is used as the pointer operand of a compatible memory operation,
// sets the pointer operand to NewV. This replacement does not change
// the element type, so the resultant load/store is still valid.
- U.set(NewV);
+ CurUser->replaceUsesOfWith(V, NewV);
continue;
}
- User *CurUser = U.getUser();
// Skip if the current user is the new value itself.
if (CurUser == NewV)
continue;
@@ -1311,10 +1311,13 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
while (isa<PHINode>(InsertPos))
++InsertPos;
- U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+ // This instruction may contain multiple uses of V; update them all.
+ CurUser->replaceUsesOfWith(
+ V, new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
} else {
- U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
- V->getType()));
+ CurUser->replaceUsesOfWith(
+ V, ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ V->getType()));
}
}
}
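The switch from Use::set to User::replaceUsesOfWith matters when one user refers to V through several operands (for example, an intrinsic whose source and destination pointers are both V): replacing through the User rewrites every matching operand at once, whereas Use::set only updates the single slot behind the current use iterator. A minimal sketch of the call being relied on (illustration only):

#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Rewrites every operand of U that currently refers to Old so that it refers
// to New; a single call covers duplicated operands.
static void rewriteAllOperands(User *U, Value *Old, Value *New) {
  U->replaceUsesOfWith(Old, New);
}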
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 627c863..08021f3b 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -7033,7 +7033,6 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
// SCEVExpander for both use in preheader and latch
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
SCEVExpander Expander(SE, DL, "lsr_fold_term_cond");
- SCEVExpanderCleaner ExpCleaner(Expander);
assert(Expander.isSafeToExpand(TermValueS) &&
"Terminating value was checked safe in canFoldTerminatingCondition");
@@ -7064,10 +7063,9 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
BI->setCondition(NewTermCond);
+ Expander.clear();
OldTermCond->eraseFromParent();
DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
-
- ExpCleaner.markResultUsed();
}
}
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index f4f3070..260f31b 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -291,9 +291,9 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
InstructionCost TotalSpeculationCost = 0;
unsigned NotHoistedInstCount = 0;
for (const auto &I : FromBlock) {
- // Make note of any DPValues that need hoisting.
- for (DbgRecord &DR : I.getDbgValueRange()) {
- DPValue &DPV = cast<DPValue>(DR);
+ // Make note of any DPValues that need hoisting. DPLabels
+ // get left behind just like llvm.dbg.labels.
+ for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) {
if (HasNoUnhoistedInstr(DPV.location_ops()))
DPValuesToHoist[DPV.getInstruction()].push_back(&DPV);
}
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 7fd6759..5bb109a 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -386,7 +386,15 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
SmallVector<DPValue *, 8> ToBeRemoved;
SmallDenseSet<DebugVariable> VariableSet;
for (auto &I : reverse(*BB)) {
- for (DPValue &DPV : reverse(DPValue::filter(I.getDbgValueRange()))) {
+ for (DbgRecord &DR : reverse(I.getDbgValueRange())) {
+ if (isa<DPLabel>(DR)) {
+ // Emulate existing behaviour (see comment below for dbg.declares).
+ // FIXME: Don't do this.
+ VariableSet.clear();
+ continue;
+ }
+
+ DPValue &DPV = cast<DPValue>(DR);
// Skip declare-type records, as the debug intrinsic method only works
// on dbg.value intrinsics.
if (DPV.getType() == DPValue::LocationType::Declare) {
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 8ebcf0c..bab0651 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1585,8 +1585,30 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
return cast<DILocalVariable>(NewVar);
};
- auto UpdateDPValuesOnInst = [&](Instruction &I) -> void {
- for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) {
+ auto UpdateDbgLabel = [&](auto *LabelRecord) {
+ // Point the label record to a fresh label within the new function if
+ // the record was not inlined from some other function.
+ if (LabelRecord->getDebugLoc().getInlinedAt())
+ return;
+ DILabel *OldLabel = LabelRecord->getLabel();
+ DINode *&NewLabel = RemappedMetadata[OldLabel];
+ if (!NewLabel) {
+ DILocalScope *NewScope = DILocalScope::cloneScopeForSubprogram(
+ *OldLabel->getScope(), *NewSP, Ctx, Cache);
+ NewLabel = DILabel::get(Ctx, NewScope, OldLabel->getName(),
+ OldLabel->getFile(), OldLabel->getLine());
+ }
+ LabelRecord->setLabel(cast<DILabel>(NewLabel));
+ };
+
+ auto UpdateDbgRecordsOnInst = [&](Instruction &I) -> void {
+ for (DbgRecord &DR : I.getDbgValueRange()) {
+ if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+ UpdateDbgLabel(DPL);
+ continue;
+ }
+
+ DPValue &DPV = cast<DPValue>(DR);
// Apply the two updates that dbg.values get: invalid operands, and
// variable metadata fixup.
if (any_of(DPV.location_ops(), IsInvalidLocation)) {
@@ -1599,13 +1621,11 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
}
if (!DPV.getDebugLoc().getInlinedAt())
DPV.setVariable(GetUpdatedDIVariable(DPV.getVariable()));
- DPV.setDebugLoc(DebugLoc::replaceInlinedAtSubprogram(DPV.getDebugLoc(),
- *NewSP, Ctx, Cache));
}
};
for (Instruction &I : instructions(NewFunc)) {
- UpdateDPValuesOnInst(I);
+ UpdateDbgRecordsOnInst(I);
auto *DII = dyn_cast<DbgInfoIntrinsic>(&I);
if (!DII)
@@ -1614,17 +1634,7 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
// Point the intrinsic to a fresh label within the new function if the
// intrinsic was not inlined from some other function.
if (auto *DLI = dyn_cast<DbgLabelInst>(&I)) {
- if (DLI->getDebugLoc().getInlinedAt())
- continue;
- DILabel *OldLabel = DLI->getLabel();
- DINode *&NewLabel = RemappedMetadata[OldLabel];
- if (!NewLabel) {
- DILocalScope *NewScope = DILocalScope::cloneScopeForSubprogram(
- *OldLabel->getScope(), *NewSP, Ctx, Cache);
- NewLabel = DILabel::get(Ctx, NewScope, OldLabel->getName(),
- OldLabel->getFile(), OldLabel->getLine());
- }
- DLI->setArgOperand(0, MetadataAsValue::get(Ctx, NewLabel));
+ UpdateDbgLabel(DLI);
continue;
}
@@ -1658,6 +1668,9 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
if (const DebugLoc &DL = I.getDebugLoc())
I.setDebugLoc(
DebugLoc::replaceInlinedAtSubprogram(DL, *NewSP, Ctx, Cache));
+ for (DbgRecord &DR : I.getDbgValueRange())
+ DR.setDebugLoc(DebugLoc::replaceInlinedAtSubprogram(DR.getDebugLoc(),
+ *NewSP, Ctx, Cache));
// Loop info metadata may contain line locations. Fix them up.
auto updateLoopInfoLoc = [&Ctx, &Cache, NewSP](Metadata *MD) -> Metadata * {
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 08fdd3b..2ff7c01 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -111,8 +111,7 @@ Instruction *getUntagLocationIfFunctionExit(Instruction &Inst) {
void StackInfoBuilder::visit(Instruction &Inst) {
// Visit non-intrinsic debug-info records attached to Inst.
- for (DbgRecord &DR : Inst.getDbgValueRange()) {
- DPValue &DPV = cast<DPValue>(DR);
+ for (DPValue &DPV : DPValue::filter(Inst.getDbgValueRange())) {
auto AddIfInteresting = [&](Value *V) {
if (auto *AI = dyn_cast_or_null<AllocaInst>(V)) {
if (!isInterestingAlloca(*AI))
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index 6e46469..91ab279 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -538,6 +538,11 @@ Value *Mapper::mapValue(const Value *V) {
}
void Mapper::remapDPValue(DbgRecord &DR) {
+ if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+ DPL->setLabel(cast<DILabel>(mapMetadata(DPL->getLabel())));
+ return;
+ }
+
DPValue &V = cast<DPValue>(DR);
// Remap variables and DILocations.
auto *MappedVar = mapMetadata(V.getVariable());
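Several of the hunks above share one shape: walk an instruction's attached debug records, peel off DPLabels first, then handle the remaining DPValues. A generic sketch of that dispatch under the DPValue/DPLabel API used in this patch series (handler bodies elided):

#include "llvm/IR/DebugProgramInstruction.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

template <typename LabelFn, typename ValueFn>
static void forEachDbgRecord(Instruction &I, LabelFn OnLabel, ValueFn OnValue) {
  for (DbgRecord &DR : I.getDbgValueRange()) {
    if (auto *DPL = dyn_cast<DPLabel>(&DR)) {
      OnLabel(*DPL);              // label records carry a DILabel only
      continue;
    }
    OnValue(cast<DPValue>(DR));   // value/declare-style records
  }
}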
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4e33474..de4e56f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2422,18 +2422,25 @@ private:
/// \param TE Tree entry checked for permutation.
/// \param VL List of scalars (a subset of the TE scalar), checked for
/// permutations. Must form single-register vector.
+ /// \param ForOrder Fetch the best candidates for ordering info, and build
+ /// the mask from the original vector values rather than relying on any
+ /// potential reordering.
/// \returns ShuffleKind, if gathered values can be represented as shuffles of
/// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
std::optional<TargetTransformInfo::ShuffleKind>
isGatherShuffledSingleRegisterEntry(
const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
- SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part);
+ SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
+ bool ForOrder);
/// Checks if the gathered \p VL can be represented as multi-register
/// shuffle(s) of previous tree entries.
/// \param TE Tree entry checked for permutation.
/// \param VL List of scalars (a subset of the TE scalar), checked for
/// permutations.
+ /// \param ForOrder Fetch the best candidates for ordering info, and build
+ /// the mask from the original vector values rather than relying on any
+ /// potential reordering.
/// \returns per-register series of ShuffleKind, if gathered values can be
/// represented as shuffles of previous tree entries. \p Mask is filled with
/// the shuffle mask (also on per-register base).
@@ -2441,7 +2448,7 @@ private:
isGatherShuffledEntry(
const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
- unsigned NumParts);
+ unsigned NumParts, bool ForOrder = false);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
@@ -3788,65 +3795,163 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
- unsigned NumScalars = TE.Scalars.size();
+ // Try to find subvector extract/insert patterns and reorder only such
+ // patterns.
+ SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
+ Type *ScalarTy = GatheredScalars.front()->getType();
+ int NumScalars = GatheredScalars.size();
+ if (!isValidElementType(ScalarTy))
+ return std::nullopt;
+ auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
+ int NumParts = TTI->getNumberOfParts(VecTy);
+ if (NumParts == 0 || NumParts >= NumScalars)
+ NumParts = 1;
+ SmallVector<int> ExtractMask;
+ SmallVector<int> Mask;
+ SmallVector<SmallVector<const TreeEntry *>> Entries;
+ SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
+ tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
+ SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
+ isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
+ /*ForOrder=*/true);
+ // No shuffled operands - ignore.
+ if (GatherShuffles.empty() && ExtractShuffles.empty())
+ return std::nullopt;
OrdersType CurrentOrder(NumScalars, NumScalars);
- SmallVector<int> Positions;
- SmallBitVector UsedPositions(NumScalars);
- const TreeEntry *STE = nullptr;
- // Try to find all gathered scalars that are gets vectorized in other
- // vectorize node. Here we can have only one single tree vector node to
- // correctly identify order of the gathered scalars.
- for (unsigned I = 0; I < NumScalars; ++I) {
- Value *V = TE.Scalars[I];
- if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
- continue;
- if (const auto *LocalSTE = getTreeEntry(V)) {
- if (!STE)
- STE = LocalSTE;
- else if (STE != LocalSTE)
- // Take the order only from the single vector node.
- return std::nullopt;
- unsigned Lane =
- std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
- if (Lane >= NumScalars)
- return std::nullopt;
- if (CurrentOrder[Lane] != NumScalars) {
- if (Lane != I)
+ if (GatherShuffles.size() == 1 &&
+ *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
+ Entries.front().front()->isSame(TE.Scalars)) {
+ // Perfect match in the graph, will reuse the previously vectorized
+ // node. Cost is 0.
+ std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
+ return CurrentOrder;
+ }
+ auto IsSplatMask = [](ArrayRef<int> Mask) {
+ int SingleElt = PoisonMaskElem;
+ return all_of(Mask, [&](int I) {
+ if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
+ SingleElt = I;
+ return I == PoisonMaskElem || I == SingleElt;
+ });
+ };
+ // Exclusive broadcast mask - ignore.
+ if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
+ (Entries.size() != 1 ||
+ Entries.front().front()->ReorderIndices.empty())) ||
+ (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
+ return std::nullopt;
+ SmallBitVector ShuffledSubMasks(NumParts);
+ auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
+ ArrayRef<int> Mask, int PartSz, int NumParts,
+ function_ref<unsigned(unsigned)> GetVF) {
+ for (int I : seq<int>(0, NumParts)) {
+ if (ShuffledSubMasks.test(I))
+ continue;
+ const int VF = GetVF(I);
+ if (VF == 0)
+ continue;
+ MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
+ // Shuffle of at least 2 vectors - ignore.
+ if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
+ std::fill(Slice.begin(), Slice.end(), NumScalars);
+ ShuffledSubMasks.set(I);
+ continue;
+ }
+ // Try to include as many elements from the mask as possible.
+ int FirstMin = INT_MAX;
+ int SecondVecFound = false;
+ for (int K : seq<int>(0, PartSz)) {
+ int Idx = Mask[I * PartSz + K];
+ if (Idx == PoisonMaskElem) {
+ Value *V = GatheredScalars[I * PartSz + K];
+ if (isConstant(V) && !isa<PoisonValue>(V)) {
+ SecondVecFound = true;
+ break;
+ }
continue;
- UsedPositions.reset(CurrentOrder[Lane]);
+ }
+ if (Idx < VF) {
+ if (FirstMin > Idx)
+ FirstMin = Idx;
+ } else {
+ SecondVecFound = true;
+ break;
+ }
}
- // The partial identity (where only some elements of the gather node are
- // in the identity order) is good.
- CurrentOrder[Lane] = I;
- UsedPositions.set(I);
- }
- }
- // Need to keep the order if we have a vector entry and at least 2 scalars or
- // the vectorized entry has just 2 scalars.
- if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
- auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
- for (unsigned I = 0; I < NumScalars; ++I)
- if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
- return false;
- return true;
- };
- if (IsIdentityOrder(CurrentOrder))
- return OrdersType();
- auto *It = CurrentOrder.begin();
- for (unsigned I = 0; I < NumScalars;) {
- if (UsedPositions.test(I)) {
- ++I;
+ FirstMin = (FirstMin / PartSz) * PartSz;
+ // Shuffle of at least 2 vectors - ignore.
+ if (SecondVecFound) {
+ std::fill(Slice.begin(), Slice.end(), NumScalars);
+ ShuffledSubMasks.set(I);
continue;
}
- if (*It == NumScalars) {
- *It = I;
- ++I;
+ for (int K : seq<int>(0, PartSz)) {
+ int Idx = Mask[I * PartSz + K];
+ if (Idx == PoisonMaskElem)
+ continue;
+ Idx -= FirstMin;
+ if (Idx >= PartSz) {
+ SecondVecFound = true;
+ break;
+ }
+ if (CurrentOrder[I * PartSz + Idx] >
+ static_cast<unsigned>(I * PartSz + K) &&
+ CurrentOrder[I * PartSz + Idx] !=
+ static_cast<unsigned>(I * PartSz + Idx))
+ CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
+ }
+ // Shuffle of at least 2 vectors - ignore.
+ if (SecondVecFound) {
+ std::fill(Slice.begin(), Slice.end(), NumScalars);
+ ShuffledSubMasks.set(I);
+ continue;
}
- ++It;
}
- return std::move(CurrentOrder);
+ };
+ int PartSz = NumScalars / NumParts;
+ if (!ExtractShuffles.empty())
+ TransformMaskToOrder(
+ CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
+ if (!ExtractShuffles[I])
+ return 0U;
+ unsigned VF = 0;
+ for (unsigned Idx : seq<unsigned>(0, PartSz)) {
+ int K = I * PartSz + Idx;
+ if (ExtractMask[K] == PoisonMaskElem)
+ continue;
+ if (!TE.ReuseShuffleIndices.empty())
+ K = TE.ReuseShuffleIndices[K];
+ if (!TE.ReorderIndices.empty())
+ K = std::distance(TE.ReorderIndices.begin(),
+ find(TE.ReorderIndices, K));
+ auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
+ if (!EI)
+ continue;
+ VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
+ ->getElementCount()
+ .getKnownMinValue());
+ }
+ return VF;
+ });
+ // Check special corner case - single shuffle of the same entry.
+ if (GatherShuffles.size() == 1 && NumParts != 1) {
+ if (ShuffledSubMasks.any())
+ return std::nullopt;
+ PartSz = NumScalars;
+ NumParts = 1;
}
- return std::nullopt;
+ if (!Entries.empty())
+ TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
+ if (!GatherShuffles[I])
+ return 0U;
+ return std::max(Entries[I].front()->getVectorFactor(),
+ Entries[I].back()->getVectorFactor());
+ });
+ int NumUndefs =
+ count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
+ if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
+ return std::nullopt;
+ return std::move(CurrentOrder);
}
namespace {
@@ -4168,9 +4273,59 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
// element 3 is used twice in the second submask.
unsigned Sz = TE.Scalars.size();
- if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
- Sz))
+ if (TE.State == TreeEntry::NeedToGather) {
+ if (std::optional<OrdersType> CurrentOrder =
+ findReusedOrderedScalars(TE)) {
+ SmallVector<int> Mask;
+ fixupOrderingIndices(*CurrentOrder);
+ inversePermutation(*CurrentOrder, Mask);
+ ::addMask(Mask, TE.ReuseShuffleIndices);
+ OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
+ unsigned Sz = TE.Scalars.size();
+ for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
+ for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
+ if (Idx != PoisonMaskElem)
+ Res[Idx + K * Sz] = I + K * Sz;
+ }
+ return std::move(Res);
+ }
+ }
+ if (Sz == 2 && TE.getVectorFactor() == 4 &&
+ TTI->getNumberOfParts(FixedVectorType::get(
+ TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
return std::nullopt;
+ if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
+ Sz)) {
+ SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
+ if (TE.ReorderIndices.empty())
+ std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
+ else
+ inversePermutation(TE.ReorderIndices, ReorderMask);
+ ::addMask(ReorderMask, TE.ReuseShuffleIndices);
+ unsigned VF = ReorderMask.size();
+ OrdersType ResOrder(VF, VF);
+ unsigned NumParts = VF / Sz;
+ SmallBitVector UsedVals(NumParts);
+ for (unsigned I = 0; I < VF; I += Sz) {
+ int Val = PoisonMaskElem;
+ unsigned UndefCnt = 0;
+ if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
+ [&](int Idx) {
+ if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
+ Val = Idx;
+ if (Idx == PoisonMaskElem)
+ ++UndefCnt;
+ return Idx != PoisonMaskElem && Idx != Val;
+ }) ||
+ Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
+ UndefCnt > Sz / 2)
+ return std::nullopt;
+ UsedVals.set(Val);
+ for (unsigned K = 0; K < NumParts; ++K)
+ ResOrder[Val + Sz * K] = I + K;
+ }
+ return std::move(ResOrder);
+ }
unsigned VF = TE.getVectorFactor();
// Try build correct order for extractelement instructions.
SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
@@ -4208,7 +4363,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
std::advance(It, Sz);
}
- if (all_of(enumerate(ResOrder),
+ if (TE.State == TreeEntry::NeedToGather &&
+ all_of(enumerate(ResOrder),
[](const auto &Data) { return Data.index() == Data.value(); }))
return std::nullopt; // No need to reorder.
return std::move(ResOrder);
@@ -4298,11 +4454,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
OrdersType CurrentOrder;
bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
/*ResizeAllowed=*/true);
- if (Reuse || !CurrentOrder.empty()) {
- if (!CurrentOrder.empty())
- fixupOrderingIndices(CurrentOrder);
+ if (Reuse || !CurrentOrder.empty())
return std::move(CurrentOrder);
- }
}
// If the gather node is <undef, v, .., poison> and
// insertelement poison, v, 0 [+ permute]
@@ -4335,8 +4488,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
PoisonValue::get(Ty), *It);
- if (InsertFirstCost + PermuteCost < InsertIdxCost)
+ if (InsertFirstCost + PermuteCost < InsertIdxCost) {
+ OrdersType Order(Sz, Sz);
+ Order[Idx] = 0;
return std::move(Order);
+ }
}
}
if (isSplat(TE.Scalars))
@@ -4392,6 +4548,28 @@ void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
std::iota(It, std::next(It, Sz), 0);
}
+static void combineOrders(MutableArrayRef<unsigned> Order,
+ ArrayRef<unsigned> SecondaryOrder) {
+ assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
+ "Expected same size of orders");
+ unsigned Sz = Order.size();
+ SmallBitVector UsedIndices(Sz);
+ for (unsigned Idx : seq<unsigned>(0, Sz)) {
+ if (Order[Idx] != Sz)
+ UsedIndices.set(Order[Idx]);
+ }
+ if (SecondaryOrder.empty()) {
+ for (unsigned Idx : seq<unsigned>(0, Sz))
+ if (Order[Idx] == Sz && !UsedIndices.test(Idx))
+ Order[Idx] = Idx;
+ } else {
+ for (unsigned Idx : seq<unsigned>(0, Sz))
+ if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
+ !UsedIndices.test(SecondaryOrder[Idx]))
+ Order[Idx] = SecondaryOrder[Idx];
+ }
+}
+
void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes.
DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
@@ -4560,18 +4738,46 @@ void BoUpSLP::reorderTopToBottom() {
}
if (OrdersUses.empty())
continue;
+ auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
+ const unsigned Sz = Order.size();
+ for (unsigned Idx : seq<unsigned>(0, Sz))
+ if (Idx != Order[Idx] && Order[Idx] != Sz)
+ return false;
+ return true;
+ };
// Choose the most used order.
- ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
- unsigned Cnt = OrdersUses.front().second;
- for (const auto &Pair : drop_begin(OrdersUses)) {
- if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
+ unsigned IdentityCnt = 0;
+ unsigned FilledIdentityCnt = 0;
+ OrdersType IdentityOrder(VF, VF);
+ for (auto &Pair : OrdersUses) {
+ if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
+ if (!Pair.first.empty())
+ FilledIdentityCnt += Pair.second;
+ IdentityCnt += Pair.second;
+ combineOrders(IdentityOrder, Pair.first);
+ }
+ }
+ MutableArrayRef<unsigned> BestOrder = IdentityOrder;
+ unsigned Cnt = IdentityCnt;
+ for (auto &Pair : OrdersUses) {
+ // Prefer the identity order. But if a filled (non-empty) identity order was
+ // found with the same number of uses as the new candidate order, we can
+ // choose that candidate instead.
+ if (Cnt < Pair.second ||
+ (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
+ Cnt == Pair.second && !BestOrder.empty() &&
+ IsIdentityOrder(BestOrder))) {
+ combineOrders(Pair.first, BestOrder);
BestOrder = Pair.first;
Cnt = Pair.second;
+ } else {
+ combineOrders(BestOrder, Pair.first);
}
}
// Set order of the user node.
- if (BestOrder.empty())
+ if (IsIdentityOrder(BestOrder))
continue;
+ fixupOrderingIndices(BestOrder);
SmallVector<int> Mask;
inversePermutation(BestOrder, Mask);
SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
@@ -4685,7 +4891,7 @@ bool BoUpSLP::canReorderOperands(
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SetVector<TreeEntry *> OrderedEntries;
- DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
+ DenseSet<const TreeEntry *> GathersToOrders;
// Find all reorderable leaf nodes with the given VF.
// Currently these are vectorized loads, extracts without alternate operands +
// some gathering of extracts.
@@ -4700,7 +4906,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize) ||
!TE->ReuseShuffleIndices.empty())
- GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
+ GathersToOrders.insert(TE.get());
}
}
@@ -4718,7 +4924,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
(TE->State == TreeEntry::NeedToGather &&
- GathersToOrders.count(TE))) ||
+ GathersToOrders.contains(TE))) ||
TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
!all_of(drop_begin(TE->UserTreeIndices),
[TE](const EdgeInfo &EI) {
@@ -4775,9 +4981,14 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
const auto Order = [&]() -> const OrdersType {
if (OpTE->State == TreeEntry::NeedToGather ||
!OpTE->ReuseShuffleIndices.empty())
- return GathersToOrders.find(OpTE)->second;
+ return getReorderingData(*OpTE, /*TopToBottom=*/false)
+ .value_or(OrdersType(1));
return OpTE->ReorderIndices;
}();
+ // The order is only partially ordered; skip it in favor of fully
+ // non-ordered orders.
+ if (Order.size() == 1)
+ continue;
unsigned NumOps = count_if(
Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
return P.second == OpTE;
@@ -4805,9 +5016,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
(IgnoreReorder && TE->Idx == 0))
return true;
if (TE->State == TreeEntry::NeedToGather) {
- auto It = GathersToOrders.find(TE);
- if (It != GathersToOrders.end())
- return !It->second.empty();
+ if (GathersToOrders.contains(TE))
+ return !getReorderingData(*TE, /*TopToBottom=*/false)
+ .value_or(OrdersType(1))
+ .empty();
return true;
}
return false;
@@ -4839,21 +5051,49 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
++Res.first->second;
}
}
- // Choose the best order.
- ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
- unsigned Cnt = OrdersUses.front().second;
- for (const auto &Pair : drop_begin(OrdersUses)) {
- if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
+ if (OrdersUses.empty()) {
+ for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
+ OrderedEntries.remove(Op.second);
+ continue;
+ }
+ auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
+ const unsigned Sz = Order.size();
+ for (unsigned Idx : seq<unsigned>(0, Sz))
+ if (Idx != Order[Idx] && Order[Idx] != Sz)
+ return false;
+ return true;
+ };
+ // Choose the most used order.
+ unsigned IdentityCnt = 0;
+ unsigned VF = Data.second.front().second->getVectorFactor();
+ OrdersType IdentityOrder(VF, VF);
+ for (auto &Pair : OrdersUses) {
+ if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
+ IdentityCnt += Pair.second;
+ combineOrders(IdentityOrder, Pair.first);
+ }
+ }
+ MutableArrayRef<unsigned> BestOrder = IdentityOrder;
+ unsigned Cnt = IdentityCnt;
+ for (auto &Pair : OrdersUses) {
+ // Prefer the identity order. But if a filled (non-empty) identity order
+ // was found with the same number of uses as the new candidate order, we
+ // can choose that candidate instead.
+ if (Cnt < Pair.second) {
+ combineOrders(Pair.first, BestOrder);
BestOrder = Pair.first;
Cnt = Pair.second;
+ } else {
+ combineOrders(BestOrder, Pair.first);
}
}
- // Set order of the user node (reordering of operands and user nodes).
- if (BestOrder.empty()) {
+ // Set order of the user node.
+ if (IsIdentityOrder(BestOrder)) {
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
OrderedEntries.remove(Op.second);
continue;
}
+ fixupOrderingIndices(BestOrder);
// Erase operands from OrderedEntries list and adjust their orders.
VisitedOps.clear();
SmallVector<int> Mask;
@@ -7472,6 +7712,20 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
V1 = Constant::getNullValue(
FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+ // Not identity/broadcast? Try to see if the original vector is better.
+ if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
+ CommonVF == CommonMask.size() &&
+ any_of(enumerate(CommonMask),
+ [](const auto &&P) {
+ return P.value() != PoisonMaskElem &&
+ static_cast<unsigned>(P.value()) != P.index();
+ }) &&
+ any_of(CommonMask,
+ [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
+ SmallVector<int> ReorderMask;
+ inversePermutation(E->ReorderIndices, ReorderMask);
+ ::addMask(CommonMask, ReorderMask);
+ }
} else if (V1 && P2.isNull()) {
// Shuffle single vector.
CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
@@ -9433,7 +9687,7 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
- SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part) {
+ SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
Entries.clear();
// TODO: currently checking only for Scalars in the tree entry, need to count
// reused elements too for better cost estimation.
@@ -9532,6 +9786,21 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
VToTEs.insert(TEPtr);
}
if (const TreeEntry *VTE = getTreeEntry(V)) {
+ if (ForOrder) {
+ if (VTE->State != TreeEntry::Vectorize) {
+ auto It = MultiNodeScalars.find(V);
+ if (It == MultiNodeScalars.end())
+ continue;
+ VTE = *It->getSecond().begin();
+ // Iterate through all vectorized nodes.
+ auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
+ return MTE->State == TreeEntry::Vectorize;
+ });
+ if (MIt == It->getSecond().end())
+ continue;
+ VTE = *MIt;
+ }
+ }
Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
continue;
@@ -9765,8 +10034,12 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
// scalar in the list.
for (const std::pair<unsigned, int> &Pair : EntryLanes) {
unsigned Idx = Part * VL.size() + Pair.second;
- Mask[Idx] = Pair.first * VF +
- Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
+ Mask[Idx] =
+ Pair.first * VF +
+ (ForOrder ? std::distance(
+ Entries[Pair.first]->Scalars.begin(),
+ find(Entries[Pair.first]->Scalars, VL[Pair.second]))
+ : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
IsIdentity &= Mask[Idx] == Pair.second;
}
switch (Entries.size()) {
@@ -9791,8 +10064,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
- SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
- unsigned NumParts) {
+ SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
+ bool ForOrder) {
assert(NumParts > 0 && NumParts < VL.size() &&
"Expected positive number of registers.");
Entries.clear();
@@ -9810,7 +10083,8 @@ BoUpSLP::isGatherShuffledEntry(
ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
std::optional<TTI::ShuffleKind> SubRes =
- isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part);
+ isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
+ ForOrder);
if (!SubRes)
SubEntries.clear();
Res.push_back(SubRes);
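To make the combineOrders helper added earlier in this file concrete, here is a standalone restatement with a tiny worked example (illustration only; the in-tree helper operates on ArrayRef-based order containers and uses Sz, the order size, as its unset marker):

#include <cassert>
#include <vector>

// Fill unset slots (== Sz) of Order from Secondary, or with the identity when
// Secondary is empty, skipping values already used elsewhere in Order.
static void combineOrders(std::vector<unsigned> &Order,
                          const std::vector<unsigned> &Secondary) {
  unsigned Sz = Order.size();
  assert(Secondary.empty() || Secondary.size() == Sz);
  std::vector<bool> Used(Sz, false);
  for (unsigned V : Order)
    if (V != Sz)
      Used[V] = true;
  for (unsigned Idx = 0; Idx < Sz; ++Idx) {
    if (Order[Idx] != Sz)
      continue;
    unsigned Cand = Secondary.empty() ? Idx : Secondary[Idx];
    if (Cand != Sz && !Used[Cand])
      Order[Idx] = Cand;
  }
}

int main() {
  std::vector<unsigned> Order = {2, 4, 1, 4}; // 4 (== size) means "unset"
  combineOrders(Order, {2, 0, 1, 3});
  assert((Order == std::vector<unsigned>{2, 0, 1, 3}));
  return 0;
}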
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 240d4bd..a2a203c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -385,9 +385,6 @@ struct VPTransformState {
VPValue2ValueTy VPValue2Value;
- /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
- Value *CanonicalIV = nullptr;
-
/// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
InnerLoopVectorizer *ILV;