Diffstat (limited to 'llvm/lib')
-rw-r--r-- llvm/lib/Analysis/ConstantFolding.cpp | 171
-rw-r--r-- llvm/lib/Analysis/IVDescriptors.cpp | 26
-rw-r--r-- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 23
-rw-r--r-- llvm/lib/Analysis/MemoryProfileInfo.cpp | 41
-rw-r--r-- llvm/lib/Analysis/ValueTracking.cpp | 18
-rw-r--r-- llvm/lib/AsmParser/LLLexer.cpp | 1
-rw-r--r-- llvm/lib/AsmParser/LLParser.cpp | 32
-rw-r--r-- llvm/lib/BinaryFormat/CMakeLists.txt | 1
-rw-r--r-- llvm/lib/BinaryFormat/SFrame.cpp | 37
-rw-r--r-- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 22
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp | 32
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h | 11
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/WinException.cpp | 16
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/WinException.h | 1
-rw-r--r-- llvm/lib/CodeGen/CodeGenPrepare.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp | 66
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 20
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 6
-rw-r--r-- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 98
-rw-r--r-- llvm/lib/CodeGen/MachineFunction.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/MachinePipeliner.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/MachineScheduler.cpp | 24
-rw-r--r-- llvm/lib/CodeGen/RegAllocBasic.cpp | 97
-rw-r--r-- llvm/lib/CodeGen/RegAllocBasic.h | 104
-rw-r--r-- llvm/lib/CodeGen/SafeStack.cpp | 7
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 43
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 50
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 9
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 26
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 47
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp | 5
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 33
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 15
-rw-r--r-- llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp | 2
-rw-r--r-- llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 57
-rw-r--r-- llvm/lib/IR/AsmWriter.cpp | 8
-rw-r--r-- llvm/lib/IR/AutoUpgrade.cpp | 44
-rw-r--r-- llvm/lib/IR/DebugInfo.cpp | 87
-rw-r--r-- llvm/lib/IR/Function.cpp | 1
-rw-r--r-- llvm/lib/IR/Metadata.cpp | 18
-rw-r--r-- llvm/lib/IR/PassInstrumentation.cpp | 6
-rw-r--r-- llvm/lib/IR/Value.cpp | 9
-rw-r--r-- llvm/lib/IR/Verifier.cpp | 94
-rw-r--r-- llvm/lib/MC/MCAssembler.cpp | 241
-rw-r--r-- llvm/lib/MC/MCCodeView.cpp | 17
-rw-r--r-- llvm/lib/MC/MCDwarf.cpp | 11
-rw-r--r-- llvm/lib/MC/MCELFStreamer.cpp | 2
-rw-r--r-- llvm/lib/MC/MCExpr.cpp | 9
-rw-r--r-- llvm/lib/MC/MCFragment.cpp | 31
-rw-r--r-- llvm/lib/MC/MCMachOStreamer.cpp | 7
-rw-r--r-- llvm/lib/MC/MCObjectStreamer.cpp | 103
-rw-r--r-- llvm/lib/MC/MCParser/AsmParser.cpp | 7
-rw-r--r-- llvm/lib/MC/MCParser/MCTargetAsmParser.cpp | 7
-rw-r--r-- llvm/lib/MC/MCPseudoProbe.cpp | 5
-rw-r--r-- llvm/lib/MC/MCSection.cpp | 28
-rw-r--r-- llvm/lib/MC/MCSectionCOFF.cpp | 4
-rw-r--r-- llvm/lib/MC/MCSectionELF.cpp | 2
-rw-r--r-- llvm/lib/MC/MCStreamer.cpp | 6
-rw-r--r-- llvm/lib/MC/MCWin64EH.cpp | 11
-rw-r--r-- llvm/lib/MC/MCWinCOFFStreamer.cpp | 40
-rw-r--r-- llvm/lib/MC/MCXCOFFStreamer.cpp | 2
-rw-r--r-- llvm/lib/MC/MachObjectWriter.cpp | 23
-rw-r--r-- llvm/lib/MC/WasmObjectWriter.cpp | 44
-rw-r--r-- llvm/lib/MC/WinCOFFObjectWriter.cpp | 31
-rw-r--r-- llvm/lib/ObjCopy/MachO/MachOObject.h | 4
-rw-r--r-- llvm/lib/ObjCopy/MachO/MachOWriter.cpp | 4
-rw-r--r-- llvm/lib/Object/CMakeLists.txt | 1
-rw-r--r-- llvm/lib/Object/ELFObjectFile.cpp | 17
-rw-r--r-- llvm/lib/Object/SFrameParser.cpp | 55
-rw-r--r-- llvm/lib/Passes/PassBuilder.cpp | 4
-rw-r--r-- llvm/lib/Support/CMakeLists.txt | 4
-rw-r--r-- llvm/lib/Support/StringMap.cpp | 12
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 66
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 27
-rw-r--r-- llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp | 26
-rw-r--r-- llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 2
-rw-r--r-- llvm/lib/Target/AArch64/AArch64Subtarget.h | 3
-rw-r--r-- llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPU.td | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 32
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 166
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 34
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 41
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 11
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 186
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 167
-rw-r--r-- llvm/lib/Target/AMDGPU/BUFInstructions.td | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/DSInstructions.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 47
-rw-r--r-- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/FLATInstructions.td | 65
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 185
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 48
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIDefines.h | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 87
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 53
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrFormats.td | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 27
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstructions.td | 87
-rw-r--r-- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 23
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SISchedule.td | 15
-rw-r--r-- llvm/lib/Target/AMDGPU/SMInstructions.td | 26
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 42
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 39
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 122
-rw-r--r-- llvm/lib/Target/AMDGPU/VOPInstructions.td | 2
-rw-r--r-- llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 6
-rw-r--r-- llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp | 14
-rw-r--r-- llvm/lib/Target/DirectX/DXILPrepare.cpp | 23
-rw-r--r-- llvm/lib/Target/DirectX/DXILShaderFlags.cpp | 2
-rw-r--r-- llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp | 36
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 226
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td | 14
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td | 17
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 117
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h | 9
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp | 10
-rw-r--r-- llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 11
-rw-r--r-- llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp | 43
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 43
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 1
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 218
-rw-r--r-- llvm/lib/Target/PowerPC/PPCInstrFuture.td | 40
-rw-r--r-- llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 2
-rw-r--r-- llvm/lib/Target/PowerPC/PPCSubtarget.h | 3
-rw-r--r-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 73
-rw-r--r-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h | 9
-rw-r--r-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp | 19
-rw-r--r-- llvm/lib/Target/RISCV/RISCVCallingConv.td | 14
-rw-r--r-- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 53
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 19
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h | 15
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrFormats.td | 1
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 37
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 4
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 4
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp | 67
-rw-r--r-- llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp | 96
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 6
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSubtarget.h | 6
-rw-r--r-- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 4
-rw-r--r-- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 9
-rw-r--r-- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 9
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 82
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp | 8
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 23
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVUtils.h | 2
-rw-r--r-- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 4
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 26
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp | 15
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h | 4
-rw-r--r-- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 63
-rw-r--r-- llvm/lib/Target/X86/X86AsmPrinter.h | 1
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp | 19
-rw-r--r-- llvm/lib/Target/X86/X86MCInstLower.cpp | 208
-rw-r--r-- llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp | 153
-rw-r--r-- llvm/lib/Transforms/Coroutines/CoroEarly.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 5
-rw-r--r-- llvm/lib/Transforms/Coroutines/SpillUtils.cpp | 4
-rw-r--r-- llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 26
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 7
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 99
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 81
-rw-r--r-- llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 18
-rw-r--r-- llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Instrumentation/MemProfUse.cpp | 253
-rw-r--r-- llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 5
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 21
-rw-r--r-- llvm/lib/Transforms/Scalar/NewGVN.cpp | 8
-rw-r--r-- llvm/lib/Transforms/Scalar/Scalarizer.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 8
-rw-r--r-- llvm/lib/Transforms/Utils/LCSSA.cpp | 18
-rw-r--r-- llvm/lib/Transforms/Utils/Local.cpp | 49
-rw-r--r-- llvm/lib/Transforms/Utils/LoopRotationUtils.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Utils/LoopUtils.cpp | 10
-rw-r--r-- llvm/lib/Transforms/Utils/MemoryOpRemark.cpp | 5
-rw-r--r-- llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Utils/SCCPSolver.cpp | 36
-rw-r--r-- llvm/lib/Transforms/Utils/SSAUpdater.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 10
-rw-r--r-- llvm/lib/Transforms/Utils/ValueMapper.cpp | 7
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 12
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 64
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.h | 12
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 1
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 163
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 60
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 13
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 6
-rw-r--r-- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 107
220 files changed, 4926 insertions, 2069 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 9c1c2c6..e71ba5e 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1801,6 +1801,44 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::nvvm_d2ull_rn:
case Intrinsic::nvvm_d2ull_rp:
case Intrinsic::nvvm_d2ull_rz:
+
+ // NVVM math intrinsics:
+ case Intrinsic::nvvm_ceil_d:
+ case Intrinsic::nvvm_ceil_f:
+ case Intrinsic::nvvm_ceil_ftz_f:
+
+ case Intrinsic::nvvm_fabs:
+ case Intrinsic::nvvm_fabs_ftz:
+
+ case Intrinsic::nvvm_floor_d:
+ case Intrinsic::nvvm_floor_f:
+ case Intrinsic::nvvm_floor_ftz_f:
+
+ case Intrinsic::nvvm_rcp_rm_d:
+ case Intrinsic::nvvm_rcp_rm_f:
+ case Intrinsic::nvvm_rcp_rm_ftz_f:
+ case Intrinsic::nvvm_rcp_rn_d:
+ case Intrinsic::nvvm_rcp_rn_f:
+ case Intrinsic::nvvm_rcp_rn_ftz_f:
+ case Intrinsic::nvvm_rcp_rp_d:
+ case Intrinsic::nvvm_rcp_rp_f:
+ case Intrinsic::nvvm_rcp_rp_ftz_f:
+ case Intrinsic::nvvm_rcp_rz_d:
+ case Intrinsic::nvvm_rcp_rz_f:
+ case Intrinsic::nvvm_rcp_rz_ftz_f:
+
+ case Intrinsic::nvvm_round_d:
+ case Intrinsic::nvvm_round_f:
+ case Intrinsic::nvvm_round_ftz_f:
+
+ case Intrinsic::nvvm_saturate_d:
+ case Intrinsic::nvvm_saturate_f:
+ case Intrinsic::nvvm_saturate_ftz_f:
+
+ case Intrinsic::nvvm_sqrt_f:
+ case Intrinsic::nvvm_sqrt_rn_d:
+ case Intrinsic::nvvm_sqrt_rn_f:
+ case Intrinsic::nvvm_sqrt_rn_ftz_f:
return !Call->isStrictFP();
// Sign operations are actually bitwise operations, they do not raise
@@ -1818,6 +1856,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::nearbyint:
case Intrinsic::rint:
case Intrinsic::canonicalize:
+
// Constrained intrinsics can be folded if FP environment is known
// to compiler.
case Intrinsic::experimental_constrained_fma:
@@ -1965,22 +2004,56 @@ inline bool llvm_fenv_testexcept() {
return false;
}
-static APFloat FTZPreserveSign(const APFloat &V) {
+static const APFloat FTZPreserveSign(const APFloat &V) {
if (V.isDenormal())
return APFloat::getZero(V.getSemantics(), V.isNegative());
return V;
}
-Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V,
- Type *Ty) {
+static const APFloat FlushToPositiveZero(const APFloat &V) {
+ if (V.isDenormal())
+ return APFloat::getZero(V.getSemantics(), false);
+ return V;
+}
+
+static const APFloat
+FlushWithDenormKind(const APFloat &V,
+ DenormalMode::DenormalModeKind DenormKind) {
+ assert(DenormKind != DenormalMode::DenormalModeKind::Invalid &&
+ DenormKind != DenormalMode::DenormalModeKind::Dynamic);
+ switch (DenormKind) {
+ case DenormalMode::DenormalModeKind::IEEE:
+ return V;
+ case DenormalMode::DenormalModeKind::PreserveSign:
+ return FTZPreserveSign(V);
+ case DenormalMode::DenormalModeKind::PositiveZero:
+ return FlushToPositiveZero(V);
+ default:
+ llvm_unreachable("Invalid denormal mode!");
+ }
+}
+
+Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V, Type *Ty,
+ DenormalMode DenormMode = DenormalMode::getIEEE()) {
+ if (!DenormMode.isValid() ||
+ DenormMode.Input == DenormalMode::DenormalModeKind::Dynamic ||
+ DenormMode.Output == DenormalMode::DenormalModeKind::Dynamic)
+ return nullptr;
+
llvm_fenv_clearexcept();
- double Result = NativeFP(V.convertToDouble());
+ auto Input = FlushWithDenormKind(V, DenormMode.Input);
+ double Result = NativeFP(Input.convertToDouble());
if (llvm_fenv_testexcept()) {
llvm_fenv_clearexcept();
return nullptr;
}
- return GetConstantFoldFPValue(Result, Ty);
+ Constant *Output = GetConstantFoldFPValue(Result, Ty);
+ if (DenormMode.Output == DenormalMode::DenormalModeKind::IEEE)
+ return Output;
+ const auto *CFP = static_cast<ConstantFP *>(Output);
+ const auto Res = FlushWithDenormKind(CFP->getValueAPF(), DenormMode.Output);
+ return ConstantFP::get(Ty->getContext(), Res);
}
#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128)
@@ -2550,6 +2623,94 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
return ConstantFoldFP(atan, APF, Ty);
case Intrinsic::sqrt:
return ConstantFoldFP(sqrt, APF, Ty);
+
+ // NVVM Intrinsics:
+ case Intrinsic::nvvm_ceil_ftz_f:
+ case Intrinsic::nvvm_ceil_f:
+ case Intrinsic::nvvm_ceil_d:
+ return ConstantFoldFP(
+ ceil, APF, Ty,
+ nvvm::GetNVVMDenromMode(
+ nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
+
+ case Intrinsic::nvvm_fabs_ftz:
+ case Intrinsic::nvvm_fabs:
+ return ConstantFoldFP(
+ fabs, APF, Ty,
+ nvvm::GetNVVMDenromMode(
+ nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
+
+ case Intrinsic::nvvm_floor_ftz_f:
+ case Intrinsic::nvvm_floor_f:
+ case Intrinsic::nvvm_floor_d:
+ return ConstantFoldFP(
+ floor, APF, Ty,
+ nvvm::GetNVVMDenromMode(
+ nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
+
+ case Intrinsic::nvvm_rcp_rm_ftz_f:
+ case Intrinsic::nvvm_rcp_rn_ftz_f:
+ case Intrinsic::nvvm_rcp_rp_ftz_f:
+ case Intrinsic::nvvm_rcp_rz_ftz_f:
+ case Intrinsic::nvvm_rcp_rm_d:
+ case Intrinsic::nvvm_rcp_rm_f:
+ case Intrinsic::nvvm_rcp_rn_d:
+ case Intrinsic::nvvm_rcp_rn_f:
+ case Intrinsic::nvvm_rcp_rp_d:
+ case Intrinsic::nvvm_rcp_rp_f:
+ case Intrinsic::nvvm_rcp_rz_d:
+ case Intrinsic::nvvm_rcp_rz_f: {
+ APFloat::roundingMode RoundMode = nvvm::GetRCPRoundingMode(IntrinsicID);
+ bool IsFTZ = nvvm::RCPShouldFTZ(IntrinsicID);
+
+ auto Denominator = IsFTZ ? FTZPreserveSign(APF) : APF;
+ APFloat Res = APFloat::getOne(APF.getSemantics());
+ APFloat::opStatus Status = Res.divide(Denominator, RoundMode);
+
+ if (Status == APFloat::opOK || Status == APFloat::opInexact) {
+ if (IsFTZ)
+ Res = FTZPreserveSign(Res);
+ return ConstantFP::get(Ty->getContext(), Res);
+ }
+ return nullptr;
+ }
+
+ case Intrinsic::nvvm_round_ftz_f:
+ case Intrinsic::nvvm_round_f:
+ case Intrinsic::nvvm_round_d: {
+ // Use APFloat implementation instead of native libm call, as some
+ // implementations (e.g. on PPC) do not preserve the sign of negative 0.
+ bool IsFTZ = nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID);
+ auto V = IsFTZ ? FTZPreserveSign(APF) : APF;
+ V.roundToIntegral(APFloat::rmNearestTiesToAway);
+ return ConstantFP::get(Ty->getContext(), V);
+ }
+
+ case Intrinsic::nvvm_saturate_ftz_f:
+ case Intrinsic::nvvm_saturate_d:
+ case Intrinsic::nvvm_saturate_f: {
+ bool IsFTZ = nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID);
+ auto V = IsFTZ ? FTZPreserveSign(APF) : APF;
+ if (V.isNegative() || V.isZero() || V.isNaN())
+ return ConstantFP::getZero(Ty);
+ APFloat One = APFloat::getOne(APF.getSemantics());
+ if (V > One)
+ return ConstantFP::get(Ty->getContext(), One);
+ return ConstantFP::get(Ty->getContext(), APF);
+ }
+
+ case Intrinsic::nvvm_sqrt_rn_ftz_f:
+ case Intrinsic::nvvm_sqrt_f:
+ case Intrinsic::nvvm_sqrt_rn_d:
+ case Intrinsic::nvvm_sqrt_rn_f:
+ if (APF.isNegative())
+ return nullptr;
+ return ConstantFoldFP(
+ sqrt, APF, Ty,
+ nvvm::GetNVVMDenromMode(
+ nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
+
+ // AMDGCN Intrinsics:
case Intrinsic::amdgcn_cos:
case Intrinsic::amdgcn_sin: {
double V = getValueAsDouble(Op);
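
A note on the DenormalMode handling introduced above: before calling the host
libm function, ConstantFoldFP now flushes denormal inputs (and, if requested,
the result) according to the intrinsic's FTZ behaviour. A minimal standalone
sketch of that flushing step, using only the public APFloat calls the patch
itself relies on (the helper and main function here are illustrative, not part
of the patch):

#include "llvm/ADT/APFloat.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Mirrors FTZPreserveSign above: a denormal collapses to a zero that keeps
// the input's sign.
static APFloat flushDenormalPreserveSign(const APFloat &V) {
  if (V.isDenormal())
    return APFloat::getZero(V.getSemantics(), V.isNegative());
  return V;
}

int main() {
  // Smallest-magnitude negative single-precision value, which is denormal.
  APFloat Denorm =
      APFloat::getSmallest(APFloat::IEEEsingle(), /*Negative=*/true);
  APFloat Flushed = flushDenormalPreserveSign(Denorm);
  // Prints a negative zero: the magnitude is flushed, the sign is preserved.
  outs() << Flushed.convertToFloat() << "\n";
  return 0;
}
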
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 39f74be..8be5de3 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -941,10 +941,30 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
m_Intrinsic<Intrinsic::minimumnum>(m_Value(), m_Value())) ||
match(I, m_Intrinsic<Intrinsic::maximumnum>(m_Value(), m_Value()));
};
- if (isIntMinMaxRecurrenceKind(Kind) ||
- (HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind)))
+ if (isIntMinMaxRecurrenceKind(Kind))
return isMinMaxPattern(I, Kind, Prev);
- else if (isFMulAddIntrinsic(I))
+ if (isFPMinMaxRecurrenceKind(Kind)) {
+ InstDesc Res = isMinMaxPattern(I, Kind, Prev);
+ if (!Res.isRecurrence())
+ return InstDesc(false, I);
+ if (HasRequiredFMF())
+ return Res;
+ // We may be able to vectorize FMax/FMin reductions using maxnum/minnum
+ // intrinsics with extra checks ensuring the vector loop handles only
+ // non-NaN inputs.
+ if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) {
+ assert(Kind == RecurKind::FMax &&
+ "unexpected recurrence kind for maxnum");
+ return InstDesc(I, RecurKind::FMaxNum);
+ }
+ if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) {
+ assert(Kind == RecurKind::FMin &&
+ "unexpected recurrence kind for minnum");
+ return InstDesc(I, RecurKind::FMinNum);
+ }
+ return InstDesc(false, I);
+ }
+ if (isFMulAddIntrinsic(I))
return InstDesc(Kind == RecurKind::FMulAdd, I,
I->hasAllowReassoc() ? nullptr : I);
return InstDesc(false, I);
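
The FMaxNum/FMinNum handling above leans on the PatternMatch idiom already
used throughout isRecurrenceInstr. The same match in isolation, as a sketch
(the free function name is invented):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// True for calls of the form
//   %m = call float @llvm.maxnum.f32(float %a, float %b)
// i.e. the shape checked before reporting RecurKind::FMaxNum.
static bool isMaxNumCall(Instruction *I) {
  return match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()));
}
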
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index f3a32d3..14be385 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -589,11 +589,11 @@ void RuntimePointerChecking::groupChecks(
// dependence. Not grouping the checks for a[i] and a[i + 9000] allows
// us to perform an accurate check in this case.
//
- // The above case requires that we have an UnknownDependence between
- // accesses to the same underlying object. This cannot happen unless
- // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
- // is also false. In this case we will use the fallback path and create
- // separate checking groups for all pointers.
+ // In the above case, we have a non-constant distance and an Unknown
+ // dependence between accesses to the same underlying object, and could retry
+ // with runtime checks. Therefore UseDependencies is false. In this case we
+ // will use the fallback path and create separate checking groups for all
+ // pointers.
// If we don't have the dependency partitions, construct a new
// checking pointer group for each pointer. This is also required
@@ -819,7 +819,7 @@ public:
/// perform dependency checking.
///
/// Note that this can later be cleared if we retry memcheck analysis without
- /// dependency checking (i.e. FoundNonConstantDistanceDependence).
+ /// dependency checking (i.e. ShouldRetryWithRuntimeChecks).
bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); }
/// We decided that no dependence analysis would be used. Reset the state.
@@ -896,7 +896,7 @@ private:
///
/// Note that, this is different from isDependencyCheckNeeded. When we retry
/// memcheck analysis without dependency checking
- /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
+ /// (i.e. ShouldRetryWithRuntimeChecks), isDependencyCheckNeeded is
/// cleared while this remains set if we have potentially dependent accesses.
bool IsRTCheckAnalysisNeeded = false;
@@ -2079,11 +2079,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
if (StrideAScaled == StrideBScaled)
CommonStride = StrideAScaled;
- // TODO: FoundNonConstantDistanceDependence is used as a necessary condition
- // to consider retrying with runtime checks. Historically, we did not set it
- // when (unscaled) strides were different but there is no inherent reason to.
+ // TODO: Historically, we didn't retry with runtime checks when (unscaled)
+ // strides were different but there is no inherent reason to.
if (!isa<SCEVConstant>(Dist))
- FoundNonConstantDistanceDependence |= StrideAPtrInt == StrideBPtrInt;
+ ShouldRetryWithRuntimeChecks |= StrideAPtrInt == StrideBPtrInt;
// If distance is a SCEVCouldNotCompute, return Unknown immediately.
if (isa<SCEVCouldNotCompute>(Dist)) {
@@ -2712,7 +2711,7 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
DepsAreSafe =
DepChecker->areDepsSafe(DepCands, Accesses.getDependenciesToCheck());
- if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) {
+ if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeChecks()) {
LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
// Clear the dependency checks. We assume they are not needed.
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index c08024a..b3c8a7d 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -157,6 +157,8 @@ void CallStackTrie::addCallStack(
}
void CallStackTrie::addCallStack(MDNode *MIB) {
+ // Note that we are building this from existing MD_memprof metadata.
+ BuiltFromExistingMetadata = true;
MDNode *StackMD = getMIBStackNode(MIB);
assert(StackMD);
std::vector<uint64_t> CallStack;
@@ -187,8 +189,9 @@ void CallStackTrie::addCallStack(MDNode *MIB) {
static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
AllocationType AllocType,
ArrayRef<ContextTotalSize> ContextSizeInfo,
- const uint64_t MaxColdSize, uint64_t &TotalBytes,
- uint64_t &ColdBytes) {
+ const uint64_t MaxColdSize,
+ bool BuiltFromExistingMetadata,
+ uint64_t &TotalBytes, uint64_t &ColdBytes) {
SmallVector<Metadata *> MIBPayload(
{buildCallstackMetadata(MIBCallStack, Ctx)});
MIBPayload.push_back(
@@ -197,8 +200,9 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
if (ContextSizeInfo.empty()) {
// The profile matcher should have provided context size info if there was a
// MinCallsiteColdBytePercent < 100. Here we check >=100 to gracefully
- // handle a user-provided percent larger than 100.
- assert(MinCallsiteColdBytePercent >= 100);
+ // handle a user-provided percent larger than 100. However, we may not have
+ // this information if we built the Trie from existing MD_memprof metadata.
+ assert(BuiltFromExistingMetadata || MinCallsiteColdBytePercent >= 100);
return MDNode::get(Ctx, MIBPayload);
}
@@ -252,9 +256,19 @@ void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) {
static void saveFilteredNewMIBNodes(std::vector<Metadata *> &NewMIBNodes,
std::vector<Metadata *> &SavedMIBNodes,
unsigned CallerContextLength,
- uint64_t TotalBytes, uint64_t ColdBytes) {
+ uint64_t TotalBytes, uint64_t ColdBytes,
+ bool BuiltFromExistingMetadata) {
const bool MostlyCold =
- MinCallsiteColdBytePercent < 100 &&
+ // If we have built the Trie from existing MD_memprof metadata, we may or
+ // may not have context size information (in which case ColdBytes and
+ // TotalBytes are 0, which is not also guarded against below). Even if we
+ // do have some context size information from the metadata, we have
+ // already gone through a round of discarding of small non-cold contexts
+ // during matching, and it would be overly aggressive to do it again, and
+ // we also want to maintain the same behavior with and without reporting
+ // of hinted bytes enabled.
+ !BuiltFromExistingMetadata && MinCallsiteColdBytePercent < 100 &&
+ ColdBytes > 0 &&
ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes;
// In the simplest case, with pruning disabled, keep all the new MIB nodes.
@@ -386,9 +400,9 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
if (hasSingleAllocType(Node->AllocTypes)) {
std::vector<ContextTotalSize> ContextSizeInfo;
collectContextSizeInfo(Node, ContextSizeInfo);
- MIBNodes.push_back(
- createMIBNode(Ctx, MIBCallStack, (AllocationType)Node->AllocTypes,
- ContextSizeInfo, MaxColdSize, TotalBytes, ColdBytes));
+ MIBNodes.push_back(createMIBNode(
+ Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo,
+ MaxColdSize, BuiltFromExistingMetadata, TotalBytes, ColdBytes));
return true;
}
@@ -416,7 +430,8 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
// Pass in the stack length of the MIB nodes added for the immediate caller,
// which is the current stack length plus 1.
saveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1,
- CallerTotalBytes, CallerColdBytes);
+ CallerTotalBytes, CallerColdBytes,
+ BuiltFromExistingMetadata);
TotalBytes += CallerTotalBytes;
ColdBytes += CallerColdBytes;
@@ -441,9 +456,9 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
return false;
std::vector<ContextTotalSize> ContextSizeInfo;
collectContextSizeInfo(Node, ContextSizeInfo);
- MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold,
- ContextSizeInfo, MaxColdSize, TotalBytes,
- ColdBytes));
+ MIBNodes.push_back(createMIBNode(
+ Ctx, MIBCallStack, AllocationType::NotCold, ContextSizeInfo, MaxColdSize,
+ BuiltFromExistingMetadata, TotalBytes, ColdBytes));
return true;
}
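
A worked example of the MostlyCold test above, with invented byte counts,
showing what the new ColdBytes > 0 guard changes when no context size
information is present:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t MinCallsiteColdBytePercent = 80;

  // With size info: 900 of 1000 bytes are cold and 900*100 >= 80*1000, so
  // the callsite counts as mostly cold.
  uint64_t TotalBytes = 1000, ColdBytes = 900;
  bool MostlyCold = ColdBytes > 0 &&
                    ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes;
  std::printf("with size info: %d\n", MostlyCold);

  // Without size info both counters are zero; 0 >= 0 would otherwise hold,
  // so the ColdBytes > 0 guard keeps such contexts from being treated as
  // mostly cold.
  TotalBytes = ColdBytes = 0;
  MostlyCold = ColdBytes > 0 &&
               ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes;
  std::printf("without size info: %d\n", MostlyCold);
  return 0;
}
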
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 61a322b..af85ce4 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -7912,6 +7912,8 @@ bool llvm::intrinsicPropagatesPoison(Intrinsic::ID IID) {
case Intrinsic::ushl_sat:
case Intrinsic::smul_fix:
case Intrinsic::smul_fix_sat:
+ case Intrinsic::umul_fix:
+ case Intrinsic::umul_fix_sat:
case Intrinsic::pow:
case Intrinsic::powi:
case Intrinsic::sin:
@@ -7928,6 +7930,22 @@ bool llvm::intrinsicPropagatesPoison(Intrinsic::ID IID) {
case Intrinsic::atan2:
case Intrinsic::canonicalize:
case Intrinsic::sqrt:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::exp10:
+ case Intrinsic::log:
+ case Intrinsic::log2:
+ case Intrinsic::log10:
+ case Intrinsic::modf:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::trunc:
+ case Intrinsic::rint:
+ case Intrinsic::nearbyint:
+ case Intrinsic::round:
+ case Intrinsic::roundeven:
+ case Intrinsic::lrint:
+ case Intrinsic::llrint:
return true;
default:
return false;
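
The additions above only extend the switch; the query itself is unchanged. A
minimal use of the helper, which now also returns true for the newly listed
intrinsics (a standalone sketch, assuming the usual LLVM headers and
libraries):

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  // exp2 and llrint are among the intrinsics added to the switch above.
  llvm::outs() << llvm::intrinsicPropagatesPoison(llvm::Intrinsic::exp2) << " "
               << llvm::intrinsicPropagatesPoison(llvm::Intrinsic::llrint)
               << "\n";
  return 0;
}
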
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index ce813e1..520c6a0 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -679,6 +679,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(amdgpu_cs_chain_preserve);
KEYWORD(amdgpu_kernel);
KEYWORD(amdgpu_gfx);
+ KEYWORD(amdgpu_gfx_whole_wave);
KEYWORD(tailcc);
KEYWORD(m68k_rtdcc);
KEYWORD(graalcc);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index b7f6950..13bef1f 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -2272,6 +2272,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
CC = CallingConv::AMDGPU_CS_ChainPreserve;
break;
case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break;
+ case lltok::kw_amdgpu_gfx_whole_wave:
+ CC = CallingConv::AMDGPU_Gfx_WholeWave;
+ break;
case lltok::kw_tailcc: CC = CallingConv::Tail; break;
case lltok::kw_m68k_rtdcc: CC = CallingConv::M68k_RTD; break;
case lltok::kw_graalcc: CC = CallingConv::GRAAL; break;
@@ -4783,9 +4786,13 @@ struct MDField : public MDFieldImpl<Metadata *> {
};
struct MDStringField : public MDFieldImpl<MDString *> {
- bool AllowEmpty;
- MDStringField(bool AllowEmpty = true)
- : ImplTy(nullptr), AllowEmpty(AllowEmpty) {}
+ enum class EmptyIs {
+ Null, //< Allow empty input string, map to nullptr
+ Empty, //< Allow empty input string, map to an empty MDString
+ Error, //< Disallow empty string, map to an error
+ } EmptyIs;
+ MDStringField(enum EmptyIs EmptyIs = EmptyIs::Null)
+ : ImplTy(nullptr), EmptyIs(EmptyIs) {}
};
struct MDFieldList : public MDFieldImpl<SmallVector<Metadata *, 4>> {
@@ -5257,10 +5264,19 @@ bool LLParser::parseMDField(LocTy Loc, StringRef Name, MDStringField &Result) {
if (parseStringConstant(S))
return true;
- if (!Result.AllowEmpty && S.empty())
- return error(ValueLoc, "'" + Name + "' cannot be empty");
+ if (S.empty()) {
+ switch (Result.EmptyIs) {
+ case MDStringField::EmptyIs::Null:
+ Result.assign(nullptr);
+ return false;
+ case MDStringField::EmptyIs::Empty:
+ break;
+ case MDStringField::EmptyIs::Error:
+ return error(ValueLoc, "'" + Name + "' cannot be empty");
+ }
+ }
- Result.assign(S.empty() ? nullptr : MDString::get(Context, S));
+ Result.assign(MDString::get(Context, S));
return false;
}
@@ -5778,7 +5794,7 @@ bool LLParser::parseDIFile(MDNode *&Result, bool IsDistinct) {
REQUIRED(directory, MDStringField, ); \
OPTIONAL(checksumkind, ChecksumKindField, (DIFile::CSK_MD5)); \
OPTIONAL(checksum, MDStringField, ); \
- OPTIONAL(source, MDStringField, );
+ OPTIONAL(source, MDStringField, (MDStringField::EmptyIs::Empty));
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -6062,7 +6078,7 @@ bool LLParser::parseDITemplateValueParameter(MDNode *&Result, bool IsDistinct) {
/// declaration: !4, align: 8)
bool LLParser::parseDIGlobalVariable(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
- OPTIONAL(name, MDStringField, (/* AllowEmpty */ false)); \
+ OPTIONAL(name, MDStringField, (MDStringField::EmptyIs::Error)); \
OPTIONAL(scope, MDField, ); \
OPTIONAL(linkageName, MDStringField, ); \
OPTIONAL(file, MDField, ); \
diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt
index 38ba2d9..4b2debb 100644
--- a/llvm/lib/BinaryFormat/CMakeLists.txt
+++ b/llvm/lib/BinaryFormat/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMBinaryFormat
MsgPackDocumentYAML.cpp
MsgPackReader.cpp
MsgPackWriter.cpp
+ SFrame.cpp
Wasm.cpp
XCOFF.cpp
diff --git a/llvm/lib/BinaryFormat/SFrame.cpp b/llvm/lib/BinaryFormat/SFrame.cpp
new file mode 100644
index 0000000..3b436af
--- /dev/null
+++ b/llvm/lib/BinaryFormat/SFrame.cpp
@@ -0,0 +1,37 @@
+//===-- SFrame.cpp -----------------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/SFrame.h"
+#include "llvm/Support/ScopedPrinter.h"
+
+using namespace llvm;
+
+ArrayRef<EnumEntry<sframe::Version>> sframe::getVersions() {
+ static constexpr EnumEntry<Version> Versions[] = {
+#define HANDLE_SFRAME_VERSION(CODE, NAME) {#NAME, sframe::Version::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+
+ return ArrayRef(Versions);
+}
+
+ArrayRef<EnumEntry<sframe::Flags>> sframe::getFlags() {
+ static constexpr EnumEntry<sframe::Flags> Flags[] = {
+#define HANDLE_SFRAME_FLAG(CODE, NAME) {#NAME, sframe::Flags::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+ return ArrayRef(Flags);
+}
+
+ArrayRef<EnumEntry<sframe::ABI>> sframe::getABIs() {
+ static constexpr EnumEntry<sframe::ABI> ABIs[] = {
+#define HANDLE_SFRAME_ABI(CODE, NAME) {#NAME, sframe::ABI::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+ return ArrayRef(ABIs);
+}
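
The new SFrame.cpp builds each name table with the .def/X-macro technique: a
HANDLE_SFRAME_* macro is defined immediately before including
SFrameConstants.def, which expands it once per constant. A self-contained
sketch of the same technique with an invented constant list (no LLVM headers
required):

#include <cstdio>

// Stand-in for an SFrameConstants.def-style list: one X(...) per constant.
#define MY_VERSIONS(X)                                                         \
  X(1, V1)                                                                     \
  X(2, V2)

enum class Version : int {
#define HANDLE_VERSION(CODE, NAME) NAME = CODE,
  MY_VERSIONS(HANDLE_VERSION)
#undef HANDLE_VERSION
};

struct EnumEntry {
  const char *Name;
  Version V;
};

static const EnumEntry Versions[] = {
#define HANDLE_VERSION(CODE, NAME) {#NAME, Version::NAME},
    MY_VERSIONS(HANDLE_VERSION)
#undef HANDLE_VERSION
};

int main() {
  for (const EnumEntry &E : Versions)
    std::printf("%s = %d\n", E.Name, static_cast<int>(E.V));
  return 0;
}
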
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 66ecc69..f763683 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -293,10 +293,18 @@ static Expected<bool> hasObjCCategoryInModule(BitstreamCursor &Stream) {
std::string S;
if (convertToString(Record, 0, S))
return error("Invalid section name record");
+
// Check for the i386 and other (x86_64, ARM) conventions
- if (S.find("__DATA,__objc_catlist") != std::string::npos ||
- S.find("__OBJC,__category") != std::string::npos ||
- S.find("__TEXT,__swift") != std::string::npos)
+
+ auto [Segment, Section] = StringRef(S).split(",");
+ Segment = Segment.trim();
+ Section = Section.trim();
+
+ if (Segment == "__DATA" && Section.starts_with("__objc_catlist"))
+ return true;
+ if (Segment == "__OBJC" && Section.starts_with("__category"))
+ return true;
+ if (Segment == "__TEXT" && Section.starts_with("__swift"))
return true;
break;
}
@@ -7116,9 +7124,11 @@ Error BitcodeReader::materializeModule() {
if (CallInst *CI = dyn_cast<CallInst>(U))
UpgradeIntrinsicCall(CI, I.second);
}
- if (!I.first->use_empty())
- I.first->replaceAllUsesWith(I.second);
- I.first->eraseFromParent();
+ if (I.first != I.second) {
+ if (!I.first->use_empty())
+ I.first->replaceAllUsesWith(I.second);
+ I.first->eraseFromParent();
+ }
}
UpgradedIntrinsics.clear();
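
The stricter section-name check above splits the "segment,section" string and
compares the pieces exactly instead of substring-searching the whole name. The
same StringRef calls in isolation (standalone sketch; the inputs are made up):

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include <string>
using namespace llvm;

int main() {
  std::string S = "__DATA,__objc_catlist";
  auto [Segment, Section] = StringRef(S).split(",");
  Segment = Segment.trim();
  Section = Section.trim();

  bool IsObjCCategoryList =
      Segment == "__DATA" && Section.starts_with("__objc_catlist");
  // Prints 1. A name such as "NOT__DATA,__objc_catlist" no longer matches,
  // although the old substring search would have accepted it.
  outs() << IsObjCCategoryList << "\n";
  return 0;
}
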
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 76a1d8c..f1d3e96 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -809,7 +809,7 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
// If we have a bss global going to a section that supports the
// zerofill directive, do so here.
- if (GVKind.isBSS() && MAI->isMachO() && TheSection->isVirtualSection()) {
+ if (GVKind.isBSS() && MAI->isMachO() && TheSection->isBssSection()) {
if (Size == 0)
Size = 1; // zerofill of 0 bytes is undefined.
emitLinkage(GV, GVSym);
@@ -1868,6 +1868,7 @@ void AsmPrinter::emitFunctionBody() {
OutStreamer->emitLabel(MI.getOperand(0).getMCSymbol());
break;
case TargetOpcode::EH_LABEL:
+ OutStreamer->AddComment("EH_LABEL");
OutStreamer->emitLabel(MI.getOperand(0).getMCSymbol());
// For AsynchEH, insert a Nop if followed by a trap inst
// Or the exception won't be caught.
diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp
index 618deef..4bf3bdf 100644
--- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp
@@ -18,6 +18,11 @@
#include "llvm/MC/MCPseudoProbe.h"
#include "llvm/MC/MCStreamer.h"
+#ifndef NDEBUG
+#include "llvm/IR/Module.h"
+#include "llvm/Support/WithColor.h"
+#endif
+
using namespace llvm;
void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index,
@@ -35,6 +40,9 @@ void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index,
uint64_t &CallerGuid = NameGuidMap[Name];
if (!CallerGuid)
CallerGuid = Function::getGUIDAssumingExternalLinkage(Name);
+#ifndef NDEBUG
+ verifyGuidExistenceInDesc(CallerGuid, Name);
+#endif
uint64_t CallerProbeId = PseudoProbeDwarfDiscriminator::extractProbeIndex(
InlinedAt->getDiscriminator());
ReversedInlineStack.emplace_back(CallerGuid, CallerProbeId);
@@ -51,4 +59,28 @@ void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index,
SmallVector<InlineSite, 8> InlineStack(llvm::reverse(ReversedInlineStack));
Asm->OutStreamer->emitPseudoProbe(Guid, Index, Type, Attr, Discriminator,
InlineStack, Asm->CurrentFnSym);
+#ifndef NDEBUG
+ verifyGuidExistenceInDesc(
+ Guid, DebugLoc ? DebugLoc->getSubprogramLinkageName() : "");
+#endif
+}
+
+#ifndef NDEBUG
+void PseudoProbeHandler::verifyGuidExistenceInDesc(uint64_t Guid,
+ StringRef FuncName) {
+ NamedMDNode *Desc = Asm->MF->getFunction().getParent()->getNamedMetadata(
+ PseudoProbeDescMetadataName);
+ assert(Desc && "pseudo probe does not exist");
+
+ // Keep DescGuidSet up to date.
+ for (size_t I = DescGuidSet.size(), E = Desc->getNumOperands(); I != E; ++I) {
+ const auto *MD = cast<MDNode>(Desc->getOperand(I));
+ auto *ID = mdconst::extract<ConstantInt>(MD->getOperand(0));
+ DescGuidSet.insert(ID->getZExtValue());
+ }
+
+ if (!DescGuidSet.contains(Guid))
+ WithColor::warning() << "Guid:" << Guid << " Name:" << FuncName
+ << " does not exist in pseudo probe desc\n";
}
+#endif
diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
index f11b552..e950b23 100644
--- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
+++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
@@ -15,6 +15,10 @@
#include "llvm/ADT/DenseMap.h"
+#ifndef NDEBUG
+#include "llvm/ADT/DenseSet.h"
+#endif
+
namespace llvm {
class AsmPrinter;
@@ -26,6 +30,13 @@ class PseudoProbeHandler {
// Name to GUID map, used as caching/memoization for speed.
DenseMap<StringRef, uint64_t> NameGuidMap;
+#ifndef NDEBUG
+ // All GUID in llvm.pseudo_probe_desc.
+ DenseSet<uint64_t> DescGuidSet;
+
+ void verifyGuidExistenceInDesc(uint64_t Guid, StringRef FuncName);
+#endif
+
public:
PseudoProbeHandler(AsmPrinter *A) : Asm(A) {};
diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
index dccd71f..13fd270 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -323,12 +323,6 @@ const MCExpr *WinException::getLabel(const MCSymbol *Label) {
Asm->OutContext);
}
-const MCExpr *WinException::getLabelPlusOne(const MCSymbol *Label) {
- return MCBinaryExpr::createAdd(getLabel(Label),
- MCConstantExpr::create(1, Asm->OutContext),
- Asm->OutContext);
-}
-
const MCExpr *WinException::getOffset(const MCSymbol *OffsetOf,
const MCSymbol *OffsetFrom) {
return MCBinaryExpr::createSub(
@@ -655,7 +649,7 @@ void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
AddComment("LabelStart");
OS.emitValue(getLabel(BeginLabel), 4);
AddComment("LabelEnd");
- OS.emitValue(getLabelPlusOne(EndLabel), 4);
+ OS.emitValue(getLabel(EndLabel), 4);
AddComment(UME.IsFinally ? "FinallyFunclet" : UME.Filter ? "FilterFunction"
: "CatchAll");
OS.emitValue(FilterOrFinally, 4);
@@ -950,13 +944,7 @@ void WinException::computeIP2StateTable(
if (!ChangeLabel)
ChangeLabel = StateChange.PreviousEndLabel;
// Emit an entry indicating that PCs after 'Label' have this EH state.
- // NOTE: On ARM architectures, the StateFromIp automatically takes into
- // account that the return address is after the call instruction (whose EH
- // state we should be using), but on other platforms we need to +1 to the
- // label so that we are using the correct EH state.
- const MCExpr *LabelExpression = (isAArch64 || isThumb)
- ? getLabel(ChangeLabel)
- : getLabelPlusOne(ChangeLabel);
+ const MCExpr *LabelExpression = getLabel(ChangeLabel);
IPToStateTable.push_back(
std::make_pair(LabelExpression, StateChange.NewState));
// FIXME: assert that NewState is between CatchLow and CatchHigh.
diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.h b/llvm/lib/CodeGen/AsmPrinter/WinException.h
index 638589a..47dd30c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WinException.h
+++ b/llvm/lib/CodeGen/AsmPrinter/WinException.h
@@ -80,7 +80,6 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer {
const MCExpr *create32bitRef(const MCSymbol *Value);
const MCExpr *create32bitRef(const GlobalValue *GV);
const MCExpr *getLabel(const MCSymbol *Label);
- const MCExpr *getLabelPlusOne(const MCSymbol *Label);
const MCExpr *getOffset(const MCSymbol *OffsetOf, const MCSymbol *OffsetFrom);
const MCExpr *getOffsetPlusOne(const MCSymbol *OffsetOf,
const MCSymbol *OffsetFrom);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index dc81843..c21058c 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -3571,9 +3571,7 @@ class TypePromotionTransaction {
}
// Record the debug uses separately. They are not in the instruction's
// use list, but they are replaced by RAUW.
- SmallVector<DbgValueInst *> DbgValues;
- findDbgValues(DbgValues, Inst, &DbgVariableRecords);
- assert(DbgValues.empty());
+ findDbgValues(Inst, DbgVariableRecords);
// Now, we can replace the uses.
Inst->replaceAllUsesWith(New);
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index 1286af8..974fc40 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -1884,6 +1884,14 @@ unsigned GISelValueTracking::computeNumSignBits(Register R,
}
break;
}
+ case TargetOpcode::G_ASHR: {
+ Register Src1 = MI.getOperand(1).getReg();
+ Register Src2 = MI.getOperand(2).getReg();
+ FirstAnswer = computeNumSignBits(Src1, DemandedElts, Depth + 1);
+ if (auto C = getValidMinimumShiftAmount(Src2, DemandedElts, Depth + 1))
+ FirstAnswer = std::min<uint64_t>(FirstAnswer + *C, TyBits);
+ break;
+ }
case TargetOpcode::G_TRUNC: {
Register Src = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(Src);
@@ -2053,6 +2061,64 @@ unsigned GISelValueTracking::computeNumSignBits(Register R, unsigned Depth) {
return computeNumSignBits(R, DemandedElts, Depth);
}
+std::optional<ConstantRange> GISelValueTracking::getValidShiftAmountRange(
+ Register R, const APInt &DemandedElts, unsigned Depth) {
+ // Shifting more than the bitwidth is not valid.
+ MachineInstr &MI = *MRI.getVRegDef(R);
+ unsigned Opcode = MI.getOpcode();
+
+ LLT Ty = MRI.getType(R);
+ unsigned BitWidth = Ty.getScalarSizeInBits();
+
+ if (Opcode == TargetOpcode::G_CONSTANT) {
+ const APInt &ShAmt = MI.getOperand(1).getCImm()->getValue();
+ if (ShAmt.uge(BitWidth))
+ return std::nullopt;
+ return ConstantRange(ShAmt);
+ }
+
+ if (Opcode == TargetOpcode::G_BUILD_VECTOR) {
+ const APInt *MinAmt = nullptr, *MaxAmt = nullptr;
+ for (unsigned I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
+ if (!DemandedElts[I])
+ continue;
+ MachineInstr *Op = MRI.getVRegDef(MI.getOperand(I + 1).getReg());
+ if (Op->getOpcode() != TargetOpcode::G_CONSTANT) {
+ MinAmt = MaxAmt = nullptr;
+ break;
+ }
+
+ const APInt &ShAmt = Op->getOperand(1).getCImm()->getValue();
+ if (ShAmt.uge(BitWidth))
+ return std::nullopt;
+ if (!MinAmt || MinAmt->ugt(ShAmt))
+ MinAmt = &ShAmt;
+ if (!MaxAmt || MaxAmt->ult(ShAmt))
+ MaxAmt = &ShAmt;
+ }
+ assert(((!MinAmt && !MaxAmt) || (MinAmt && MaxAmt)) &&
+ "Failed to find matching min/max shift amounts");
+ if (MinAmt && MaxAmt)
+ return ConstantRange(*MinAmt, *MaxAmt + 1);
+ }
+
+ // Use computeKnownBits to find a hidden constant/knownbits (usually type
+ // legalized). e.g. Hidden behind multiple bitcasts/build_vector/casts etc.
+ KnownBits KnownAmt = getKnownBits(R, DemandedElts, Depth);
+ if (KnownAmt.getMaxValue().ult(BitWidth))
+ return ConstantRange::fromKnownBits(KnownAmt, /*IsSigned=*/false);
+
+ return std::nullopt;
+}
+
+std::optional<uint64_t> GISelValueTracking::getValidMinimumShiftAmount(
+ Register R, const APInt &DemandedElts, unsigned Depth) {
+ if (std::optional<ConstantRange> AmtRange =
+ getValidShiftAmountRange(R, DemandedElts, Depth))
+ return AmtRange->getUnsignedMin().getZExtValue();
+ return std::nullopt;
+}
+
void GISelValueTrackingAnalysisLegacy::getAnalysisUsage(
AnalysisUsage &AU) const {
AU.setPreservesAll();
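
getValidShiftAmountRange above falls back to known bits when the shift amount
is not a visible constant. That fallback step on its own, with a made-up
known-bits pattern (standalone sketch):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  // Suppose analysis proved the top five bits of an i8 shift amount are
  // zero, so the amount is at most 7 and therefore always in range.
  KnownBits KnownAmt(8);
  KnownAmt.Zero = APInt(8, 0xF8);

  if (KnownAmt.getMaxValue().ult(8)) {
    ConstantRange Range =
        ConstantRange::fromKnownBits(KnownAmt, /*IsSigned=*/false);
    // Prints 0 and 7: the conservative bounds on the shift amount.
    outs() << Range.getUnsignedMin() << " " << Range.getUnsignedMax() << "\n";
  }
  return 0;
}
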
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index d7280ea..dc5dfab 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2189,23 +2189,11 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
unsigned Op = ID == Intrinsic::lifetime_start ? TargetOpcode::LIFETIME_START
: TargetOpcode::LIFETIME_END;
- // Get the underlying objects for the location passed on the lifetime
- // marker.
- SmallVector<const Value *, 4> Allocas;
- getUnderlyingObjects(CI.getArgOperand(1), Allocas);
-
- // Iterate over each underlying object, creating lifetime markers for each
- // static alloca. Quit if we find a non-static alloca.
- for (const Value *V : Allocas) {
- const AllocaInst *AI = dyn_cast<AllocaInst>(V);
- if (!AI)
- continue;
-
- if (!AI->isStaticAlloca())
- return true;
+ const AllocaInst *AI = cast<AllocaInst>(CI.getArgOperand(1));
+ if (!AI->isStaticAlloca())
+ return true;
- MIRBuilder.buildInstr(Op).addFrameIndex(getOrCreateFrameIndex(*AI));
- }
+ MIRBuilder.buildInstr(Op).addFrameIndex(getOrCreateFrameIndex(*AI));
return true;
}
case Intrinsic::fake_use: {
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 11b3ac8..ed7b07f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -10120,14 +10120,10 @@ LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
return Legalized;
}
- bool IsVolatile = MemOp->isVolatile();
- // Don't try to optimize volatile.
- if (IsVolatile)
- return UnableToLegalize;
-
if (MaxLen && KnownLen > MaxLen)
return UnableToLegalize;
+ bool IsVolatile = MemOp->isVolatile();
if (Opc == TargetOpcode::G_MEMCPY) {
auto &MF = *MI.getParent()->getParent();
const auto &TLI = *MF.getSubtarget().getTargetLowering();
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index d2b2edf..df162fc 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -600,87 +600,113 @@ static Value *getMask(Value *WideMask, unsigned Factor,
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
- Value *LoadedVal = DI->getOperand(0);
- if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal))
+ Instruction *LoadedVal = dyn_cast<Instruction>(DI->getOperand(0));
+ if (!LoadedVal || !LoadedVal->hasOneUse())
+ return false;
+
+ auto *LI = dyn_cast<LoadInst>(LoadedVal);
+ auto *II = dyn_cast<IntrinsicInst>(LoadedVal);
+ if (!LI && !II)
return false;
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
assert(Factor && "unexpected deinterleave intrinsic");
Value *Mask = nullptr;
- if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
- if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
+ if (LI) {
+ if (!LI->isSimple())
return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
+ << " and factor = " << Factor << "\n");
+ } else {
+ assert(II);
+
// Check mask operand. Handle both all-true/false and interleaved mask.
- Value *WideMask = VPLoad->getOperand(1);
- Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI));
- if (!Mask)
+ Value *WideMask;
+ switch (II->getIntrinsicID()) {
+ default:
return false;
+ case Intrinsic::vp_load:
+ WideMask = II->getOperand(1);
+ break;
+ case Intrinsic::masked_load:
+ WideMask = II->getOperand(2);
+ break;
+ }
- LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic "
- << *DI << " and factor = " << Factor << "\n");
- } else {
- auto *LI = cast<LoadInst>(LoadedVal);
- if (!LI->isSimple())
+ Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI));
+ if (!Mask)
return false;
- LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
- << " and factor = " << Factor << "\n");
+ LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave"
+ << " intrinsic " << *DI << " and factor = "
+ << Factor << "\n");
}
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(cast<Instruction>(LoadedVal), Mask,
- DI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(LoadedVal, Mask, DI))
return false;
DeadInsts.insert(DI);
// We now have a target-specific load, so delete the old one.
- DeadInsts.insert(cast<Instruction>(LoadedVal));
+ DeadInsts.insert(LoadedVal);
return true;
}
bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
- IntrinsicInst *II, SmallSetVector<Instruction *, 32> &DeadInsts) {
- if (!II->hasOneUse())
+ IntrinsicInst *IntII, SmallSetVector<Instruction *, 32> &DeadInsts) {
+ if (!IntII->hasOneUse())
return false;
- Value *StoredBy = II->user_back();
- if (!isa<StoreInst, VPIntrinsic>(StoredBy))
+ Instruction *StoredBy = dyn_cast<Instruction>(IntII->user_back());
+ if (!StoredBy)
+ return false;
+ auto *SI = dyn_cast<StoreInst>(StoredBy);
+ auto *II = dyn_cast<IntrinsicInst>(StoredBy);
+ if (!SI && !II)
return false;
- SmallVector<Value *, 8> InterleaveValues(II->args());
- const unsigned Factor = getInterleaveIntrinsicFactor(II->getIntrinsicID());
+ SmallVector<Value *, 8> InterleaveValues(IntII->args());
+ const unsigned Factor = getInterleaveIntrinsicFactor(IntII->getIntrinsicID());
assert(Factor && "unexpected interleave intrinsic");
Value *Mask = nullptr;
- if (auto *VPStore = dyn_cast<VPIntrinsic>(StoredBy)) {
- if (VPStore->getIntrinsicID() != Intrinsic::vp_store)
+ if (II) {
+ // Check mask operand. Handle both all-true/false and interleaved mask.
+ Value *WideMask;
+ switch (II->getIntrinsicID()) {
+ default:
return false;
-
- Value *WideMask = VPStore->getOperand(2);
+ case Intrinsic::vp_store:
+ WideMask = II->getOperand(2);
+ break;
+ case Intrinsic::masked_store:
+ WideMask = II->getOperand(3);
+ break;
+ }
Mask = getMask(WideMask, Factor,
cast<VectorType>(InterleaveValues[0]->getType()));
if (!Mask)
return false;
- LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic "
- << *II << " and factor = " << Factor << "\n");
+ LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave"
+ << " intrinsic " << *IntII << " and factor = "
+ << Factor << "\n");
} else {
- auto *SI = cast<StoreInst>(StoredBy);
if (!SI->isSimple())
return false;
- LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II
- << " and factor = " << Factor << "\n");
+ LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic "
+ << *IntII << " and factor = " << Factor << "\n");
}
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(cast<Instruction>(StoredBy), Mask,
- InterleaveValues))
+ if (!TLI->lowerInterleaveIntrinsicToStore(StoredBy, Mask, InterleaveValues))
return false;
// We now have a target-specific store, so delete the old one.
- DeadInsts.insert(cast<Instruction>(StoredBy));
- DeadInsts.insert(II);
+ DeadInsts.insert(StoredBy);
+ DeadInsts.insert(IntII);
return true;
}
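
The rewritten deinterleave/interleave lowering distinguishes four wide memory
operations and reads the mask from a different operand of each. The operand
positions, restated as a small helper (the function is invented; the indices
are the ones used in the switch statements above):

#include "llvm/IR/Intrinsics.h"
#include <optional>

// Operand index of the mask for the wide memory intrinsics handled above.
static std::optional<unsigned> maskOperandIndex(llvm::Intrinsic::ID ID) {
  switch (ID) {
  case llvm::Intrinsic::vp_load: // (ptr, mask, evl)
    return 1;
  case llvm::Intrinsic::masked_load: // (ptr, align, mask, passthru)
    return 2;
  case llvm::Intrinsic::vp_store: // (value, ptr, mask, evl)
    return 2;
  case llvm::Intrinsic::masked_store: // (value, ptr, align, mask)
    return 3;
  default:
    return std::nullopt;
  }
}
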
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 38ad582..429a17a 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -211,9 +211,8 @@ void MachineFunction::init() {
ConstantPool = new (Allocator) MachineConstantPool(getDataLayout());
Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
- // FIXME: Shouldn't use pref alignment if explicit alignment is set on F.
// FIXME: Use Function::hasOptSize().
- if (!F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (!F.getAlign() && !F.hasFnAttribute(Attribute::OptimizeForSize))
Alignment = std::max(Alignment,
STI->getTargetLowering()->getPrefFunctionAlignment());
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index b38a4d1c..90005bd 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -4279,8 +4279,8 @@ void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
!TII->isGlobalMemoryObject(FromMI) &&
!TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
SDep Pred = Dep;
- Pred.setSUnit(Src);
- Dst->addPred(Pred);
+ Pred.setSUnit(From);
+ To->addPred(Pred);
}
}
}
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 76cba29..9d5c39c 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -771,24 +771,6 @@ static bool isSchedBoundary(MachineBasicBlock::iterator MI,
MI->isFakeUse();
}
-/// A region of an MBB for scheduling.
-namespace {
-struct SchedRegion {
- /// RegionBegin is the first instruction in the scheduling region, and
- /// RegionEnd is either MBB->end() or the scheduling boundary after the
- /// last instruction in the scheduling region. These iterators cannot refer
- /// to instructions outside of the identified scheduling region because
- /// those may be reordered before scheduling this region.
- MachineBasicBlock::iterator RegionBegin;
- MachineBasicBlock::iterator RegionEnd;
- unsigned NumRegionInstrs;
-
- SchedRegion(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E,
- unsigned N) :
- RegionBegin(B), RegionEnd(E), NumRegionInstrs(N) {}
-};
-} // end anonymous namespace
-
using MBBRegionsVector = SmallVector<SchedRegion, 16>;
static void
@@ -3725,7 +3707,8 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
RegionPolicy.OnlyBottomUp = true;
// Allow the subtarget to override default policy.
- MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs);
+ SchedRegion Region(Begin, End, NumRegionInstrs);
+ MF.getSubtarget().overrideSchedPolicy(RegionPolicy, Region);
// After subtarget overrides, apply command line options.
if (!EnableRegPressure) {
@@ -4338,7 +4321,8 @@ void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
RegionPolicy.OnlyBottomUp = false;
// Allow the subtarget to override default policy.
- MF.getSubtarget().overridePostRASchedPolicy(RegionPolicy, NumRegionInstrs);
+ SchedRegion Region(Begin, End, NumRegionInstrs);
+ MF.getSubtarget().overridePostRASchedPolicy(RegionPolicy, Region);
// After subtarget overrides, apply command line options.
if (PostRADirection == MISched::TopDown) {
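
With a SchedRegion now passed through, a subtarget's scheduling-policy hook
can look at the region being scheduled rather than only an instruction count.
A hedged sketch of such an override; the target class is hypothetical, and it
assumes SchedRegion is exported from the MachineScheduler header after this
change:

#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;

struct MyTargetSchedPolicy {
  void overrideSchedPolicy(MachineSchedPolicy &Policy,
                           const SchedRegion &Region) const {
    // Tiny regions rarely benefit from bidirectional scheduling, so fall
    // back to bottom-up only.
    if (Region.NumRegionInstrs < 16)
      Policy.OnlyBottomUp = true;
  }
};
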
diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp
index 381249e..0b2a73b 100644
--- a/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -5,35 +5,31 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// This file defines the RABasic function pass, which provides a minimal
-// implementation of the basic register allocator.
-//
+///
+/// \file
+/// This file defines the RABasic function pass, which provides a minimal
+/// implementation of the basic register allocator.
+///
//===----------------------------------------------------------------------===//
+#include "RegAllocBasic.h"
#include "AllocationOrder.h"
-#include "RegAllocBase.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
-#include "llvm/CodeGen/Spiller.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include <queue>
using namespace llvm;
@@ -42,89 +38,8 @@ using namespace llvm;
static RegisterRegAlloc basicRegAlloc("basic", "basic register allocator",
createBasicRegisterAllocator);
-namespace {
- struct CompSpillWeight {
- bool operator()(const LiveInterval *A, const LiveInterval *B) const {
- return A->weight() < B->weight();
- }
- };
-}
-
-namespace {
-/// RABasic provides a minimal implementation of the basic register allocation
-/// algorithm. It prioritizes live virtual registers by spill weight and spills
-/// whenever a register is unavailable. This is not practical in production but
-/// provides a useful baseline both for measuring other allocators and comparing
-/// the speed of the basic algorithm against other styles of allocators.
-class RABasic : public MachineFunctionPass,
- public RegAllocBase,
- private LiveRangeEdit::Delegate {
- // context
- MachineFunction *MF = nullptr;
-
- // state
- std::unique_ptr<Spiller> SpillerInstance;
- std::priority_queue<const LiveInterval *, std::vector<const LiveInterval *>,
- CompSpillWeight>
- Queue;
-
- // Scratch space. Allocated here to avoid repeated malloc calls in
- // selectOrSplit().
- BitVector UsableRegs;
-
- bool LRE_CanEraseVirtReg(Register) override;
- void LRE_WillShrinkVirtReg(Register) override;
-
-public:
- RABasic(const RegAllocFilterFunc F = nullptr);
-
- /// Return the pass name.
- StringRef getPassName() const override { return "Basic Register Allocator"; }
-
- /// RABasic analysis usage.
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-
- void releaseMemory() override;
-
- Spiller &spiller() override { return *SpillerInstance; }
-
- void enqueueImpl(const LiveInterval *LI) override { Queue.push(LI); }
-
- const LiveInterval *dequeue() override {
- if (Queue.empty())
- return nullptr;
- const LiveInterval *LI = Queue.top();
- Queue.pop();
- return LI;
- }
-
- MCRegister selectOrSplit(const LiveInterval &VirtReg,
- SmallVectorImpl<Register> &SplitVRegs) override;
-
- /// Perform register allocation.
- bool runOnMachineFunction(MachineFunction &mf) override;
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().setNoPHIs();
- }
-
- MachineFunctionProperties getClearedProperties() const override {
- return MachineFunctionProperties().setIsSSA();
- }
-
- // Helper for spilling all live virtual registers currently unified under preg
- // that interfere with the most recently queried lvr. Return true if spilling
- // was successful, and append any new spilled/split intervals to splitLVRs.
- bool spillInterferences(const LiveInterval &VirtReg, MCRegister PhysReg,
- SmallVectorImpl<Register> &SplitVRegs);
-
- static char ID;
-};
-
char RABasic::ID = 0;
-} // end anonymous namespace
-
char &llvm::RABasicID = RABasic::ID;
INITIALIZE_PASS_BEGIN(RABasic, "regallocbasic", "Basic Register Allocator",
diff --git a/llvm/lib/CodeGen/RegAllocBasic.h b/llvm/lib/CodeGen/RegAllocBasic.h
new file mode 100644
index 0000000..004bc1a
--- /dev/null
+++ b/llvm/lib/CodeGen/RegAllocBasic.h
@@ -0,0 +1,104 @@
+//===-- RegAllocBasic.h - Basic Register Allocator Header -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the RABasic class, which provides a minimal
+/// implementation of the basic register allocator.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGALLOCBASIC_H
+#define LLVM_CODEGEN_REGALLOCBASIC_H
+
+#include "RegAllocBase.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Spiller.h"
+#include <queue>
+
+namespace llvm {
+
+struct CompSpillWeight {
+ bool operator()(const LiveInterval *A, const LiveInterval *B) const {
+ return A->weight() < B->weight();
+ }
+};
+
+/// RABasic provides a minimal implementation of the basic register allocation
+/// algorithm. It prioritizes live virtual registers by spill weight and spills
+/// whenever a register is unavailable. This is not practical in production but
+/// provides a useful baseline both for measuring other allocators and comparing
+/// the speed of the basic algorithm against other styles of allocators.
+class LLVM_LIBRARY_VISIBILITY RABasic : public MachineFunctionPass,
+ public RegAllocBase,
+ private LiveRangeEdit::Delegate {
+ // context
+ MachineFunction *MF = nullptr;
+
+ // state
+ std::unique_ptr<Spiller> SpillerInstance;
+ std::priority_queue<const LiveInterval *, std::vector<const LiveInterval *>,
+ CompSpillWeight>
+ Queue;
+
+ // Scratch space. Allocated here to avoid repeated malloc calls in
+ // selectOrSplit().
+ BitVector UsableRegs;
+
+ bool LRE_CanEraseVirtReg(Register) override;
+ void LRE_WillShrinkVirtReg(Register) override;
+
+public:
+ RABasic(const RegAllocFilterFunc F = nullptr);
+
+ /// Return the pass name.
+ StringRef getPassName() const override { return "Basic Register Allocator"; }
+
+ /// RABasic analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ void releaseMemory() override;
+
+ Spiller &spiller() override { return *SpillerInstance; }
+
+ void enqueueImpl(const LiveInterval *LI) override { Queue.push(LI); }
+
+ const LiveInterval *dequeue() override {
+ if (Queue.empty())
+ return nullptr;
+ const LiveInterval *LI = Queue.top();
+ Queue.pop();
+ return LI;
+ }
+
+ MCRegister selectOrSplit(const LiveInterval &VirtReg,
+ SmallVectorImpl<Register> &SplitVRegs) override;
+
+ /// Perform register allocation.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
+
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+
+ // Helper for spilling all live virtual registers currently unified under
+ // PhysReg that interfere with the most recently queried VirtReg. Returns true
+ // if spilling was successful, and appends any new spilled/split intervals to
+ // SplitVRegs.
+ bool spillInterferences(const LiveInterval &VirtReg, MCRegister PhysReg,
+ SmallVectorImpl<Register> &SplitVRegs);
+
+ static char ID;
+};
+} // namespace llvm
+#endif
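
The CompSpillWeight comparator above makes the priority queue pop the live interval with the largest spill weight first. A standalone sketch of the same queue setup, with a mock LiveInterval that carries only a weight.

#include <iostream>
#include <queue>
#include <vector>

// Mock stand-in for llvm::LiveInterval: only the spill weight matters here.
struct LiveInterval {
  unsigned VReg;
  float Weight;
  float weight() const { return Weight; }
};

struct CompSpillWeight {
  bool operator()(const LiveInterval *A, const LiveInterval *B) const {
    return A->weight() < B->weight(); // max-heap: highest weight on top
  }
};

int main() {
  LiveInterval A{1, 2.5f}, B{2, 7.0f}, C{3, 0.5f};
  std::priority_queue<const LiveInterval *, std::vector<const LiveInterval *>,
                      CompSpillWeight>
      Queue;
  for (const LiveInterval *LI : {&A, &B, &C})
    Queue.push(LI);
  while (!Queue.empty()) {
    std::cout << "vreg " << Queue.top()->VReg << " weight "
              << Queue.top()->weight() << '\n'; // vregs 2, 1, 3
    Queue.pop();
  }
}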
diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index 9962070..908ed96 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -614,6 +614,13 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
Use &U = *AI->use_begin();
Instruction *User = cast<Instruction>(U.getUser());
+ // Drop lifetime markers now that this is no longer an alloca.
+ // SafeStack has already performed its own stack coloring.
+ if (User->isLifetimeStartOrEnd()) {
+ User->eraseFromParent();
+ continue;
+ }
+
Instruction *InsertBefore;
if (auto *PHI = dyn_cast<PHINode>(User))
InsertBefore = PHI->getIncomingBlock(U)->getTerminator();
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fed5e72..d3df434 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12375,11 +12375,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) {
// Any flags available in a select/setcc fold will be on the setcc as they
// migrated from fcmp
- Flags = N0->getFlags();
- SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1,
- N2, N0.getOperand(2));
- SelectNode->setFlags(Flags);
- return SelectNode;
+ return DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1, N2,
+ N0.getOperand(2), N0->getFlags());
}
if (SDValue ABD = foldSelectToABD(Cond0, Cond1, N1, N2, CC, DL))
@@ -16738,7 +16735,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
// Try to push freeze through instructions that propagate but don't produce
// poison as far as possible. If an operand of freeze follows three
- // conditions 1) one-use, and 2) does not produce poison then push
+ // conditions: 1) one-use, 2) does not produce poison, and 3) all but one of
+ // its operands are guaranteed non-poison (or it is a BUILD_VECTOR or similar), then push
// the freeze through to the operands that are not guaranteed non-poison.
// NOTE: we will strip poison-generating flags, so ignore them here.
if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false,
@@ -16746,6 +16744,18 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
N0->getNumValues() != 1 || !N0->hasOneUse())
return SDValue();
+ // TODO: We should always allow multiple operands. However, this increases the
+ // likelihood of infinite loops: the ReplaceAllUsesOfValueWith call below can
+ // cause later nodes that share frozen operands to fold again, and the
+ // recursion depth limit on isGuaranteedNotToBeUndefOrPoison may then prevent
+ // us from confirming that the other operands are not poison.
+ bool AllowMultipleMaybePoisonOperands =
+ N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
+ N0.getOpcode() == ISD::BUILD_VECTOR ||
+ N0.getOpcode() == ISD::BUILD_PAIR ||
+ N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
+ N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
+
// Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
// ones" or "constant" into something that depends on FrozenUndef. We can
// instead pick undef values to keep those properties, while at the same time
@@ -16772,8 +16782,16 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
/*Depth*/ 1))
continue;
- if (MaybePoisonOperands.insert(Op).second)
+ bool HadMaybePoisonOperands = !MaybePoisonOperands.empty();
+ bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second;
+ if (IsNewMaybePoisonOperand)
MaybePoisonOperandNumbers.push_back(OpNo);
+ if (!HadMaybePoisonOperands)
+ continue;
+ if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
+ // Multiple maybe-poison ops when not allowed - bail out.
+ return SDValue();
+ }
}
// NOTE: the whole op may not be guaranteed to be free of undef or poison
// because it could create undef or poison due to its poison-generating flags.
@@ -22727,11 +22745,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
const auto *LifetimeEnd = cast<LifetimeSDNode>(N);
- if (!LifetimeEnd->hasOffset())
- return SDValue();
-
- const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(),
- LifetimeEnd->getOffset(), false);
+ const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(), 0, false);
// We walk up the chains to find stores.
SmallVector<SDValue, 8> Chains = {N->getOperand(0)};
@@ -29418,9 +29432,8 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
return {false /*isVolatile*/,
/*isAtomic*/ false,
LN->getOperand(1),
- (LN->hasOffset()) ? LN->getOffset() : 0,
- (LN->hasOffset()) ? LocationSize::precise(LN->getSize())
- : LocationSize::beforeOrAfterPointer(),
+ 0,
+ LocationSize::precise(LN->getSize()),
(MachineMemOperand *)nullptr};
// Default.
return {false /*isvolatile*/,
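
The bookkeeping added to visitFREEZE tolerates a second distinct maybe-poison operand only for the whitelisted opcodes. A standalone sketch of that control flow, with strings standing in for SDValues and a set standing in for the known-non-poison query.

#include <iostream>
#include <set>
#include <string>
#include <vector>

// Returns false when a second distinct maybe-poison operand shows up and
// multiple such operands are not allowed; mirrors the early bail-out in the
// DAGCombiner hunk (SDValues replaced by strings for illustration).
bool collectMaybePoisonOperands(const std::vector<std::string> &Ops,
                                const std::set<std::string> &KnownNonPoison,
                                bool AllowMultipleMaybePoisonOperands,
                                std::vector<std::string> &MaybePoison) {
  std::set<std::string> Seen;
  for (const std::string &Op : Ops) {
    if (KnownNonPoison.count(Op))
      continue;
    bool HadMaybePoisonOperands = !Seen.empty();
    bool IsNew = Seen.insert(Op).second;
    if (IsNew)
      MaybePoison.push_back(Op);
    if (HadMaybePoisonOperands && IsNew && !AllowMultipleMaybePoisonOperands)
      return false; // multiple maybe-poison operands are not allowed here
  }
  return true;
}

int main() {
  std::vector<std::string> Collected;
  bool OK = collectMaybePoisonOperands({"x", "y", "x"}, {"z"},
                                       /*AllowMultiple=*/false, Collected);
  std::cout << (OK ? "push freeze" : "bail out") << '\n'; // bail out
}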
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 7266940..74172b2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2785,19 +2785,17 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node,
// In strict mode, we must avoid spurious exceptions, and therefore
// must make sure to only emit a single STRICT_SINT_TO_FP.
SDValue InCvt = DAG.getSelect(dl, SrcVT, SignBitTest, Or, Op0);
- Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, { DestVT, MVT::Other },
- { Node->getOperand(0), InCvt });
- Slow = DAG.getNode(ISD::STRICT_FADD, dl, { DestVT, MVT::Other },
- { Fast.getValue(1), Fast, Fast });
- Chain = Slow.getValue(1);
// The STRICT_SINT_TO_FP inherits the exception mode from the
// incoming STRICT_UINT_TO_FP node; the STRICT_FADD node can
// never raise any exception.
SDNodeFlags Flags;
Flags.setNoFPExcept(Node->getFlags().hasNoFPExcept());
- Fast->setFlags(Flags);
+ Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DestVT, MVT::Other},
+ {Node->getOperand(0), InCvt}, Flags);
Flags.setNoFPExcept(true);
- Slow->setFlags(Flags);
+ Slow = DAG.getNode(ISD::STRICT_FADD, dl, {DestVT, MVT::Other},
+ {Fast.getValue(1), Fast, Fast}, Flags);
+ Chain = Slow.getValue(1);
} else {
SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Or);
Slow = DAG.getNode(ISD::FADD, dl, DestVT, SignCvt, SignCvt);
@@ -3407,14 +3405,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
EVT VT = Operand.getValueType();
SDValue One = DAG.getConstantFP(1.0, dl, VT);
SDValue Chain = DAG.getEntryNode();
- SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
- {Chain, Operand, One});
-
// Propagate existing flags on canonicalize, and additionally set
// NoFPExcept.
SDNodeFlags CanonicalizeFlags = Node->getFlags();
CanonicalizeFlags.setNoFPExcept(true);
- Mul->setFlags(CanonicalizeFlags);
+ SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
+ {Chain, Operand, One}, CanonicalizeFlags);
Results.push_back(Mul);
break;
@@ -4150,15 +4146,14 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Tmp2 = Node->getOperand(1);
Tmp3 = Node->getOperand(2);
if (Tmp1.getOpcode() == ISD::SETCC) {
- Tmp1 = DAG.getSelectCC(dl, Tmp1.getOperand(0), Tmp1.getOperand(1),
- Tmp2, Tmp3,
- cast<CondCodeSDNode>(Tmp1.getOperand(2))->get());
+ Tmp1 = DAG.getSelectCC(
+ dl, Tmp1.getOperand(0), Tmp1.getOperand(1), Tmp2, Tmp3,
+ cast<CondCodeSDNode>(Tmp1.getOperand(2))->get(), Node->getFlags());
} else {
- Tmp1 = DAG.getSelectCC(dl, Tmp1,
- DAG.getConstant(0, dl, Tmp1.getValueType()),
- Tmp2, Tmp3, ISD::SETNE);
+ Tmp1 =
+ DAG.getSelectCC(dl, Tmp1, DAG.getConstant(0, dl, Tmp1.getValueType()),
+ Tmp2, Tmp3, ISD::SETNE, Node->getFlags());
}
- Tmp1->setFlags(Node->getFlags());
Results.push_back(Tmp1);
break;
case ISD::BR_JT: {
@@ -4296,8 +4291,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
EVT Tmp1VT = Tmp1.getValueType();
Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, VT, Tmp1, Tmp2,
DAG.getBoolConstant(true, dl, VT, Tmp1VT),
- DAG.getBoolConstant(false, dl, VT, Tmp1VT), Tmp3);
- Tmp1->setFlags(Node->getFlags());
+ DAG.getBoolConstant(false, dl, VT, Tmp1VT), Tmp3,
+ Node->getFlags());
Results.push_back(Tmp1);
break;
}
@@ -4335,8 +4330,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
if (TLI.isCondCodeLegalOrCustom(InvCC, Tmp1.getSimpleValueType())) {
// Use the new condition code and swap true and false
Legalized = true;
- Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC);
- Tmp1->setFlags(Node->getFlags());
+ Tmp1 =
+ DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC, Node->getFlags());
} else {
// If The inverse is not legal, then try to swap the arguments using
// the inverse condition code.
@@ -4345,8 +4340,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// The swapped inverse condition is legal, so swap true and false,
// lhs and rhs.
Legalized = true;
- Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC);
- Tmp1->setFlags(Node->getFlags());
+ Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC,
+ Node->getFlags());
}
}
@@ -4365,15 +4360,14 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// If we expanded the SETCC by swapping LHS and RHS, or by inverting the
// condition code, create a new SELECT_CC node.
if (CC.getNode()) {
- Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0),
- Tmp1, Tmp2, Tmp3, Tmp4, CC);
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1,
+ Tmp2, Tmp3, Tmp4, CC, Node->getFlags());
} else {
Tmp2 = DAG.getConstant(0, dl, Tmp1.getValueType());
CC = DAG.getCondCode(ISD::SETNE);
Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1,
- Tmp2, Tmp3, Tmp4, CC);
+ Tmp2, Tmp3, Tmp4, CC, Node->getFlags());
}
- Tmp1->setFlags(Node->getFlags());
}
Results.push_back(Tmp1);
break;
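
This file repeatedly swaps the create-then-setFlags pattern for the getNode/getSelectCC overloads that take SDNodeFlags up front. One motivation is that when getNode CSEs to an already-existing node, flags supplied at creation are intersected with that node's flags, whereas a later setFlags call overwrites them. A toy cache illustrating the difference; plain bitmask flags and integer operands stand in for SDNodeFlags and SDValues.

#include <iostream>
#include <map>
#include <tuple>

// Toy node cache standing in for SelectionDAG's CSE map.
struct Node { unsigned Flags; };

struct DAG {
  std::map<std::tuple<int, int, int>, Node> CSEMap; // (opcode, op0, op1)

  // Creation-time flags: on a CSE hit, intersect with the existing flags.
  Node &getNode(int Opc, int A, int B, unsigned Flags) {
    auto [It, Inserted] = CSEMap.try_emplace({Opc, A, B}, Node{Flags});
    if (!Inserted)
      It->second.Flags &= Flags;
    return It->second;
  }
};

int main() {
  DAG G;
  G.getNode(/*Opc=*/1, 2, 3, /*Flags=*/0b11);   // first creation
  Node &N = G.getNode(1, 2, 3, /*Flags=*/0b01); // CSE hit: flags intersect
  std::cout << N.Flags << '\n';                 // 1
  // Assigning N.Flags after the fact would instead clobber whatever flags the
  // pre-existing, possibly shared node legitimately had.
}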
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index f908a66..d2ecc133 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -2087,11 +2087,10 @@ void VectorLegalizer::ExpandSETCC(SDNode *Node,
// Otherwise, SETCC for the given comparison type must be completely
// illegal; expand it into a SELECT_CC.
EVT VT = Node->getValueType(0);
- LHS =
- DAG.getNode(ISD::SELECT_CC, dl, VT, LHS, RHS,
- DAG.getBoolConstant(true, dl, VT, LHS.getValueType()),
- DAG.getBoolConstant(false, dl, VT, LHS.getValueType()), CC);
- LHS->setFlags(Node->getFlags());
+ LHS = DAG.getNode(ISD::SELECT_CC, dl, VT, LHS, RHS,
+ DAG.getBoolConstant(true, dl, VT, LHS.getValueType()),
+ DAG.getBoolConstant(false, dl, VT, LHS.getValueType()),
+ CC, Node->getFlags());
}
Results.push_back(LHS);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 32c5961..1661814 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -372,9 +372,9 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_OverflowOp(SDNode *N,
SDVTList ScalarVTs = DAG.getVTList(
ResVT.getVectorElementType(), OvVT.getVectorElementType());
- SDNode *ScalarNode = DAG.getNode(
- N->getOpcode(), DL, ScalarVTs, ScalarLHS, ScalarRHS).getNode();
- ScalarNode->setFlags(N->getFlags());
+ SDNode *ScalarNode = DAG.getNode(N->getOpcode(), DL, ScalarVTs,
+ {ScalarLHS, ScalarRHS}, N->getFlags())
+ .getNode();
// Replace the other vector result not being explicitly scalarized here.
unsigned OtherNo = 1 - ResNo;
@@ -1898,7 +1898,7 @@ SDValue DAGTypeLegalizer::UnrollVectorOp_StrictFP(SDNode *N, unsigned ResNE) {
NE = ResNE;
//The results of each unrolled operation, including the chain.
- EVT ChainVTs[] = {EltVT, MVT::Other};
+ SDVTList ChainVTs = DAG.getVTList(EltVT, MVT::Other);
SmallVector<SDValue, 8> Chains;
unsigned i;
@@ -1914,8 +1914,8 @@ SDValue DAGTypeLegalizer::UnrollVectorOp_StrictFP(SDNode *N, unsigned ResNE) {
Operands[j] = Operand;
}
}
- SDValue Scalar = DAG.getNode(N->getOpcode(), dl, ChainVTs, Operands);
- Scalar.getNode()->setFlags(N->getFlags());
+ SDValue Scalar =
+ DAG.getNode(N->getOpcode(), dl, ChainVTs, Operands, N->getFlags());
//Add in the scalar as well as its chain value to the
//result vectors.
@@ -1956,10 +1956,10 @@ void DAGTypeLegalizer::SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
unsigned Opcode = N->getOpcode();
SDVTList LoVTs = DAG.getVTList(LoResVT, LoOvVT);
SDVTList HiVTs = DAG.getVTList(HiResVT, HiOvVT);
- SDNode *LoNode = DAG.getNode(Opcode, dl, LoVTs, LoLHS, LoRHS).getNode();
- SDNode *HiNode = DAG.getNode(Opcode, dl, HiVTs, HiLHS, HiRHS).getNode();
- LoNode->setFlags(N->getFlags());
- HiNode->setFlags(N->getFlags());
+ SDNode *LoNode =
+ DAG.getNode(Opcode, dl, LoVTs, {LoLHS, LoRHS}, N->getFlags()).getNode();
+ SDNode *HiNode =
+ DAG.getNode(Opcode, dl, HiVTs, {HiLHS, HiRHS}, N->getFlags()).getNode();
Lo = SDValue(LoNode, ResNo);
Hi = SDValue(HiNode, ResNo);
@@ -2669,10 +2669,8 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOpWithTwoResults(SDNode *N,
else
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
- Lo = DAG.getNode(N->getOpcode(), dl, {LoVT, LoVT1}, Lo);
- Hi = DAG.getNode(N->getOpcode(), dl, {HiVT, HiVT1}, Hi);
- Lo->setFlags(N->getFlags());
- Hi->setFlags(N->getFlags());
+ Lo = DAG.getNode(N->getOpcode(), dl, {LoVT, LoVT1}, Lo, N->getFlags());
+ Hi = DAG.getNode(N->getOpcode(), dl, {HiVT, HiVT1}, Hi, N->getFlags());
SDNode *HiNode = Hi.getNode();
SDNode *LoNode = Lo.getNode();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2458115..773ff48 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -786,10 +786,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
break;
case ISD::LIFETIME_START:
case ISD::LIFETIME_END:
- if (cast<LifetimeSDNode>(N)->hasOffset()) {
- ID.AddInteger(cast<LifetimeSDNode>(N)->getSize());
- ID.AddInteger(cast<LifetimeSDNode>(N)->getOffset());
- }
+ ID.AddInteger(cast<LifetimeSDNode>(N)->getSize());
break;
case ISD::PSEUDO_PROBE:
ID.AddInteger(cast<PseudoProbeSDNode>(N)->getGuid());
@@ -3036,7 +3033,7 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
return TLI->isSplatValueForTargetNode(V, DemandedElts, UndefElts, *this,
Depth);
break;
-}
+ }
// We don't support other cases than those above for scalable vectors at
// the moment.
@@ -9364,7 +9361,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
SDValue Chain, int FrameIndex,
- int64_t Size, int64_t Offset) {
+ int64_t Size) {
const unsigned Opcode = IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END;
const auto VTs = getVTList(MVT::Other);
SDValue Ops[2] = {
@@ -9377,13 +9374,12 @@ SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
AddNodeIDNode(ID, Opcode, VTs, Ops);
ID.AddInteger(FrameIndex);
ID.AddInteger(Size);
- ID.AddInteger(Offset);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
- LifetimeSDNode *N = newSDNode<LifetimeSDNode>(
- Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs, Size, Offset);
+ LifetimeSDNode *N = newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(),
+ dl.getDebugLoc(), VTs, Size);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
@@ -10563,7 +10559,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDUse> Ops) {
switch (Ops.size()) {
case 0: return getNode(Opcode, DL, VT);
- case 1: return getNode(Opcode, DL, VT, static_cast<const SDValue>(Ops[0]));
+ case 1: return getNode(Opcode, DL, VT, Ops[0].get());
case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
default: break;
@@ -10699,7 +10695,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops) {
- return getNode(Opcode, DL, getVTList(ResultTys), Ops);
+ SDNodeFlags Flags;
+ if (Inserter)
+ Flags = Inserter->getFlags();
+ return getNode(Opcode, DL, getVTList(ResultTys), Ops, Flags);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
+ ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops,
+ const SDNodeFlags Flags) {
+ return getNode(Opcode, DL, getVTList(ResultTys), Ops, Flags);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
@@ -10855,26 +10860,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
(Ops[2]->getAsZExtVal() == 0 || Ops[2]->getAsZExtVal() == 1) &&
"Invalid STRICT_FP_ROUND!");
break;
-#if 0
- // FIXME: figure out how to safely handle things like
- // int foo(int x) { return 1 << (x & 255); }
- // int bar() { return foo(256); }
- case ISD::SRA_PARTS:
- case ISD::SRL_PARTS:
- case ISD::SHL_PARTS:
- if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG &&
- cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1)
- return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
- else if (N3.getOpcode() == ISD::AND)
- if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) {
- // If the and is only masking out bits that cannot effect the shift,
- // eliminate the and.
- unsigned NumBits = VT.getScalarSizeInBits()*2;
- if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1)
- return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
- }
- break;
-#endif
}
// Memoize the node unless it returns a glue result.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index da92aaa..8f08046 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -303,10 +303,7 @@ BaseIndexOffset BaseIndexOffset::match(const SDNode *N,
if (const auto *LS0 = dyn_cast<LSBaseSDNode>(N))
return matchLSNode(LS0, DAG);
if (const auto *LN = dyn_cast<LifetimeSDNode>(N)) {
- if (LN->hasOffset())
- return BaseIndexOffset(LN->getOperand(1), SDValue(), LN->getOffset(),
- false);
- return BaseIndexOffset(LN->getOperand(1), SDValue(), false);
+ return BaseIndexOffset(LN->getOperand(1), SDValue(), 0, false);
}
return BaseIndexOffset();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 01e5312..1636465 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7596,32 +7596,17 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
const int64_t ObjectSize =
cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
- Value *const ObjectPtr = I.getArgOperand(1);
- SmallVector<const Value *, 4> Allocas;
- getUnderlyingObjects(ObjectPtr, Allocas);
+ const AllocaInst *LifetimeObject = cast<AllocaInst>(I.getArgOperand(1));
- for (const Value *Alloca : Allocas) {
- const AllocaInst *LifetimeObject = dyn_cast_or_null<AllocaInst>(Alloca);
-
- // Could not find an Alloca.
- if (!LifetimeObject)
- continue;
-
- // First check that the Alloca is static, otherwise it won't have a
- // valid frame index.
- auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject);
- if (SI == FuncInfo.StaticAllocaMap.end())
- return;
+ // First check that the Alloca is static, otherwise it won't have a
+ // valid frame index.
+ auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject);
+ if (SI == FuncInfo.StaticAllocaMap.end())
+ return;
- const int FrameIndex = SI->second;
- int64_t Offset;
- if (GetPointerBaseWithConstantOffset(
- ObjectPtr, Offset, DAG.getDataLayout()) != LifetimeObject)
- Offset = -1; // Cannot determine offset from alloca to lifetime object.
- Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize,
- Offset);
- DAG.setRoot(Res);
- }
+ const int FrameIndex = SI->second;
+ Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize);
+ DAG.setRoot(Res);
return;
}
case Intrinsic::pseudoprobe: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 7fc1558..9474587 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -947,8 +947,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
<< ASC->getDestAddressSpace()
<< ']';
} else if (const LifetimeSDNode *LN = dyn_cast<LifetimeSDNode>(this)) {
- if (LN->hasOffset())
- OS << "<" << LN->getOffset() << " to " << LN->getOffset() + LN->getSize() << ">";
+ OS << "<0 to " << LN->getSize() << ">";
} else if (const auto *AA = dyn_cast<AssertAlignSDNode>(this)) {
OS << '<' << AA->getAlign().value() << '>';
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e059798..1764910 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -778,7 +778,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
case ISD::FREEZE: {
SDValue N0 = Op.getOperand(0);
if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
- /*PoisonOnly=*/false))
+ /*PoisonOnly=*/false, Depth + 1))
return N0;
break;
}
@@ -3369,7 +3369,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(
case ISD::FREEZE: {
SDValue N0 = Op.getOperand(0);
if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
- /*PoisonOnly=*/false))
+ /*PoisonOnly=*/false,
+ Depth + 1))
return TLO.CombineTo(Op, N0);
// TODO: Replace this with the general fold from DAGCombiner::visitFREEZE
@@ -8128,7 +8129,7 @@ static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) {
return ISD::matchUnaryPredicate(
Z,
[=](ConstantSDNode *C) { return !C || C->getAPIntValue().urem(BW) != 0; },
- /*AllowUndef=*/true, /*AllowTruncation=*/true);
+ /*AllowUndefs=*/true, /*AllowTruncation=*/true);
}
static SDValue expandVPFunnelShift(SDNode *Node, SelectionDAG &DAG) {
@@ -8633,9 +8634,8 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node,
return SDValue();
SDValue Op1 = Node->getOperand(0);
SDValue Op2 = Node->getOperand(1);
- SDValue SelCC = DAG.getSelectCC(SDLoc(Node), Op1, Op2, Op1, Op2, Pred);
- SelCC->setFlags(Node->getFlags());
- return SelCC;
+ return DAG.getSelectCC(SDLoc(Node), Op1, Op2, Op1, Op2, Pred,
+ Node->getFlags());
}
return SDValue();
@@ -11994,8 +11994,7 @@ SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node,
// Get the mask value and add it to the current output position. This
// either increments by 1 if MaskI is true or adds 0 otherwise.
// Freeze in case we have poison/undef mask entries.
- SDValue MaskI =
- DAG.getFreeze(DAG.getExtractVectorElt(DL, MaskScalarVT, Mask, I));
+ SDValue MaskI = DAG.getExtractVectorElt(DL, MaskScalarVT, Mask, I);
MaskI = DAG.getFreeze(MaskI);
MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI);
MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, PositionVT, MaskI);
diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp
index 93a567e..64f1bfc0 100644
--- a/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp
@@ -263,7 +263,7 @@ bool LVScope::removeElement(LVElement *Element) {
return Item == Element;
};
auto RemoveElement = [Element, Predicate](auto &Container) -> bool {
- auto Iter = std::remove_if(Container->begin(), Container->end(), Predicate);
+ auto Iter = llvm::remove_if(*Container, Predicate);
if (Iter != Container->end()) {
Container->erase(Iter, Container->end());
Element->resetParent();
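
llvm::remove_if is a range-based wrapper around std::remove_if, so the hunk above is purely a simplification of the erase/remove idiom. The same pattern in plain C++:

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> Container = {1, 2, 3, 2, 4};
  int Element = 2;
  auto Predicate = [Element](int Item) { return Item == Element; };

  // remove_if shifts the kept elements forward and returns the new logical
  // end; erase then drops the tail. llvm::remove_if(C, P) expands to
  // std::remove_if(C.begin(), C.end(), P).
  auto Iter = std::remove_if(Container.begin(), Container.end(), Predicate);
  if (Iter != Container.end())
    Container.erase(Iter, Container.end());

  for (int V : Container)
    std::cout << V << ' '; // 1 3 4
  std::cout << '\n';
}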
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index cca9959..ffc7696 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -738,6 +738,32 @@ static inline uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo) {
return Hi == 63 ? Val >> Lo : (Val & (((1ULL << (Hi + 1)) - 1))) >> Lo;
}
+// Calculate the adjusted page delta between dest and PC. The code is copied
+// from lld; see the comments there for more details.
+static uint64_t getLoongArchPageDelta(uint64_t dest, uint64_t pc,
+ uint32_t type) {
+ uint64_t pcalau12i_pc;
+ switch (type) {
+ case ELF::R_LARCH_PCALA64_LO20:
+ case ELF::R_LARCH_GOT64_PC_LO20:
+ pcalau12i_pc = pc - 8;
+ break;
+ case ELF::R_LARCH_PCALA64_HI12:
+ case ELF::R_LARCH_GOT64_PC_HI12:
+ pcalau12i_pc = pc - 12;
+ break;
+ default:
+ pcalau12i_pc = pc;
+ break;
+ }
+ uint64_t result = (dest & ~0xfffULL) - (pcalau12i_pc & ~0xfffULL);
+ if (dest & 0x800)
+ result += 0x1000 - 0x1'0000'0000;
+ if (result & 0x8000'0000)
+ result += 0x1'0000'0000;
+ return result;
+}
+
void RuntimeDyldELF::resolveLoongArch64Relocation(const SectionEntry &Section,
uint64_t Offset,
uint64_t Value, uint32_t Type,
@@ -789,10 +815,7 @@ void RuntimeDyldELF::resolveLoongArch64Relocation(const SectionEntry &Section,
case ELF::R_LARCH_GOT_PC_HI20:
case ELF::R_LARCH_PCALA_HI20: {
uint64_t Target = Value + Addend;
- uint64_t TargetPage =
- (Target + (Target & 0x800)) & ~static_cast<uint64_t>(0xfff);
- uint64_t PCPage = FinalAddress & ~static_cast<uint64_t>(0xfff);
- int64_t PageDelta = TargetPage - PCPage;
+ int64_t PageDelta = getLoongArchPageDelta(Target, FinalAddress, Type);
auto Instr = support::ulittle32_t::ref(TargetPtr);
uint32_t Imm31_12 = extractBits(PageDelta, /*Hi=*/31, /*Lo=*/12) << 5;
Instr = (Instr & 0xfe00001f) | Imm31_12;
@@ -806,6 +829,24 @@ void RuntimeDyldELF::resolveLoongArch64Relocation(const SectionEntry &Section,
Instr = (Instr & 0xffc003ff) | Imm11_0;
break;
}
+ case ELF::R_LARCH_GOT64_PC_LO20:
+ case ELF::R_LARCH_PCALA64_LO20: {
+ uint64_t Target = Value + Addend;
+ int64_t PageDelta = getLoongArchPageDelta(Target, FinalAddress, Type);
+ auto Instr = support::ulittle32_t::ref(TargetPtr);
+ uint32_t Imm51_32 = extractBits(PageDelta, /*Hi=*/51, /*Lo=*/32) << 5;
+ Instr = (Instr & 0xfe00001f) | Imm51_32;
+ break;
+ }
+ case ELF::R_LARCH_GOT64_PC_HI12:
+ case ELF::R_LARCH_PCALA64_HI12: {
+ uint64_t Target = Value + Addend;
+ int64_t PageDelta = getLoongArchPageDelta(Target, FinalAddress, Type);
+ auto Instr = support::ulittle32_t::ref(TargetPtr);
+ uint32_t Imm63_52 = extractBits(PageDelta, /*Hi=*/63, /*Lo=*/52) << 10;
+ Instr = (Instr & 0xffc003ff) | Imm63_52;
+ break;
+ }
case ELF::R_LARCH_ABS_HI20: {
uint64_t Target = Value + Addend;
auto Instr = support::ulittle32_t::ref(TargetPtr);
@@ -1758,7 +1799,9 @@ RuntimeDyldELF::processRelocationRef(
MemMgr.allowStubAllocation()) {
resolveLoongArch64Branch(SectionID, Value, RelI, Stubs);
} else if (RelType == ELF::R_LARCH_GOT_PC_HI20 ||
- RelType == ELF::R_LARCH_GOT_PC_LO12) {
+ RelType == ELF::R_LARCH_GOT_PC_LO12 ||
+ RelType == ELF::R_LARCH_GOT64_PC_HI12 ||
+ RelType == ELF::R_LARCH_GOT64_PC_LO20) {
uint64_t GOTOffset = findOrAllocGOTEntry(Value, ELF::R_LARCH_64);
resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend,
RelType);
@@ -2936,7 +2979,9 @@ bool RuntimeDyldELF::relocationNeedsGot(const RelocationRef &R) const {
if (Arch == Triple::loongarch64)
return RelTy == ELF::R_LARCH_GOT_PC_HI20 ||
- RelTy == ELF::R_LARCH_GOT_PC_LO12;
+ RelTy == ELF::R_LARCH_GOT_PC_LO12 ||
+ RelTy == ELF::R_LARCH_GOT64_PC_HI12 ||
+ RelTy == ELF::R_LARCH_GOT64_PC_LO20;
if (Arch == Triple::x86_64)
return RelTy == ELF::R_X86_64_GOTPCREL ||
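
The new getLoongArchPageDelta helper replaces the earlier TargetPage/PCPage arithmetic and also covers the 64-bit PCALA/GOT relocations. A standalone copy of the same math that can be compiled and experimented with; the addresses in main are made up, and the relocation-type switch is reduced to a pcalau12i PC supplied by the caller.

#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Same math as the helper added to RuntimeDyldELF.
uint64_t getLoongArchPageDelta(uint64_t Dest, uint64_t Pcalau12iPC) {
  uint64_t Result = (Dest & ~0xfffULL) - (Pcalau12iPC & ~0xfffULL);
  if (Dest & 0x800)
    Result += 0x1000 - 0x1'0000'0000;
  if (Result & 0x8000'0000)
    Result += 0x1'0000'0000;
  return Result;
}

int main() {
  // Example addresses (made up): the target's bit 11 is set, which is the
  // case the +0x800 rounding in pcalau12i has to compensate for.
  uint64_t Dest = 0x120004800, PC = 0x120000010;
  uint64_t Delta = getLoongArchPageDelta(Dest, PC);
  std::printf("page delta = 0x%" PRIx64 "\n", Delta);
  // The pcalau12i immediate is bits [31:12] of the delta; the lo12 addend is
  // taken from Dest itself and sign-extends, which is why the delta was
  // adjusted above.
  std::printf("hi20 = 0x%" PRIx64 "\n", (Delta >> 12) & 0xfffff);
}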
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 145ef10..e5a4e1e 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -404,6 +404,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
break;
case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break;
case CallingConv::AMDGPU_Gfx: Out << "amdgpu_gfx"; break;
+ case CallingConv::AMDGPU_Gfx_WholeWave:
+ Out << "amdgpu_gfx_whole_wave";
+ break;
case CallingConv::M68k_RTD: Out << "m68k_rtdcc"; break;
case CallingConv::RISCV_VectorCall:
Out << "riscv_vector_cc";
@@ -2398,8 +2401,9 @@ static void writeDIFile(raw_ostream &Out, const DIFile *N, AsmWriterContext &) {
// Print all values for checksum together, or not at all.
if (N->getChecksum())
Printer.printChecksum(*N->getChecksum());
- Printer.printString("source", N->getSource().value_or(StringRef()),
- /* ShouldSkipEmpty */ true);
+ if (N->getSource())
+ Printer.printString("source", *N->getSource(),
+ /* ShouldSkipEmpty */ false);
Out << ")";
}
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 86285a0..28ed1e5 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1310,6 +1310,18 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
return true;
}
break;
+ case 'l':
+ if (Name.starts_with("lifetime.start") ||
+ Name.starts_with("lifetime.end")) {
+ // Unless remangling is required, do not upgrade the function declaration,
+ // but do upgrade the calls.
+ if (auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F))
+ NewFn = *Result;
+ else
+ NewFn = F;
+ return true;
+ }
+ break;
case 'm': {
// Updating the memory intrinsics (memcpy/memmove/memset) that have an
// alignment parameter to embedding the alignment as an attribute of
@@ -1629,7 +1641,6 @@ bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn,
NewFn = nullptr;
bool Upgraded =
upgradeIntrinsicFunction1(F, NewFn, CanUpgradeDebugIntrinsicsToRecords);
- assert(F != NewFn && "Intrinsic function upgraded to the same function");
// Upgrade intrinsic attributes. This does not change the function.
if (NewFn)
@@ -4570,6 +4581,9 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
}
const auto &DefaultCase = [&]() -> void {
+ if (F == NewFn)
+ return;
+
if (CI->getFunctionType() == NewFn->getFunctionType()) {
// Handle generic mangling change.
assert(
@@ -5109,6 +5123,31 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
MTI->setSourceAlignment(Align->getMaybeAlignValue());
break;
}
+
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end: {
+ Value *Size = CI->getArgOperand(0);
+ Value *Ptr = CI->getArgOperand(1);
+ if (isa<AllocaInst>(Ptr)) {
+ DefaultCase();
+ return;
+ }
+
+ // Try to strip pointer casts so that the lifetime marker applies to an alloca.
+ Ptr = Ptr->stripPointerCasts();
+ if (isa<AllocaInst>(Ptr)) {
+ // Don't use NewFn, as we might have looked through an addrspacecast.
+ if (NewFn->getIntrinsicID() == Intrinsic::lifetime_start)
+ NewCall = Builder.CreateLifetimeStart(Ptr, cast<ConstantInt>(Size));
+ else
+ NewCall = Builder.CreateLifetimeEnd(Ptr, cast<ConstantInt>(Size));
+ break;
+ }
+
+ // Otherwise remove the lifetime marker.
+ CI->eraseFromParent();
+ return;
+ }
}
assert(NewCall && "Should have either set this variable or returned through "
"the default case");
@@ -5131,7 +5170,8 @@ void llvm::UpgradeCallsToIntrinsic(Function *F) {
UpgradeIntrinsicCall(CB, NewFn);
// Remove old function, no longer used, from the module.
- F->eraseFromParent();
+ if (F != NewFn)
+ F->eraseFromParent();
}
}
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 8fb33c3..ab8ecee 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -45,25 +45,6 @@ using namespace llvm;
using namespace llvm::at;
using namespace llvm::dwarf;
-TinyPtrVector<DbgDeclareInst *> llvm::findDbgDeclares(Value *V) {
- // This function is hot. Check whether the value has any metadata to avoid a
- // DenseMap lookup. This check is a bitfield datamember lookup.
- if (!V->isUsedByMetadata())
- return {};
- auto *L = ValueAsMetadata::getIfExists(V);
- if (!L)
- return {};
- auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L);
- if (!MDV)
- return {};
-
- TinyPtrVector<DbgDeclareInst *> Declares;
- for (User *U : MDV->users())
- if (auto *DDI = dyn_cast<DbgDeclareInst>(U))
- Declares.push_back(DDI);
-
- return Declares;
-}
TinyPtrVector<DbgVariableRecord *> llvm::findDVRDeclares(Value *V) {
// This function is hot. Check whether the value has any metadata to avoid a
// DenseMap lookup. This check is a bitfield datamember lookup.
@@ -98,42 +79,31 @@ TinyPtrVector<DbgVariableRecord *> llvm::findDVRValues(Value *V) {
return Values;
}
-template <typename IntrinsicT, bool DbgAssignAndValuesOnly>
+template <bool DbgAssignAndValuesOnly>
static void
-findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
- SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
+findDbgIntrinsics(Value *V,
+ SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) {
// This function is hot. Check whether the value has any metadata to avoid a
// DenseMap lookup.
if (!V->isUsedByMetadata())
return;
- LLVMContext &Ctx = V->getContext();
// TODO: If this value appears multiple times in a DIArgList, we should still
- // only add the owning DbgValueInst once; use this set to track ArgListUsers.
+ // only add the owning dbg.value once; use this set to track ArgListUsers.
// This behaviour can be removed when we can automatically remove duplicates.
// V will also appear twice in a dbg.assign if it is used in both the value
// and address components.
- SmallPtrSet<IntrinsicT *, 4> EncounteredIntrinsics;
SmallPtrSet<DbgVariableRecord *, 4> EncounteredDbgVariableRecords;
- /// Append IntrinsicT users of MetadataAsValue(MD).
- auto AppendUsers = [&Ctx, &EncounteredIntrinsics,
- &EncounteredDbgVariableRecords, &Result,
- DbgVariableRecords](Metadata *MD) {
- if (auto *MDV = MetadataAsValue::getIfExists(Ctx, MD)) {
- for (User *U : MDV->users())
- if (IntrinsicT *DVI = dyn_cast<IntrinsicT>(U))
- if (EncounteredIntrinsics.insert(DVI).second)
- Result.push_back(DVI);
- }
- if (!DbgVariableRecords)
- return;
+ /// Append users of MetadataAsValue(MD).
+ auto AppendUsers = [&EncounteredDbgVariableRecords,
+ &DbgVariableRecords](Metadata *MD) {
// Get DbgVariableRecords that use this as a single value.
if (LocalAsMetadata *L = dyn_cast<LocalAsMetadata>(MD)) {
for (DbgVariableRecord *DVR : L->getAllDbgVariableRecordUsers()) {
if (!DbgAssignAndValuesOnly || DVR->isDbgValue() || DVR->isDbgAssign())
if (EncounteredDbgVariableRecords.insert(DVR).second)
- DbgVariableRecords->push_back(DVR);
+ DbgVariableRecords.push_back(DVR);
}
}
};
@@ -142,29 +112,23 @@ findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
AppendUsers(L);
for (Metadata *AL : L->getAllArgListUsers()) {
AppendUsers(AL);
- if (!DbgVariableRecords)
- continue;
DIArgList *DI = cast<DIArgList>(AL);
for (DbgVariableRecord *DVR : DI->getAllDbgVariableRecordUsers())
if (!DbgAssignAndValuesOnly || DVR->isDbgValue() || DVR->isDbgAssign())
if (EncounteredDbgVariableRecords.insert(DVR).second)
- DbgVariableRecords->push_back(DVR);
+ DbgVariableRecords.push_back(DVR);
}
}
}
void llvm::findDbgValues(
- SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V,
- SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
- findDbgIntrinsics<DbgValueInst, /*DbgAssignAndValuesOnly=*/true>(
- DbgValues, V, DbgVariableRecords);
+ Value *V, SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) {
+ findDbgIntrinsics</*DbgAssignAndValuesOnly=*/true>(V, DbgVariableRecords);
}
void llvm::findDbgUsers(
- SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers, Value *V,
- SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
- findDbgIntrinsics<DbgVariableIntrinsic, /*DbgAssignAndValuesOnly=*/false>(
- DbgUsers, V, DbgVariableRecords);
+ Value *V, SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) {
+ findDbgIntrinsics</*DbgAssignAndValuesOnly=*/false>(V, DbgVariableRecords);
}
DISubprogram *llvm::getDISubprogram(const MDNode *Scope) {
@@ -173,18 +137,6 @@ DISubprogram *llvm::getDISubprogram(const MDNode *Scope) {
return nullptr;
}
-DebugLoc llvm::getDebugValueLoc(DbgVariableIntrinsic *DII) {
- // Original dbg.declare must have a location.
- const DebugLoc &DeclareLoc = DII->getDebugLoc();
- MDNode *Scope = DeclareLoc.getScope();
- DILocation *InlinedAt = DeclareLoc.getInlinedAt();
- // Because no machine insts can come from debug intrinsics, only the scope
- // and inlinedAt is significant. Zero line numbers are used in case this
- // DebugLoc leaks into any adjacent instructions. Produce an unknown location
- // with the correct scope / inlinedAt fields.
- return DILocation::get(DII->getContext(), 0, 0, Scope, InlinedAt);
-}
-
DebugLoc llvm::getDebugValueLoc(DbgVariableRecord *DVR) {
// Original dbg.declare must have a location.
const DebugLoc &DeclareLoc = DVR->getDebugLoc();
@@ -852,19 +804,6 @@ void DebugTypeInfoRemoval::traverse(MDNode *N) {
bool llvm::stripNonLineTableDebugInfo(Module &M) {
bool Changed = false;
- // First off, delete the debug intrinsics.
- auto RemoveUses = [&](StringRef Name) {
- if (auto *DbgVal = M.getFunction(Name)) {
- while (!DbgVal->use_empty())
- cast<Instruction>(DbgVal->user_back())->eraseFromParent();
- DbgVal->eraseFromParent();
- Changed = true;
- }
- };
- RemoveUses("llvm.dbg.declare");
- RemoveUses("llvm.dbg.label");
- RemoveUses("llvm.dbg.value");
-
// Delete non-CU debug info named metadata nodes.
for (auto NMI = M.named_metadata_begin(), NME = M.named_metadata_end();
NMI != NME;) {
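
After this change the debug-user queries report only DbgVariableRecords; the intrinsic result vector and the pointer out-parameter are gone. A sketch of an updated call site, assuming it is compiled inside the LLVM tree; the helper name is illustrative.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugProgramInstruction.h"
#include "llvm/IR/Value.h"

// Count the debug records that use V as a location operand. Before this patch
// the call needed an extra SmallVector<DbgVariableIntrinsic *> and took the
// record vector by pointer.
static unsigned countDebugUsers(llvm::Value *V) {
  llvm::SmallVector<llvm::DbgVariableRecord *> DbgUsers;
  llvm::findDbgUsers(V, DbgUsers);
  return static_cast<unsigned>(DbgUsers.size());
}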
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 7a03663..fc06745 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1232,6 +1232,7 @@ bool llvm::CallingConv::supportsNonVoidReturnType(CallingConv::ID CC) {
case CallingConv::AArch64_SVE_VectorCall:
case CallingConv::WASM_EmscriptenInvoke:
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
case CallingConv::M68k_INTR:
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index f0448b0..0dbd07f 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -1303,6 +1303,24 @@ static void addRange(SmallVectorImpl<ConstantInt *> &EndPoints,
EndPoints.push_back(High);
}
+MDNode *MDNode::getMergedCalleeTypeMetadata(const MDNode *A, const MDNode *B) {
+ // Drop the callee_type metadata if either of the call instructions does not
+ // have it.
+ if (!A || !B)
+ return nullptr;
+ SmallVector<Metadata *, 8> AB;
+ SmallPtrSet<Metadata *, 8> MergedCallees;
+ auto AddUniqueCallees = [&AB, &MergedCallees](const MDNode *N) {
+ for (Metadata *MD : N->operands()) {
+ if (MergedCallees.insert(MD).second)
+ AB.push_back(MD);
+ }
+ };
+ AddUniqueCallees(A);
+ AddUniqueCallees(B);
+ return MDNode::get(A->getContext(), AB);
+}
+
MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
// Given two ranges, we want to compute the union of the ranges. This
// is slightly complicated by having to combine the intervals and merge
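
getMergedCalleeTypeMetadata above concatenates the two operand lists while keeping only the first occurrence of each callee type entry. The same first-seen merge in plain C++, with strings standing in for Metadata pointers.

#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

// Merge B after A, preserving order of first appearance; mirrors the
// SmallPtrSet-based dedup in getMergedCalleeTypeMetadata.
std::vector<std::string> mergeUnique(const std::vector<std::string> &A,
                                     const std::vector<std::string> &B) {
  std::vector<std::string> AB;
  std::unordered_set<std::string> MergedCallees;
  auto AddUniqueCallees = [&](const std::vector<std::string> &N) {
    for (const std::string &MD : N)
      if (MergedCallees.insert(MD).second)
        AB.push_back(MD);
  };
  AddUniqueCallees(A);
  AddUniqueCallees(B);
  return AB;
}

int main() {
  for (const std::string &S : mergeUnique({"f", "g"}, {"g", "h"}))
    std::cout << S << ' '; // f g h
  std::cout << '\n';
}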
diff --git a/llvm/lib/IR/PassInstrumentation.cpp b/llvm/lib/IR/PassInstrumentation.cpp
index 94ad124..70bbe8f 100644
--- a/llvm/lib/IR/PassInstrumentation.cpp
+++ b/llvm/lib/IR/PassInstrumentation.cpp
@@ -23,6 +23,7 @@ template struct LLVM_EXPORT_TEMPLATE Any::TypeId<const Loop *>;
void PassInstrumentationCallbacks::addClassToPassName(StringRef ClassName,
StringRef PassName) {
+ assert(!PassName.empty() && "PassName can't be empty!");
ClassToPassName.try_emplace(ClassName, PassName.str());
}
@@ -33,7 +34,10 @@ PassInstrumentationCallbacks::getPassNameForClassName(StringRef ClassName) {
Fn();
ClassToPassNameCallbacks.clear();
}
- return ClassToPassName[ClassName];
+ auto PassNameIter = ClassToPassName.find(ClassName);
+ if (PassNameIter != ClassToPassName.end())
+ return PassNameIter->second;
+ return {};
}
AnalysisKey PassInstrumentationAnalysis::Key;
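
The getPassNameForClassName fix avoids the map's operator[], which default-constructs and inserts an entry for every unknown key, whereas find() leaves the map untouched. The same distinction holds for the StringMap used here; shown below with a plain std::map.

#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<std::string, std::string> ClassToPassName = {
      {"NoOpModulePass", "no-op-module"}};

  // operator[] on a missing key inserts an empty mapping as a side effect.
  std::string ViaIndex = ClassToPassName["UnknownPass"];
  std::cout << "size after []:   " << ClassToPassName.size() << '\n'; // 2

  // find() only looks up; unknown keys leave the map unchanged.
  auto It = ClassToPassName.find("AnotherUnknownPass");
  std::string ViaFind =
      It != ClassToPassName.end() ? It->second : std::string();
  std::cout << "size after find: " << ClassToPassName.size() << '\n'; // 2
  (void)ViaIndex;
  (void)ViaFind;
}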
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index 02c16e2..5928c89 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -582,16 +582,11 @@ void Value::replaceUsesWithIf(Value *New,
}
}
-/// Replace llvm.dbg.* uses of MetadataAsValue(ValueAsMetadata(V)) outside BB
+/// Replace debug record uses of MetadataAsValue(ValueAsMetadata(V)) outside BB
/// with New.
static void replaceDbgUsesOutsideBlock(Value *V, Value *New, BasicBlock *BB) {
- SmallVector<DbgVariableIntrinsic *> DbgUsers;
SmallVector<DbgVariableRecord *> DPUsers;
- findDbgUsers(DbgUsers, V, &DPUsers);
- for (auto *DVI : DbgUsers) {
- if (DVI->getParent() != BB)
- DVI->replaceVariableLocationOp(V, New);
- }
+ findDbgUsers(V, DPUsers);
for (auto *DVR : DPUsers) {
DbgMarker *Marker = DVR->getMarker();
if (Marker->getParent() != BB)
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 8c8ed3c..3ff9895 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -531,6 +531,7 @@ private:
void visitCallStackMetadata(MDNode *MD);
void visitMemProfMetadata(Instruction &I, MDNode *MD);
void visitCallsiteMetadata(Instruction &I, MDNode *MD);
+ void visitCalleeTypeMetadata(Instruction &I, MDNode *MD);
void visitDIAssignIDMetadata(Instruction &I, MDNode *MD);
void visitMMRAMetadata(Instruction &I, MDNode *MD);
void visitAnnotationMetadata(MDNode *Annotation);
@@ -2978,6 +2979,16 @@ void Verifier::visitFunction(const Function &F) {
"perfect forwarding!",
&F);
break;
+ case CallingConv::AMDGPU_Gfx_WholeWave:
+ Check(!F.arg_empty() && F.arg_begin()->getType()->isIntegerTy(1),
+ "Calling convention requires first argument to be i1", &F);
+ Check(!F.arg_begin()->hasInRegAttr(),
+ "Calling convention requires first argument to not be inreg", &F);
+ Check(!F.isVarArg(),
+ "Calling convention does not support varargs or "
+ "perfect forwarding!",
+ &F);
+ break;
}
// Check that the argument values match the function type for this function...
@@ -5193,6 +5204,33 @@ void Verifier::visitCallsiteMetadata(Instruction &I, MDNode *MD) {
visitCallStackMetadata(MD);
}
+static inline bool isConstantIntMetadataOperand(const Metadata *MD) {
+ if (auto *VAL = dyn_cast<ValueAsMetadata>(MD))
+ return isa<ConstantInt>(VAL->getValue());
+ return false;
+}
+
+void Verifier::visitCalleeTypeMetadata(Instruction &I, MDNode *MD) {
+ Check(isa<CallBase>(I), "!callee_type metadata should only exist on calls",
+ &I);
+ for (Metadata *Op : MD->operands()) {
+ Check(isa<MDNode>(Op),
+ "The callee_type metadata must be a list of type metadata nodes", Op);
+ auto *TypeMD = cast<MDNode>(Op);
+ Check(TypeMD->getNumOperands() == 2,
+ "Well-formed generalized type metadata must contain exactly two "
+ "operands",
+ Op);
+ Check(isConstantIntMetadataOperand(TypeMD->getOperand(0)) &&
+ mdconst::extract<ConstantInt>(TypeMD->getOperand(0))->isZero(),
+ "The first operand of type metadata for functions must be zero", Op);
+ Check(TypeMD->hasGeneralizedMDString(),
+ "Only generalized type metadata can be part of the callee_type "
+ "metadata list",
+ Op);
+ }
+}
+
void Verifier::visitAnnotationMetadata(MDNode *Annotation) {
Check(isa<MDTuple>(Annotation), "annotation must be a tuple");
Check(Annotation->getNumOperands() >= 1,
@@ -5470,6 +5508,9 @@ void Verifier::visitInstruction(Instruction &I) {
if (MDNode *MD = I.getMetadata(LLVMContext::MD_callsite))
visitCallsiteMetadata(I, MD);
+ if (MDNode *MD = I.getMetadata(LLVMContext::MD_callee_type))
+ visitCalleeTypeMetadata(I, MD);
+
if (MDNode *MD = I.getMetadata(LLVMContext::MD_DIAssignID))
visitDIAssignIDMetadata(I, MD);
@@ -6627,6 +6668,54 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"invalid vector type for format", &Call, Src1, Call.getArgOperand(5));
break;
}
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
+ Value *Src0 = Call.getArgOperand(1);
+ Value *Src1 = Call.getArgOperand(3);
+
+ unsigned FmtA = cast<ConstantInt>(Call.getArgOperand(0))->getZExtValue();
+ unsigned FmtB = cast<ConstantInt>(Call.getArgOperand(2))->getZExtValue();
+ Check(FmtA <= 4, "invalid value for matrix format", Call,
+ Call.getArgOperand(0));
+ Check(FmtB <= 4, "invalid value for matrix format", Call,
+ Call.getArgOperand(2));
+
+ // AMDGPU::MatrixFMT values
+ auto getFormatNumRegs = [](unsigned FormatVal) {
+ switch (FormatVal) {
+ case 0:
+ case 1:
+ return 16u;
+ case 2:
+ case 3:
+ return 12u;
+ case 4:
+ return 8u;
+ default:
+ llvm_unreachable("invalid format value");
+ }
+ };
+
+ auto isValidSrcASrcBVector = [](FixedVectorType *Ty) {
+ if (!Ty || !Ty->getElementType()->isIntegerTy(32))
+ return false;
+ unsigned NumElts = Ty->getNumElements();
+ return NumElts == 16 || NumElts == 12 || NumElts == 8;
+ };
+
+ auto *Src0Ty = dyn_cast<FixedVectorType>(Src0->getType());
+ auto *Src1Ty = dyn_cast<FixedVectorType>(Src1->getType());
+ Check(isValidSrcASrcBVector(Src0Ty),
+ "operand 1 must be 8, 12 or 16 element i32 vector", &Call, Src0);
+ Check(isValidSrcASrcBVector(Src1Ty),
+ "operand 3 must be 8, 12 or 16 element i32 vector", &Call, Src1);
+
+ // Permit excess registers for the format.
+ Check(Src0Ty->getNumElements() >= getFormatNumRegs(FmtA),
+ "invalid vector type for format", &Call, Src0, Call.getArgOperand(0));
+ Check(Src1Ty->getNumElements() >= getFormatNumRegs(FmtB),
+ "invalid vector type for format", &Call, Src1, Call.getArgOperand(2));
+ break;
+ }
case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: {
Value *V = Call.getArgOperand(0);
@@ -6679,6 +6768,11 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"llvm.threadlocal.address operand isThreadLocal() must be true");
break;
}
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ Check(isa<AllocaInst>(Call.getArgOperand(1)),
+ "llvm.lifetime.start/end can only be used on alloca", &Call);
+ break;
};
// Verify that there aren't any unmediated control transfers between funclets.
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index 3e96bdf..2b56e2a 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -196,12 +196,12 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const {
switch (F.getKind()) {
case MCFragment::FT_Data:
case MCFragment::FT_Relaxable:
+ case MCFragment::FT_Align:
case MCFragment::FT_LEB:
case MCFragment::FT_Dwarf:
case MCFragment::FT_DwarfFrame:
case MCFragment::FT_CVInlineLines:
case MCFragment::FT_CVDefRange:
- case MCFragment::FT_PseudoProbe:
return F.getSize();
case MCFragment::FT_Fill: {
auto &FF = cast<MCFillFragment>(F);
@@ -227,28 +227,6 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const {
case MCFragment::FT_SymbolId:
return 4;
- case MCFragment::FT_Align: {
- const MCAlignFragment &AF = cast<MCAlignFragment>(F);
- unsigned Offset = getFragmentOffset(AF);
- unsigned Size = offsetToAlignment(Offset, AF.getAlignment());
-
- // Insert extra Nops for code alignment if the target define
- // shouldInsertExtraNopBytesForCodeAlign target hook.
- if (AF.getParent()->useCodeAlign() && AF.hasEmitNops() &&
- getBackend().shouldInsertExtraNopBytesForCodeAlign(AF, Size))
- return Size;
-
- // If we are padding with nops, force the padding to be larger than the
- // minimum nop size.
- if (Size > 0 && AF.hasEmitNops()) {
- while (Size % getBackend().getMinimumNopSize())
- Size += AF.getAlignment().value();
- }
- if (Size > AF.getMaxBytesToEmit())
- return 0;
- return Size;
- }
-
case MCFragment::FT_Org: {
const MCOrgFragment &OF = cast<MCOrgFragment>(F);
MCValue Value;
@@ -384,7 +362,7 @@ uint64_t MCAssembler::getSectionAddressSize(const MCSection &Sec) const {
uint64_t MCAssembler::getSectionFileSize(const MCSection &Sec) const {
// Virtual sections have no file size.
- if (Sec.isVirtualSection())
+ if (Sec.isBssSection())
return 0;
return getSectionAddressSize(Sec);
}
@@ -424,8 +402,7 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
case MCFragment::FT_Dwarf:
case MCFragment::FT_DwarfFrame:
case MCFragment::FT_CVInlineLines:
- case MCFragment::FT_CVDefRange:
- case MCFragment::FT_PseudoProbe: {
+ case MCFragment::FT_CVDefRange: {
if (F.getKind() == MCFragment::FT_Data)
++stats::EmittedDataFragments;
else if (F.getKind() == MCFragment::FT_Relaxable)
@@ -433,48 +410,45 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
const auto &EF = cast<MCFragment>(F);
OS << StringRef(EF.getContents().data(), EF.getContents().size());
OS << StringRef(EF.getVarContents().data(), EF.getVarContents().size());
- break;
- }
+ } break;
+
case MCFragment::FT_Align: {
++stats::EmittedAlignFragments;
- const MCAlignFragment &AF = cast<MCAlignFragment>(F);
- assert(AF.getFillLen() && "Invalid virtual align in concrete fragment!");
+ OS << StringRef(F.getContents().data(), F.getContents().size());
+ assert(F.getAlignFillLen() &&
+ "Invalid virtual align in concrete fragment!");
- uint64_t Count = FragmentSize / AF.getFillLen();
- assert(FragmentSize % AF.getFillLen() == 0 &&
+ uint64_t Count = (FragmentSize - F.getFixedSize()) / F.getAlignFillLen();
+ assert((FragmentSize - F.getFixedSize()) % F.getAlignFillLen() == 0 &&
"computeFragmentSize computed size is incorrect");
- // See if we are aligning with nops, and if so do that first to try to fill
- // the Count bytes. Then if that did not fill any bytes or there are any
- // bytes left to fill use the Value and ValueSize to fill the rest.
- // If we are aligning with nops, ask that target to emit the right data.
- if (AF.hasEmitNops()) {
- if (!Asm.getBackend().writeNopData(OS, Count, AF.getSubtargetInfo()))
- report_fatal_error("unable to write nop sequence of " +
- Twine(Count) + " bytes");
- break;
- }
-
- // Otherwise, write out in multiples of the value size.
- for (uint64_t i = 0; i != Count; ++i) {
- switch (AF.getFillLen()) {
- default: llvm_unreachable("Invalid size!");
- case 1:
- OS << char(AF.getFill());
- break;
- case 2:
- support::endian::write<uint16_t>(OS, AF.getFill(), Endian);
- break;
- case 4:
- support::endian::write<uint32_t>(OS, AF.getFill(), Endian);
- break;
- case 8:
- support::endian::write<uint64_t>(OS, AF.getFill(), Endian);
- break;
+ // In the nops mode, call the backend hook to write `Count` nops.
+ if (F.hasAlignEmitNops()) {
+ if (!Asm.getBackend().writeNopData(OS, Count, F.getSubtargetInfo()))
+ reportFatalInternalError("unable to write nop sequence of " +
+ Twine(Count) + " bytes");
+ } else {
+ // Otherwise, write out in multiples of the value size.
+ for (uint64_t i = 0; i != Count; ++i) {
+ switch (F.getAlignFillLen()) {
+ default:
+ llvm_unreachable("Invalid size!");
+ case 1:
+ OS << char(F.getAlignFill());
+ break;
+ case 2:
+ support::endian::write<uint16_t>(OS, F.getAlignFill(), Endian);
+ break;
+ case 4:
+ support::endian::write<uint32_t>(OS, F.getAlignFill(), Endian);
+ break;
+ case 8:
+ support::endian::write<uint64_t>(OS, F.getAlignFill(), Endian);
+ break;
+ }
}
}
- break;
- }
+ } break;
case MCFragment::FT_Fill: {
++stats::EmittedFillFragments;
@@ -585,42 +559,45 @@ void MCAssembler::writeSectionData(raw_ostream &OS,
const MCSection *Sec) const {
assert(getBackendPtr() && "Expected assembler backend");
- // Ignore virtual sections.
- if (Sec->isVirtualSection()) {
+ if (Sec->isBssSection()) {
assert(getSectionFileSize(*Sec) == 0 && "Invalid size for section!");
- // Check that contents are only things legal inside a virtual section.
+ // Ensure no fixups or non-zero bytes are written to BSS sections, catching
+ // errors in both input assembly code and MCStreamer API usage. Location is
+ // not tracked for efficiency.
+ auto Fn = [](char c) { return c != 0; };
for (const MCFragment &F : *Sec) {
+ bool HasNonZero = false;
switch (F.getKind()) {
- default: llvm_unreachable("Invalid fragment in virtual section!");
- case MCFragment::FT_Data: {
- // Check that we aren't trying to write a non-zero contents (or fixups)
- // into a virtual section. This is to support clients which use standard
- // directives to fill the contents of virtual sections.
- if (F.getFixups().size() || F.getVarFixups().size())
- reportError(SMLoc(), Sec->getVirtualSectionKind() + " section '" +
- Sec->getName() + "' cannot have fixups");
- for (char C : F.getContents())
- if (C) {
- reportError(SMLoc(), Sec->getVirtualSectionKind() + " section '" +
- Sec->getName() +
- "' cannot have non-zero initializers");
- break;
- }
+ default:
+ reportFatalInternalError("BSS section '" + Sec->getName() +
+ "' contains invalid fragment");
+ break;
+ case MCFragment::FT_Data:
+ case MCFragment::FT_Relaxable:
+ HasNonZero =
+ any_of(F.getContents(), Fn) || any_of(F.getVarContents(), Fn);
break;
- }
case MCFragment::FT_Align:
- // Check that we aren't trying to write a non-zero value into a virtual
- // section.
- assert((cast<MCAlignFragment>(F).getFillLen() == 0 ||
- cast<MCAlignFragment>(F).getFill() == 0) &&
- "Invalid align in virtual section!");
+ // Disallowed for API usage. AsmParser changes non-zero fill values to
+ // 0.
+ assert(F.getAlignFill() == 0 && "Invalid align in virtual section!");
break;
case MCFragment::FT_Fill:
- assert((cast<MCFillFragment>(F).getValue() == 0) &&
- "Invalid fill in virtual section!");
+ HasNonZero = cast<MCFillFragment>(F).getValue() != 0;
break;
case MCFragment::FT_Org:
+ HasNonZero = cast<MCOrgFragment>(F).getValue() != 0;
+ break;
+ }
+ if (HasNonZero) {
+ reportError(SMLoc(), "BSS section '" + Sec->getName() +
+ "' cannot have non-zero bytes");
+ break;
+ }
+ if (F.getFixups().size() || F.getVarFixups().size()) {
+ reportError(SMLoc(),
+ "BSS section '" + Sec->getName() + "' cannot have fixups");
break;
}
}
@@ -724,34 +701,25 @@ void MCAssembler::layout() {
for (MCSection &Sec : *this) {
for (MCFragment &F : Sec) {
// Process fragments with fixups here.
- if (F.isEncoded()) {
- auto Contents = F.getContents();
- for (MCFixup &Fixup : F.getFixups()) {
+ auto Contents = F.getContents();
+ for (MCFixup &Fixup : F.getFixups()) {
+ uint64_t FixedValue;
+ MCValue Target;
+ evaluateFixup(F, Fixup, Target, FixedValue,
+ /*RecordReloc=*/true, Contents);
+ }
+ if (F.getVarFixups().size()) {
+ // In the variable part, fixup offsets are relative to the fixed part's
+ // start. Extend the variable contents to the left to account for the
+ // fixed part size.
+ Contents = MutableArrayRef(F.getParent()->ContentStorage)
+ .slice(F.VarContentStart - Contents.size(), F.getSize());
+ for (MCFixup &Fixup : F.getVarFixups()) {
uint64_t FixedValue;
MCValue Target;
evaluateFixup(F, Fixup, Target, FixedValue,
/*RecordReloc=*/true, Contents);
}
- // In the variable part, fixup offsets are relative to the fixed part's
- // start. Extend the variable contents to the left to account for the
- // fixed part size.
- auto VarFixups = F.getVarFixups();
- if (VarFixups.size()) {
- Contents =
- MutableArrayRef(F.getParent()->ContentStorage)
- .slice(F.VarContentStart - Contents.size(), F.getSize());
- for (MCFixup &Fixup : VarFixups) {
- uint64_t FixedValue;
- MCValue Target;
- evaluateFixup(F, Fixup, Target, FixedValue,
- /*RecordReloc=*/true, Contents);
- }
- }
- } else if (auto *AF = dyn_cast<MCAlignFragment>(&F)) {
- // For RISC-V linker relaxation, an alignment relocation might be
- // needed.
- if (AF->hasEmitNops())
- getBackend().shouldInsertFixupForCodeAlign(*this, *AF);
}
}
}
@@ -955,15 +923,15 @@ bool MCAssembler::relaxDwarfCallFrameFragment(MCFragment &F) {
}
bool MCAssembler::relaxCVInlineLineTable(MCCVInlineLineTableFragment &F) {
- unsigned OldSize = F.getContents().size();
+ unsigned OldSize = F.getVarContents().size();
getContext().getCVContext().encodeInlineLineTable(*this, F);
- return OldSize != F.getContents().size();
+ return OldSize != F.getVarContents().size();
}
bool MCAssembler::relaxCVDefRange(MCCVDefRangeFragment &F) {
- unsigned OldSize = F.getContents().size();
+ unsigned OldSize = F.getVarContents().size();
getContext().getCVContext().encodeDefRange(*this, F);
- return OldSize != F.getContents().size();
+ return OldSize != F.getVarContents().size();
}
bool MCAssembler::relaxFill(MCFillFragment &F) {
@@ -974,22 +942,6 @@ bool MCAssembler::relaxFill(MCFillFragment &F) {
return true;
}
-bool MCAssembler::relaxPseudoProbeAddr(MCPseudoProbeAddrFragment &PF) {
- uint64_t OldSize = PF.getContents().size();
- int64_t AddrDelta;
- bool Abs = PF.getAddrDelta().evaluateKnownAbsolute(AddrDelta, *this);
- assert(Abs && "We created a pseudo probe with an invalid expression");
- (void)Abs;
- SmallVector<char, 8> Data;
- raw_svector_ostream OSE(Data);
-
- // AddrDelta is a signed integer
- encodeSLEB128(AddrDelta, OSE, OldSize);
- PF.setContents(Data);
- PF.clearFixups();
- return OldSize != Data.size();
-}
-
bool MCAssembler::relaxFragment(MCFragment &F) {
switch(F.getKind()) {
default:
@@ -1011,8 +963,6 @@ bool MCAssembler::relaxFragment(MCFragment &F) {
return relaxCVDefRange(cast<MCCVDefRangeFragment>(F));
case MCFragment::FT_Fill:
return relaxFill(cast<MCFillFragment>(F));
- case MCFragment::FT_PseudoProbe:
- return relaxPseudoProbeAddr(cast<MCPseudoProbeAddrFragment>(F));
}
}
@@ -1020,7 +970,32 @@ void MCAssembler::layoutSection(MCSection &Sec) {
uint64_t Offset = 0;
for (MCFragment &F : Sec) {
F.Offset = Offset;
- Offset += computeFragmentSize(F);
+ if (F.getKind() == MCFragment::FT_Align) {
+ Offset += F.getFixedSize();
+ unsigned Size = offsetToAlignment(Offset, F.getAlignment());
+ // In the nops mode, RISC-V style linker relaxation might adjust the size
+ // and add a fixup, even if `Size` is originally 0.
+ bool AlignFixup = false;
+ if (F.hasAlignEmitNops()) {
+ AlignFixup = getBackend().relaxAlign(F, Size);
+        // If the backend does not handle the fragment specially, pad with nops,
+        // but keep the padding a multiple of the minimum nop size.
+ if (!AlignFixup)
+ while (Size % getBackend().getMinimumNopSize())
+ Size += F.getAlignment().value();
+ }
+ if (!AlignFixup && Size > F.getAlignMaxBytesToEmit())
+ Size = 0;
+ // Update the variable tail size, offset by FixedSize to prevent ubsan
+ // pointer-overflow in evaluateFixup. The content is ignored.
+ F.VarContentStart = F.getFixedSize();
+ F.VarContentEnd = F.VarContentStart + Size;
+ if (F.VarContentEnd > F.getParent()->ContentStorage.size())
+ F.getParent()->ContentStorage.resize(F.VarContentEnd);
+ Offset += Size;
+ } else {
+ Offset += computeFragmentSize(F);
+ }
}
}
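For context on the new FT_Align handling in layoutSection above, here is a minimal standalone sketch of the padding arithmetic, assuming a hypothetical backend minimum nop size and ignoring the relaxAlign linker-relaxation path; it is an illustration, not the MC implementation.

#include <cstdint>

// Bytes needed to round Offset up to a multiple of Alignment.
static uint64_t offsetToAlignmentBytes(uint64_t Offset, uint64_t Alignment) {
  return (Alignment - (Offset % Alignment)) % Alignment;
}

// Padding an align fragment contributes at a given section offset.
static uint64_t alignPadding(uint64_t Offset, uint64_t Alignment, bool EmitNops,
                             uint64_t MinNopSize, uint64_t MaxBytesToEmit) {
  uint64_t Size = offsetToAlignmentBytes(Offset, Alignment);
  if (EmitNops)
    while (Size % MinNopSize) // keep the padding a multiple of the nop size
      Size += Alignment;
  return Size > MaxBytesToEmit ? 0 : Size;
}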
diff --git a/llvm/lib/MC/MCCodeView.cpp b/llvm/lib/MC/MCCodeView.cpp
index 1f98251..7d528a5 100644
--- a/llvm/lib/MC/MCCodeView.cpp
+++ b/llvm/lib/MC/MCCodeView.cpp
@@ -26,8 +26,10 @@ using namespace llvm;
using namespace llvm::codeview;
void CodeViewContext::finish() {
- if (StrTabFragment)
- StrTabFragment->setContents(StrTab);
+ if (!StrTabFragment)
+ return;
+ assert(StrTabFragment->getKind() == MCFragment::FT_Data);
+ StrTabFragment->setVarContents(StrTab);
}
/// This is a valid number for use with .cv_loc if we've already seen a .cv_file
@@ -166,8 +168,9 @@ void CodeViewContext::emitStringTable(MCObjectStreamer &OS) {
// somewhere else. If somebody wants two string tables in their .s file, one
// will just be empty.
if (!StrTabFragment) {
- StrTabFragment = Ctx.allocFragment<MCFragment>();
- OS.insert(StrTabFragment);
+ OS.newFragment();
+ StrTabFragment = OS.getCurrentFragment();
+ OS.newFragment();
}
OS.emitValueToAlignment(Align(4), 0);
@@ -603,7 +606,7 @@ void CodeViewContext::encodeInlineLineTable(const MCAssembler &Asm,
compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeLength, Buffer);
compressAnnotation(std::min(EndSymLength, LocAfterLength), Buffer);
- Frag.setContents(Buffer);
+ Frag.setVarContents(Buffer);
}
void CodeViewContext::encodeDefRange(const MCAssembler &Asm,
@@ -691,6 +694,6 @@ void CodeViewContext::encodeDefRange(const MCAssembler &Asm,
}
}
- Frag.setContents(Contents);
- Frag.setFixups(Fixups);
+ Frag.setVarContents(Contents);
+ Frag.setVarFixups(Fixups);
}
diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
index b1dced7..e7c0d37 100644
--- a/llvm/lib/MC/MCDwarf.cpp
+++ b/llvm/lib/MC/MCDwarf.cpp
@@ -447,10 +447,17 @@ static void emitOneV5FileEntry(MCStreamer *MCOS, const MCDwarfFile &DwarfFile,
StringRef(reinterpret_cast<const char *>(Cksum.data()), Cksum.size()));
}
if (HasAnySource) {
+ // From https://dwarfstd.org/issues/180201.1.html
+ // * The value is an empty null-terminated string if no source is available
+ StringRef Source = DwarfFile.Source.value_or(StringRef());
+ // * If the source is available but is an empty file then the value is a
+ // null-terminated single "\n".
+ if (DwarfFile.Source && DwarfFile.Source->empty())
+ Source = "\n";
if (LineStr)
- LineStr->emitRef(MCOS, DwarfFile.Source.value_or(StringRef()));
+ LineStr->emitRef(MCOS, Source);
else {
- MCOS->emitBytes(DwarfFile.Source.value_or(StringRef())); // Source and...
+ MCOS->emitBytes(Source); // Source and...
MCOS->emitBytes(StringRef("\0", 1)); // its null terminator.
}
}
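The MCDwarf change above implements the convention from dwarfstd.org issue 180201.1. A small sketch of the selection logic follows, with Source standing in for the optional source member used in the hunk; it is a sketch, not the emitter itself.

#include "llvm/ADT/StringRef.h"
#include <optional>

// String to emit for the source field of a DWARF v5 file entry.
static llvm::StringRef
sourceStringFor(const std::optional<llvm::StringRef> &Source) {
  if (!Source)
    return "";   // no source available: an empty string
  if (Source->empty())
    return "\n"; // source available but the file is empty: a single "\n"
  return *Source;
}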
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index 49071bd..b8cbaea5 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -88,7 +88,7 @@ void MCELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
if (SectionELF->getFlags() & ELF::SHF_GNU_RETAIN)
getWriter().markGnuAbi();
- changeSectionImpl(Section, Subsection);
+ MCObjectStreamer::changeSection(Section, Subsection);
Asm.registerSymbol(*Section->getBeginSymbol());
}
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index 22dff49..dbb2fd1 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -370,7 +370,6 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm,
}
int64_t Num;
- unsigned Count;
if (DF) {
Displacement += DF->getContents().size();
} else if (F->getKind() == MCFragment::FT_Relaxable &&
@@ -379,11 +378,9 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm,
// After layout, during relocation generation, it can be treated as a
// data fragment.
Displacement += F->getSize();
- } else if (auto *AF = dyn_cast<MCAlignFragment>(F);
- AF && Layout && AF->hasEmitNops() &&
- !Asm->getBackend().shouldInsertExtraNopBytesForCodeAlign(
- *AF, Count)) {
- Displacement += Asm->computeFragmentSize(*AF);
+ } else if (F->getKind() == MCFragment::FT_Align && Layout &&
+ F->isLinkerRelaxable()) {
+ Displacement += Asm->computeFragmentSize(*F);
} else if (auto *FF = dyn_cast<MCFillFragment>(F);
FF && FF->getNumValues().evaluateAsAbsolute(Num)) {
Displacement += Num * FF->getValueSize();
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index bfe045a..3c395e5 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -58,7 +58,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
case MCFragment::FT_SymbolId: OS << "SymbolId"; break;
case MCFragment::FT_CVInlineLines: OS << "CVInlineLineTable"; break;
case MCFragment::FT_CVDefRange: OS << "CVDefRangeTable"; break;
- case MCFragment::FT_PseudoProbe: OS << "PseudoProbe"; break;
// clang-format on
}
@@ -73,17 +72,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
};
switch (getKind()) {
- case MCFragment::FT_Align: {
- const auto *AF = cast<MCAlignFragment>(this);
- OS << " Align:" << AF->getAlignment().value() << " Fill:" << AF->getFill()
- << " FillLen:" << unsigned(AF->getFillLen())
- << " MaxBytesToEmit:" << AF->getMaxBytesToEmit();
- if (AF->hasEmitNops())
- OS << " Nops";
- break;
- }
case MCFragment::FT_Data:
case MCFragment::FT_Relaxable:
+ case MCFragment::FT_Align:
case MCFragment::FT_LEB:
case MCFragment::FT_Dwarf:
case MCFragment::FT_DwarfFrame: {
@@ -92,8 +83,13 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
auto Fixed = getContents();
auto Var = getVarContents();
OS << " Size:" << Fixed.size();
- if (getKind() != MCFragment::FT_Data)
+ if (getKind() != MCFragment::FT_Data) {
OS << '+' << Var.size();
+ // FT_Align uses getVarContents to track the size, but the content is
+ // ignored and not useful.
+ if (getKind() == MCFragment::FT_Align)
+ Var = {};
+ }
OS << " [";
for (unsigned i = 0, e = Fixed.size(); i != e; ++i) {
if (i) OS << ",";
@@ -112,6 +108,13 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
OS << ' ';
getInst().dump_pretty(OS);
break;
+ case MCFragment::FT_Align:
+ OS << "\n Align:" << getAlignment().value() << " Fill:" << getAlignFill()
+ << " FillLen:" << unsigned(getAlignFillLen())
+ << " MaxBytesToEmit:" << getAlignMaxBytesToEmit();
+ if (hasAlignEmitNops())
+ OS << " Nops";
+ break;
case MCFragment::FT_LEB: {
OS << " Value:";
getLEBValue().print(OS, nullptr);
@@ -182,12 +185,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
}
break;
}
- case MCFragment::FT_PseudoProbe: {
- const auto *OF = cast<MCPseudoProbeAddrFragment>(this);
- OS << " AddrDelta:";
- OF->getAddrDelta().print(OS, nullptr);
- break;
- }
}
}
#endif
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index 43598ef..7560399 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -132,8 +132,7 @@ public:
} // end anonymous namespace.
void MCMachOStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
- // Change the section normally.
- changeSectionImpl(Section, Subsection);
+ MCObjectStreamer::changeSection(Section, Subsection);
// Output a linker-local symbol so we don't need section-relative local
// relocations. The linker hates us when we do that.
@@ -161,7 +160,7 @@ void MCMachOStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
// We have to create a new fragment if this is an atom defining symbol,
// fragments cannot span atoms.
if (cast<MCSymbolMachO>(Symbol)->isSymbolLinkerVisible())
- insert(getContext().allocFragment<MCFragment>());
+ newFragment();
MCObjectStreamer::emitLabel(Symbol, Loc);
@@ -393,7 +392,7 @@ void MCMachOStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol,
// On darwin all virtual sections have zerofill type. Disallow the usage of
  // .zerofill in non-virtual sections. If something similar is needed, use
// .space or .zero.
- if (!Section->isVirtualSection()) {
+ if (!Section->isBssSection()) {
getContext().reportError(
Loc, "The usage of .zerofill is restricted to sections of "
"ZEROFILL type. Use .zero or .space instead.");
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index d5b8f22..42f4cf4 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -33,6 +33,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context,
Context, std::move(TAB), std::move(Emitter), std::move(OW))),
EmitEHFrame(true), EmitDebugFrame(false) {
assert(Assembler->getBackendPtr() && Assembler->getEmitterPtr());
+ IsObj = true;
setAllowAutoPadding(Assembler->getBackend().allowAutoPadding());
if (Context.getTargetOptions() && Context.getTargetOptions()->MCRelaxAll)
Assembler->setRelaxAll(true);
@@ -46,6 +47,25 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() {
return nullptr;
}
+void MCObjectStreamer::newFragment() {
+ addFragment(getContext().allocFragment<MCFragment>());
+}
+
+void MCObjectStreamer::insert(MCFragment *F) {
+ assert(F->getKind() != MCFragment::FT_Data &&
+ "F should have a variable-size tail");
+ addFragment(F);
+ newFragment();
+}
+
+void MCObjectStreamer::appendContents(size_t Num, char Elt) {
+ CurFrag->appendContents(Num, Elt);
+}
+
+void MCObjectStreamer::addFixup(const MCExpr *Value, MCFixupKind Kind) {
+ CurFrag->addFixup(MCFixup::create(CurFrag->getFixedSize(), Value, Kind));
+}
+
// As a compile-time optimization, avoid allocating and evaluating an MCExpr
// tree for (Hi - Lo) when Hi and Lo are offsets into the same fragment's fixed
// part.
@@ -106,18 +126,6 @@ void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) {
MCDwarfFrameEmitter::Emit(*this, MAB, false);
}
-MCFragment *MCObjectStreamer::getOrCreateDataFragment() {
- // TODO: Start a new fragment whenever finalizing the variable-size tail of a
- // previous one, so that all getOrCreateDataFragment calls can be replaced
- // with getCurrentFragment
- auto *F = getCurrentFragment();
- if (F->getKind() != MCFragment::FT_Data) {
- F = getContext().allocFragment<MCFragment>();
- insert(F);
- }
- return F;
-}
-
void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) {
Assembler->registerSymbol(Sym);
}
@@ -131,7 +139,7 @@ void MCObjectStreamer::emitCFISections(bool EH, bool Debug) {
void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
SMLoc Loc) {
MCStreamer::emitValueImpl(Value, Size, Loc);
- MCFragment *DF = getOrCreateDataFragment();
+ MCFragment *DF = getCurrentFragment();
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
@@ -180,7 +188,7 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
// If there is a current fragment, mark the symbol as pointing into it.
// Otherwise queue the label and set its fragment pointer when we emit the
// next fragment.
- MCFragment *F = getOrCreateDataFragment();
+ MCFragment *F = getCurrentFragment();
Symbol->setFragment(F);
Symbol->setOffset(F->getContents().size());
@@ -214,10 +222,9 @@ void MCObjectStreamer::emitULEB128Value(const MCExpr *Value) {
emitULEB128IntValue(IntValue);
return;
}
- auto *F = getOrCreateDataFragment();
- F->Kind = MCFragment::FT_LEB;
- F->setLEBSigned(false);
- F->setLEBValue(Value);
+ auto *F = getCurrentFragment();
+ F->makeLEB(false, Value);
+ newFragment();
}
void MCObjectStreamer::emitSLEB128Value(const MCExpr *Value) {
@@ -226,10 +233,9 @@ void MCObjectStreamer::emitSLEB128Value(const MCExpr *Value) {
emitSLEB128IntValue(IntValue);
return;
}
- auto *F = getOrCreateDataFragment();
- F->Kind = MCFragment::FT_LEB;
- F->setLEBSigned(true);
- F->setLEBValue(Value);
+ auto *F = getCurrentFragment();
+ F->makeLEB(true, Value);
+ newFragment();
}
void MCObjectStreamer::emitWeakReference(MCSymbol *Alias,
@@ -238,11 +244,6 @@ void MCObjectStreamer::emitWeakReference(MCSymbol *Alias,
}
void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
- changeSectionImpl(Section, Subsection);
-}
-
-bool MCObjectStreamer::changeSectionImpl(MCSection *Section,
- uint32_t Subsection) {
assert(Section && "Cannot switch to a null section!");
getContext().clearDwarfLocSeen();
@@ -261,7 +262,7 @@ bool MCObjectStreamer::changeSectionImpl(MCSection *Section,
Section->CurFragList = &Subsections[I].second;
CurFrag = Section->CurFragList->Tail;
- return getAssembler().registerSection(*Section);
+ getAssembler().registerSection(*Section);
}
void MCObjectStreamer::switchSectionNoPrint(MCSection *Section) {
@@ -293,18 +294,6 @@ bool MCObjectStreamer::mayHaveInstructions(MCSection &Sec) const {
void MCObjectStreamer::emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) {
- const MCSection &Sec = *getCurrentSectionOnly();
- if (Sec.isVirtualSection()) {
- getContext().reportError(Inst.getLoc(), Twine(Sec.getVirtualSectionKind()) +
- " section '" + Sec.getName() +
- "' cannot have instructions");
- return;
- }
- emitInstructionImpl(Inst, STI);
-}
-
-void MCObjectStreamer::emitInstructionImpl(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
MCStreamer::emitInstruction(Inst, STI);
MCSection *Sec = getCurrentSectionOnly();
@@ -338,7 +327,7 @@ void MCObjectStreamer::emitInstructionImpl(const MCInst &Inst,
void MCObjectStreamer::emitInstToData(const MCInst &Inst,
const MCSubtargetInfo &STI) {
- MCFragment *F = getOrCreateDataFragment();
+ MCFragment *F = getCurrentFragment();
// Append the instruction to the data fragment.
size_t FixupStartIndex = F->getFixups().size();
@@ -370,7 +359,7 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst,
void MCObjectStreamer::emitInstToFragment(const MCInst &Inst,
const MCSubtargetInfo &STI) {
- auto *F = getOrCreateDataFragment();
+ auto *F = getCurrentFragment();
SmallVector<char, 16> Data;
SmallVector<MCFixup, 1> Fixups;
getAssembler().getEmitter().encodeInstruction(Inst, Data, Fixups, STI);
@@ -381,6 +370,7 @@ void MCObjectStreamer::emitInstToFragment(const MCInst &Inst,
F->setVarContents(Data);
F->setVarFixups(Fixups);
F->setInst(Inst);
+ newFragment();
}
void MCObjectStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
@@ -442,10 +432,11 @@ void MCObjectStreamer::emitDwarfAdvanceLineAddr(int64_t LineDelta,
return;
}
- auto *F = getOrCreateDataFragment();
+ auto *F = getCurrentFragment();
F->Kind = MCFragment::FT_Dwarf;
F->setDwarfAddrDelta(buildSymbolDiff(*this, Label, LastLabel, SMLoc()));
F->setDwarfLineDelta(LineDelta);
+ newFragment();
}
void MCObjectStreamer::emitDwarfLineEndEntry(MCSection *Section,
@@ -473,9 +464,10 @@ void MCObjectStreamer::emitDwarfLineEndEntry(MCSection *Section,
void MCObjectStreamer::emitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
const MCSymbol *Label,
SMLoc Loc) {
- auto *F = getOrCreateDataFragment();
+ auto *F = getCurrentFragment();
F->Kind = MCFragment::FT_DwarfFrame;
F->setDwarfAddrDelta(buildSymbolDiff(*this, Label, LastLabel, Loc));
+ newFragment();
}
void MCObjectStreamer::emitCVLocDirective(unsigned FunctionId, unsigned FileNo,
@@ -534,7 +526,7 @@ void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) {
void MCObjectStreamer::emitBytes(StringRef Data) {
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
- MCFragment *DF = getOrCreateDataFragment();
+ MCFragment *DF = getCurrentFragment();
DF->appendContents(ArrayRef(Data.data(), Data.size()));
}
@@ -543,28 +535,21 @@ void MCObjectStreamer::emitValueToAlignment(Align Alignment, int64_t Fill,
unsigned MaxBytesToEmit) {
if (MaxBytesToEmit == 0)
MaxBytesToEmit = Alignment.value();
- insert(getContext().allocFragment<MCAlignFragment>(Alignment, Fill, FillLen,
- MaxBytesToEmit));
+ MCFragment *F = getCurrentFragment();
+ F->makeAlign(Alignment, Fill, FillLen, MaxBytesToEmit);
+ newFragment();
// Update the maximum alignment on the current section if necessary.
- MCSection *CurSec = getCurrentSectionOnly();
- CurSec->ensureMinAlignment(Alignment);
+ F->getParent()->ensureMinAlignment(Alignment);
}
void MCObjectStreamer::emitCodeAlignment(Align Alignment,
const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit) {
+ auto *F = getCurrentFragment();
emitValueToAlignment(Alignment, 0, 1, MaxBytesToEmit);
- auto *F = cast<MCAlignFragment>(getCurrentFragment());
- F->setEmitNops(true, STI);
- // With RISC-V style linker relaxation, mark the section as linker-relaxable
- // if the alignment is larger than the minimum NOP size.
- unsigned Size;
- if (getAssembler().getBackend().shouldInsertExtraNopBytesForCodeAlign(*F,
- Size)) {
- getCurrentSectionOnly()->setLinkerRelaxable();
- newFragment();
- }
+ F->u.align.EmitNops = true;
+ F->STI = STI;
}
void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset,
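The new MCObjectStreamer::addFixup and appendContents helpers above are what the later MCWin64EH and MCWinCOFFStreamer hunks build on. A hedged usage sketch follows; the emitSecRel16 wrapper and the FK_SecRel_2 kind are illustrative, not part of the patch.

#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCObjectStreamer.h"

// Record a fixup at the current end of the fragment's fixed part, then
// reserve the placeholder bytes that the relocation will later fill in.
static void emitSecRel16(llvm::MCObjectStreamer &OS,
                         const llvm::MCSymbol *Sym) {
  const llvm::MCExpr *Ref = llvm::MCSymbolRefExpr::create(Sym, OS.getContext());
  OS.addFixup(Ref, llvm::FK_SecRel_2); // offset = CurFrag->getFixedSize()
  OS.appendContents(2, 0);             // two zero bytes backing the fixup
}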
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 77bf843..d0b6ea4 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -3404,11 +3404,10 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, uint8_t ValueSize) {
const MCSection *Section = getStreamer().getCurrentSectionOnly();
assert(Section && "must have section to emit alignment");
- if (HasFillExpr && FillExpr != 0 && Section->isVirtualSection()) {
+ if (HasFillExpr && FillExpr != 0 && Section->isBssSection()) {
ReturnVal |=
- Warning(FillExprLoc, "ignoring non-zero fill value in " +
- Section->getVirtualSectionKind() +
- " section '" + Section->getName() + "'");
+ Warning(FillExprLoc, "ignoring non-zero fill value in BSS section '" +
+ Section->getName() + "'");
FillExpr = 0;
}
diff --git a/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp b/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
index 7f09349..d7b0546 100644
--- a/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
@@ -8,8 +8,8 @@
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCRegister.h"
-#include "llvm/MC/MCStreamer.h"
using namespace llvm;
@@ -25,8 +25,9 @@ MCSubtargetInfo &MCTargetAsmParser::copySTI() {
STI = &STICopy;
// The returned STI will likely be modified. Create a new fragment to prevent
// mixing STI values within a fragment.
- if (getStreamer().getCurrentFragment())
- getStreamer().newFragment();
+ auto &S = getStreamer();
+ if (S.isObj() && S.getCurrentFragment())
+ static_cast<MCObjectStreamer &>(S).newFragment();
return STICopy;
}
diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp
index f87d27f..b493337 100644
--- a/llvm/lib/MC/MCPseudoProbe.cpp
+++ b/llvm/lib/MC/MCPseudoProbe.cpp
@@ -81,8 +81,9 @@ void MCPseudoProbe::emit(MCObjectStreamer *MCOS,
if (AddrDelta->evaluateAsAbsolute(Delta, MCOS->getAssemblerPtr())) {
MCOS->emitSLEB128IntValue(Delta);
} else {
- MCOS->insert(MCOS->getContext().allocFragment<MCPseudoProbeAddrFragment>(
- AddrDelta));
+ auto *F = MCOS->getCurrentFragment();
+ F->makeLEB(true, AddrDelta);
+ MCOS->newFragment();
}
} else {
// Emit the GUID of the split function that the sentinel probe represents.
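Since the pseudo-probe address delta above now goes through the generic FT_LEB fragment in signed mode, here is a sketch of the signed LEB128 encoding that the fragment resolves to after layout; it is equivalent in spirit to llvm::encodeSLEB128 without the padding argument.

#include "llvm/ADT/SmallVector.h"
#include <cstdint>

static void encodeSLEB(int64_t Value, llvm::SmallVectorImpl<char> &Out) {
  bool More;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7; // arithmetic shift preserves the sign
    More = !((Value == 0 && !(Byte & 0x40)) ||
             (Value == -1 && (Byte & 0x40)));
    if (More)
      Byte |= 0x80; // continuation bit
    Out.push_back(static_cast<char>(Byte));
  } while (More);
}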
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 9367145..023f7f2 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -18,10 +18,10 @@
using namespace llvm;
-MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText,
- bool IsVirtual, MCSymbol *Begin)
+MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText, bool IsBss,
+ MCSymbol *Begin)
: Begin(Begin), HasInstructions(false), IsRegistered(false), IsText(IsText),
- IsVirtual(IsVirtual), LinkerRelaxable(false), Name(Name), Variant(V) {
+ IsBss(IsBss), LinkerRelaxable(false), Name(Name), Variant(V) {
// The initial subsection number is 0. Create a fragment list.
CurFragList = &Subsections.emplace_back(0u, FragList{}).second;
}
@@ -34,8 +34,6 @@ MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) {
bool MCSection::hasEnded() const { return End && End->isInSection(); }
-StringRef MCSection::getVirtualSectionKind() const { return "virtual"; }
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCSection::dump(
DenseMap<const MCFragment *, SmallVector<const MCSymbol *, 0>> *FragToSyms)
@@ -60,16 +58,6 @@ LLVM_DUMP_METHOD void MCSection::dump(
}
#endif
-void MCFragment::setContents(ArrayRef<char> Contents) {
- auto &S = getParent()->ContentStorage;
- if (ContentStart + Contents.size() > ContentEnd) {
- ContentStart = S.size();
- S.resize_for_overwrite(S.size() + Contents.size());
- }
- ContentEnd = ContentStart + Contents.size();
- llvm::copy(Contents, S.begin() + ContentStart);
-}
-
void MCFragment::setVarContents(ArrayRef<char> Contents) {
auto &S = getParent()->ContentStorage;
if (VarContentStart + Contents.size() > VarContentEnd) {
@@ -96,16 +84,6 @@ void MCFragment::appendFixups(ArrayRef<MCFixup> Fixups) {
FixupEnd = S.size();
}
-void MCFragment::setFixups(ArrayRef<MCFixup> Fixups) {
- auto &S = getParent()->FixupStorage;
- if (FixupStart + Fixups.size() > FixupEnd) {
- FixupStart = S.size();
- S.resize_for_overwrite(S.size() + Fixups.size());
- }
- FixupEnd = FixupStart + Fixups.size();
- llvm::copy(Fixups, S.begin() + FixupStart);
-}
-
void MCFragment::setVarFixups(ArrayRef<MCFixup> Fixups) {
auto &S = getParent()->FixupStorage;
if (VarFixupStart + Fixups.size() > VarFixupEnd) {
diff --git a/llvm/lib/MC/MCSectionCOFF.cpp b/llvm/lib/MC/MCSectionCOFF.cpp
index 94e29ce..5bf1473 100644
--- a/llvm/lib/MC/MCSectionCOFF.cpp
+++ b/llvm/lib/MC/MCSectionCOFF.cpp
@@ -115,7 +115,3 @@ void MCSectionCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
}
bool MCSectionCOFF::useCodeAlign() const { return isText(); }
-
-StringRef MCSectionCOFF::getVirtualSectionKind() const {
- return "IMAGE_SCN_CNT_UNINITIALIZED_DATA";
-}
diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp
index 299fe40..ef33f9c 100644
--- a/llvm/lib/MC/MCSectionELF.cpp
+++ b/llvm/lib/MC/MCSectionELF.cpp
@@ -215,5 +215,3 @@ void MCSectionELF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
bool MCSectionELF::useCodeAlign() const {
return getFlags() & ELF::SHF_EXECINSTR;
}
-
-StringRef MCSectionELF::getVirtualSectionKind() const { return "SHT_NOBITS"; }
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index c3ecf8f..30198c9 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -1404,7 +1404,7 @@ MCSymbol *MCStreamer::endSection(MCSection *Section) {
return Sym;
}
-void MCStreamer::insert(MCFragment *F) {
+void MCStreamer::addFragment(MCFragment *F) {
auto *Sec = CurFrag->getParent();
F->setParent(Sec);
F->setLayoutOrder(CurFrag->getLayoutOrder() + 1);
@@ -1413,10 +1413,6 @@ void MCStreamer::insert(MCFragment *F) {
Sec->curFragList()->Tail = F;
}
-void MCStreamer::newFragment() {
- insert(getContext().allocFragment<MCFragment>());
-}
-
static VersionTuple
targetVersionOrMinimumSupportedOSVersion(const Triple &Target,
VersionTuple TargetVersion) {
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index e8b26bf..72a8dd7 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -318,15 +318,13 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
// Emit the epilog instructions.
if (EnableUnwindV2) {
- MCFragment *DF = OS->getOrCreateDataFragment();
-
bool IsLast = true;
for (const auto &Epilog : llvm::reverse(info->EpilogMap)) {
if (IsLast) {
IsLast = false;
uint8_t Flags = LastEpilogIsAtEnd ? 0x01 : 0;
- streamer.emitInt8(EpilogSize);
- streamer.emitInt8((Flags << 4) | Win64EH::UOP_Epilog);
+ OS->emitInt8(EpilogSize);
+ OS->emitInt8((Flags << 4) | Win64EH::UOP_Epilog);
if (LastEpilogIsAtEnd)
continue;
@@ -337,9 +335,8 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
// layout has been completed.
auto *MCE = MCUnwindV2EpilogTargetExpr::create(*info, Epilog.second,
EpilogSize, context);
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_2);
- DF->addFixup(Fixup);
- DF->appendContents(2, 0);
+ OS->addFixup(MCE, FK_Data_2);
+ OS->appendContents(2, 0);
}
}
if (AddPaddingEpilogCode)
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 3398775..9369bea 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -153,7 +153,7 @@ void MCWinCOFFStreamer::initSections(bool NoExecStack,
}
void MCWinCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
- changeSectionImpl(Section, Subsection);
+ MCObjectStreamer::changeSection(Section, Subsection);
// Ensure that the first and the second symbols relative to the section are
// the section symbol and the COMDAT symbol.
getAssembler().registerSymbol(*Section->getBeginSymbol());
@@ -278,35 +278,28 @@ void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) {
void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) {
visitUsedSymbol(*Symbol);
- MCFragment *DF = getOrCreateDataFragment();
const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, FK_SecRel_2);
- DF->addFixup(Fixup);
- DF->appendContents(2, 0);
+ addFixup(SRE, FK_SecRel_2);
+ appendContents(2, 0);
}
void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol,
uint64_t Offset) {
visitUsedSymbol(*Symbol);
- MCFragment *DF = getOrCreateDataFragment();
// Create Symbol A for the relocation relative reference.
const MCExpr *MCE = MCSymbolRefExpr::create(Symbol, getContext());
// Add the constant offset, if given.
if (Offset)
MCE = MCBinaryExpr::createAdd(
MCE, MCConstantExpr::create(Offset, getContext()), getContext());
- // Build the secrel32 relocation.
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_SecRel_4);
- // Record the relocation.
- DF->addFixup(Fixup);
+ addFixup(MCE, FK_SecRel_4);
// Emit 4 bytes (zeros) to the object file.
- DF->appendContents(4, 0);
+ appendContents(4, 0);
}
void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol,
int64_t Offset) {
visitUsedSymbol(*Symbol);
- MCFragment *DF = getOrCreateDataFragment();
// Create Symbol A for the relocation relative reference.
const MCExpr *MCE = MCSymbolRefExpr::create(
Symbol, MCSymbolRefExpr::VK_COFF_IMGREL32, getContext());
@@ -314,40 +307,29 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol,
if (Offset)
MCE = MCBinaryExpr::createAdd(
MCE, MCConstantExpr::create(Offset, getContext()), getContext());
- // Build the imgrel relocation.
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4);
- // Record the relocation.
- DF->addFixup(Fixup);
+ addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
- DF->appendContents(4, 0);
+ appendContents(4, 0);
}
void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {
visitUsedSymbol(*Symbol);
- MCFragment *DF = getOrCreateDataFragment();
// Create Symbol for section number.
const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create(
*Symbol, this->getWriter(), getContext());
- // Build the relocation.
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4);
- // Record the relocation.
- DF->addFixup(Fixup);
+ addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
- DF->appendContents(4, 0);
+ appendContents(4, 0);
}
void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {
visitUsedSymbol(*Symbol);
- MCFragment *DF = getOrCreateDataFragment();
// Create Symbol for section offset.
const MCExpr *MCE =
MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext());
- // Build the relocation.
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4);
- // Record the relocation.
- DF->addFixup(Fixup);
+ addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
- DF->appendContents(4, 0);
+ appendContents(4, 0);
}
void MCWinCOFFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index 4d45296..63381b4 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -89,7 +89,7 @@ void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility(
void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) {
// Add a Fixup here to later record a relocation of type R_REF to prevent the
// ref symbol from being garbage collected (by the binder).
- MCFragment *DF = getOrCreateDataFragment();
+ MCFragment *DF = getCurrentFragment();
const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
std::optional<MCFixupKind> MaybeKind =
getAssembler().getBackend().getFixupKind("R_REF");
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 3291dd7..48d2fc6 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -131,7 +131,7 @@ uint64_t MachObjectWriter::getPaddingSize(const MCAssembler &Asm,
return 0;
const MCSection &NextSec = *SectionOrder[Next];
- if (NextSec.isVirtualSection())
+ if (NextSec.isBssSection())
return 0;
return offsetToAlignment(EndAddr, NextSec.getAlign());
}
@@ -267,7 +267,7 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm,
const MCSectionMachO &Section = cast<MCSectionMachO>(Sec);
// The offset is unused for virtual sections.
- if (Section.isVirtualSection()) {
+ if (Section.isBssSection()) {
assert(Asm.getSectionFileSize(Sec) == 0 && "Invalid file size!");
FileOffset = 0;
}
@@ -682,13 +682,13 @@ void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm) {
unsigned i = 0;
// Compute the section layout order. Virtual sections must go last.
for (MCSection &Sec : Asm) {
- if (!Sec.isVirtualSection()) {
+ if (!Sec.isBssSection()) {
SectionOrder.push_back(&Sec);
cast<MCSectionMachO>(Sec).setLayoutOrder(i++);
}
}
for (MCSection &Sec : Asm) {
- if (Sec.isVirtualSection()) {
+ if (Sec.isBssSection()) {
SectionOrder.push_back(&Sec);
cast<MCSectionMachO>(Sec).setLayoutOrder(i++);
}
@@ -797,11 +797,8 @@ uint64_t MachObjectWriter::writeObject() {
UndefinedSymbolData);
if (!CGProfile.empty()) {
- MCSection *CGProfileSection = getContext().getMachOSection(
- "__LLVM", "__cg_profile", 0, SectionKind::getMetadata());
- auto &Frag = *CGProfileSection->begin();
- Frag.clearContents();
- raw_svector_ostream OS(Frag.getContentsForAppending());
+ SmallString<0> Content;
+ raw_svector_ostream OS(Content);
for (const MCObjectWriter::CGProfileEntry &CGPE : CGProfile) {
uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
@@ -809,7 +806,9 @@ uint64_t MachObjectWriter::writeObject() {
support::endian::write(OS, ToIndex, W.Endian);
support::endian::write(OS, CGPE.Count, W.Endian);
}
- Frag.doneAppending();
+ MCSection *Sec = getContext().getMachOSection("__LLVM", "__cg_profile", 0,
+ SectionKind::getMetadata());
+ llvm::copy(OS.str(), Sec->curFragList()->Head->getContents().data());
}
unsigned NumSections = Asm.end() - Asm.begin();
@@ -883,7 +882,7 @@ uint64_t MachObjectWriter::writeObject() {
VMSize = std::max(VMSize, Address + Size);
- if (Sec.isVirtualSection())
+ if (Sec.isBssSection())
continue;
SectionDataSize = std::max(SectionDataSize, Address + Size);
@@ -915,7 +914,7 @@ uint64_t MachObjectWriter::writeObject() {
unsigned Flags = Sec.getTypeAndAttributes();
if (Sec.hasInstructions())
Flags |= MachO::S_ATTR_SOME_INSTRUCTIONS;
- if (!cast<MCSectionMachO>(Sec).isVirtualSection() &&
+ if (!cast<MCSectionMachO>(Sec).isBssSection() &&
!isUInt<32>(SectionStart)) {
getContext().reportError(
SMLoc(), "cannot encode offset of section; object file too large");
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index 7af240a..3b99af4 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -696,14 +696,15 @@ static void addData(SmallVectorImpl<char> &DataBytes,
if (Frag.hasInstructions())
report_fatal_error("only data supported in data sections");
- if (auto *Align = dyn_cast<MCAlignFragment>(&Frag)) {
- if (Align->getFillLen() != 1)
+ llvm::append_range(DataBytes, Frag.getContents());
+ if (Frag.getKind() == MCFragment::FT_Align) {
+ if (Frag.getAlignFillLen() != 1)
report_fatal_error("only byte values supported for alignment");
// If nops are requested, use zeros, as this is the data section.
- uint8_t Value = Align->hasEmitNops() ? 0 : Align->getFill();
+ uint8_t Value = Frag.hasAlignEmitNops() ? 0 : Frag.getAlignFill();
uint64_t Size =
- std::min<uint64_t>(alignTo(DataBytes.size(), Align->getAlignment()),
- DataBytes.size() + Align->getMaxBytesToEmit());
+ std::min<uint64_t>(alignTo(DataBytes.size(), Frag.getAlignment()),
+ DataBytes.size() + Frag.getAlignMaxBytesToEmit());
DataBytes.resize(Size, Value);
} else if (auto *Fill = dyn_cast<MCFillFragment>(&Frag)) {
int64_t NumValues;
@@ -711,12 +712,10 @@ static void addData(SmallVectorImpl<char> &DataBytes,
llvm_unreachable("The fill should be an assembler constant");
DataBytes.insert(DataBytes.end(), Fill->getValueSize() * NumValues,
Fill->getValue());
+ } else if (Frag.getKind() == MCFragment::FT_LEB) {
+ llvm::append_range(DataBytes, Frag.getVarContents());
} else {
- llvm::append_range(DataBytes, Frag.getContents());
- if (Frag.getKind() == MCFragment::FT_LEB)
- llvm::append_range(DataBytes, Frag.getVarContents());
- else
- assert(Frag.getKind() == MCFragment::FT_Data);
+ assert(Frag.getKind() == MCFragment::FT_Data);
}
}
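A minimal sketch of the data-section alignment handling in addData above, using std::vector<char> in place of the DataBytes SmallVector; as in the hunk, zero bytes stand in for nops when the fragment asked for them.

#include <algorithm>
#include <cstdint>
#include <vector>

static void padDataForAlignment(std::vector<char> &Bytes, uint64_t Alignment,
                                uint64_t MaxBytesToEmit, char Fill) {
  uint64_t Aligned = (Bytes.size() + Alignment - 1) / Alignment * Alignment;
  uint64_t Target = std::min<uint64_t>(Aligned, Bytes.size() + MaxBytesToEmit);
  Bytes.resize(Target, Fill); // no-op if the buffer is already aligned
}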
@@ -1858,23 +1857,9 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm,
auto IT = WS.begin();
if (IT == WS.end())
continue;
- const MCFragment &EmptyFrag = *IT;
- if (EmptyFrag.getKind() != MCFragment::FT_Data)
- report_fatal_error(".init_array section should be aligned");
-
- const MCFragment *nextFrag = EmptyFrag.getNext();
- while (nextFrag != nullptr) {
- const MCFragment &AlignFrag = *nextFrag;
- if (AlignFrag.getKind() != MCFragment::FT_Align)
- report_fatal_error(".init_array section should be aligned");
- if (cast<MCAlignFragment>(AlignFrag).getAlignment() !=
- Align(is64Bit() ? 8 : 4))
- report_fatal_error(
- ".init_array section should be aligned for pointers");
-
- const MCFragment &Frag = *AlignFrag.getNext();
- nextFrag = Frag.getNext();
- if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data)
+ for (auto *Frag = &*IT; Frag; Frag = Frag->getNext()) {
+ if (Frag->hasInstructions() || (Frag->getKind() != MCFragment::FT_Align &&
+ Frag->getKind() != MCFragment::FT_Data))
report_fatal_error("only data supported in .init_array section");
uint16_t Priority = UINT16_MAX;
@@ -1886,9 +1871,8 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm,
if (WS.getName().substr(PrefixLength + 1).getAsInteger(10, Priority))
report_fatal_error("invalid .init_array section priority");
}
- const auto &DataFrag = Frag;
- assert(llvm::all_of(DataFrag.getContents(), [](char C) { return !C; }));
- for (const MCFixup &Fixup : DataFrag.getFixups()) {
+ assert(llvm::all_of(Frag->getContents(), [](char C) { return !C; }));
+ for (const MCFixup &Fixup : Frag->getFixups()) {
assert(Fixup.getKind() ==
MCFixup::getDataKindForSize(is64Bit() ? 8 : 4));
const MCExpr *Expr = Fixup.getValue();
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index ee4d957..6ad4334 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -179,7 +179,7 @@ private:
void SetSymbolName(COFFSymbol &S);
void SetSectionName(COFFSection &S);
- bool IsPhysicalSection(COFFSection *S);
+ bool isUninitializedData(const COFFSection &S);
// Entity writing methods.
void WriteFileHeader(const COFF::header &Header);
@@ -453,8 +453,8 @@ void WinCOFFWriter::SetSymbolName(COFFSymbol &S) {
std::memcpy(S.Data.Name, S.Name.c_str(), S.Name.size());
}
-bool WinCOFFWriter::IsPhysicalSection(COFFSection *S) {
- return (S->Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) ==
+bool WinCOFFWriter::isUninitializedData(const COFFSection &S) {
+ return (S.Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) !=
0;
}
@@ -606,6 +606,9 @@ void WinCOFFWriter::writeSection(const COFFSection &Sec) {
assert(AuxSyms.size() == 1 && AuxSyms[0].AuxType == ATSectionDefinition);
AuxSymbol &SecDef = AuxSyms[0];
SecDef.Aux.SectionDefinition.CheckSum = CRC;
+ } else if (isUninitializedData(Sec)) {
+ // Error if fixups or non-zero bytes are present.
+ writeSectionContents(*Sec.MCSection);
}
// Write relocations for this section.
@@ -745,7 +748,7 @@ void WinCOFFWriter::assignFileOffsets() {
Sec->Header.SizeOfRawData = Asm->getSectionAddressSize(Section);
- if (IsPhysicalSection(Sec)) {
+ if (!isUninitializedData(*Sec)) {
Sec->Header.PointerToRawData = Offset;
Offset += Sec->Header.SizeOfRawData;
}
@@ -1067,10 +1070,8 @@ uint64_t WinCOFFWriter::writeObject() {
// Create the contents of the .llvm_addrsig section.
if (Mode != DwoOnly && OWriter.getEmitAddrsigSection()) {
- auto *Sec = getContext().getCOFFSection(".llvm_addrsig",
- COFF::IMAGE_SCN_LNK_REMOVE);
- auto *Frag = Sec->curFragList()->Head;
- raw_svector_ostream OS(Frag->getContentsForAppending());
+ SmallString<0> Content;
+ raw_svector_ostream OS(Content);
for (const MCSymbol *S : OWriter.AddrsigSyms) {
if (!S->isRegistered())
continue;
@@ -1085,15 +1086,15 @@ uint64_t WinCOFFWriter::writeObject() {
"executePostLayoutBinding!");
encodeULEB128(SectionMap[TargetSection]->Symbol->getIndex(), OS);
}
- Frag->doneAppending();
+ auto *Sec = getContext().getCOFFSection(".llvm_addrsig",
+ COFF::IMAGE_SCN_LNK_REMOVE);
+ Sec->curFragList()->Tail->setVarContents(OS.str());
}
// Create the contents of the .llvm.call-graph-profile section.
if (Mode != DwoOnly && !OWriter.getCGProfile().empty()) {
- auto *Sec = getContext().getCOFFSection(".llvm.call-graph-profile",
- COFF::IMAGE_SCN_LNK_REMOVE);
- auto *Frag = Sec->curFragList()->Head;
- raw_svector_ostream OS(Frag->getContentsForAppending());
+ SmallString<0> Content;
+ raw_svector_ostream OS(Content);
for (const auto &CGPE : OWriter.getCGProfile()) {
uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
@@ -1101,7 +1102,9 @@ uint64_t WinCOFFWriter::writeObject() {
support::endian::write(OS, ToIndex, W.Endian);
support::endian::write(OS, CGPE.Count, W.Endian);
}
- Frag->doneAppending();
+ auto *Sec = getContext().getCOFFSection(".llvm.call-graph-profile",
+ COFF::IMAGE_SCN_LNK_REMOVE);
+ Sec->curFragList()->Tail->setVarContents(OS.str());
}
assignFileOffsets();
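Both the Mach-O and COFF writers above now build the call-graph-profile payload in a local buffer before attaching it to the section's fragment. A sketch of one record with little-endian output for illustration; the 32-bit index and 64-bit count widths follow the upstream CGProfileEntry fields.

#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

static void appendCGProfileEntry(llvm::raw_ostream &OS, uint32_t FromIndex,
                                 uint32_t ToIndex, uint64_t Count) {
  using namespace llvm::support;
  endian::write(OS, FromIndex, llvm::endianness::little); // caller symbol index
  endian::write(OS, ToIndex, llvm::endianness::little);   // callee symbol index
  endian::write(OS, Count, llvm::endianness::little);     // call count
}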
diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h
index 8f9444f..86c6b12 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObject.h
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.h
@@ -64,14 +64,14 @@ struct Section {
return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE);
}
- bool isVirtualSection() const {
+ bool isBssSection() const {
return (getType() == MachO::S_ZEROFILL ||
getType() == MachO::S_GB_ZEROFILL ||
getType() == MachO::S_THREAD_LOCAL_ZEROFILL);
}
bool hasValidOffset() const {
- return !(isVirtualSection() || OriginalOffset == 0);
+ return !(isBssSection() || OriginalOffset == 0);
}
};
diff --git a/llvm/lib/ObjCopy/MachO/MachOWriter.cpp b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp
index 7c24d12..89c1df8 100644
--- a/llvm/lib/ObjCopy/MachO/MachOWriter.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp
@@ -112,7 +112,7 @@ size_t MachOWriter::totalSize() const {
for (const std::unique_ptr<Section> &S : LC.Sections) {
if (!S->hasValidOffset()) {
assert((S->Offset == 0) && "Skipped section's offset must be zero");
- assert((S->isVirtualSection() || S->Size == 0) &&
+ assert((S->isBssSection() || S->Size == 0) &&
"Non-zero-fill sections with zero offset must have zero size");
continue;
}
@@ -240,7 +240,7 @@ void MachOWriter::writeSections() {
for (const std::unique_ptr<Section> &Sec : LC.Sections) {
if (!Sec->hasValidOffset()) {
assert((Sec->Offset == 0) && "Skipped section's offset must be zero");
- assert((Sec->isVirtualSection() || Sec->Size == 0) &&
+ assert((Sec->isBssSection() || Sec->Size == 0) &&
"Non-zero-fill sections with zero offset must have zero size");
continue;
}
diff --git a/llvm/lib/Object/CMakeLists.txt b/llvm/lib/Object/CMakeLists.txt
index 870169a..0f6d2f7 100644
--- a/llvm/lib/Object/CMakeLists.txt
+++ b/llvm/lib/Object/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_component_library(LLVMObject
OffloadBundle.cpp
RecordStreamer.cpp
RelocationResolver.cpp
+ SFrameParser.cpp
SymbolicFile.cpp
SymbolSize.cpp
TapiFile.cpp
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index 5597d7d..0919c6a 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -620,7 +620,9 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
StringRef ELFObjectFileBase::getNVPTXCPUName() const {
assert(getEMachine() == ELF::EM_CUDA);
- unsigned SM = getPlatformFlags() & ELF::EF_CUDA_SM;
+ unsigned SM = getEIdentABIVersion() == ELF::ELFABIVERSION_CUDA_V1
+ ? getPlatformFlags() & ELF::EF_CUDA_SM
+ : getPlatformFlags() & ELF::EF_CUDA_SM_MASK;
switch (SM) {
// Fermi architecture.
@@ -679,7 +681,18 @@ StringRef ELFObjectFileBase::getNVPTXCPUName() const {
// Hopper architecture.
case ELF::EF_CUDA_SM90:
- return getPlatformFlags() & ELF::EF_CUDA_ACCELERATORS ? "sm_90a" : "sm_90";
+ return getPlatformFlags() & ELF::EF_CUDA_ACCELERATORS_V1 ? "sm_90a"
+ : "sm_90";
+
+ // Blackwell architecture.
+ case ELF::EF_CUDA_SM100:
+ return getPlatformFlags() & ELF::EF_CUDA_ACCELERATORS ? "sm_100a"
+ : "sm_100";
+
+ // Rubin architecture.
+ case ELF::EF_CUDA_SM120:
+ return getPlatformFlags() & ELF::EF_CUDA_ACCELERATORS ? "sm_120a"
+ : "sm_120";
default:
llvm_unreachable("Unknown EF_CUDA_SM value");
}
diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp
new file mode 100644
index 0000000..2d74d1d
--- /dev/null
+++ b/llvm/lib/Object/SFrameParser.cpp
@@ -0,0 +1,55 @@
+//===- SFrameParser.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/SFrameParser.h"
+#include "llvm/BinaryFormat/SFrame.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Support/FormatVariadic.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+template <typename T>
+static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data,
+ uint64_t Offset) {
+ static_assert(std::is_trivial_v<T>);
+ if (Data.size() < Offset + sizeof(T)) {
+ return createStringError(
+ formatv("unexpected end of data at offset {0:x} while reading [{1:x}, "
+ "{2:x})",
+ Data.size(), Offset, Offset + sizeof(T))
+ .str(),
+ object_error::unexpected_eof);
+ }
+ return *reinterpret_cast<const T *>(Data.data() + Offset);
+}
+
+template <endianness E>
+Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) {
+ Expected<const sframe::Preamble<E> &> Preamble =
+ getDataSliceAs<sframe::Preamble<E>>(Contents, 0);
+ if (!Preamble)
+ return Preamble.takeError();
+
+ if (Preamble->Magic != sframe::Magic)
+ return createError(
+ formatv("invalid magic number ({0:x+4})", Preamble->Magic.value()));
+ if (Preamble->Version != sframe::Version::V2)
+ return createError(
+ formatv("invalid/unsupported version number ({0})",
+ static_cast<unsigned>(Preamble->Version.value())));
+
+ Expected<const sframe::Header<E> &> Header =
+ getDataSliceAs<sframe::Header<E>>(Contents, 0);
+ if (!Header)
+ return Header.takeError();
+ return SFrameParser(Contents, *Header);
+}
+
+template class llvm::object::SFrameParser<endianness::big>;
+template class llvm::object::SFrameParser<endianness::little>;
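The getDataSliceAs helper in the new parser follows a common "read a trivially-copyable record at a given offset" pattern. A simplified sketch that returns a null pointer instead of an Error; the name tryReadAt is illustrative.

#include "llvm/ADT/ArrayRef.h"
#include <cstdint>
#include <type_traits>

template <typename T>
static const T *tryReadAt(llvm::ArrayRef<uint8_t> Data, uint64_t Offset) {
  static_assert(std::is_trivial_v<T>, "T must be safe to reinterpret");
  if (Data.size() < Offset + sizeof(T))
    return nullptr; // truncated input
  return reinterpret_cast<const T *>(Data.data() + Offset);
}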
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 80fb52f..e15570c 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1189,9 +1189,13 @@ Expected<GVNOptions> parseGVNOptions(StringRef Params) {
} else if (ParamName == "split-backedge-load-pre") {
Result.setLoadPRESplitBackedge(Enable);
} else if (ParamName == "memdep") {
+ // MemDep and MemorySSA are mutually exclusive.
Result.setMemDep(Enable);
+ Result.setMemorySSA(!Enable);
} else if (ParamName == "memoryssa") {
+ // MemDep and MemorySSA are mutually exclusive.
Result.setMemorySSA(Enable);
+ Result.setMemDep(!Enable);
} else {
return make_error<StringError>(
formatv("invalid GVN pass parameter '{}'", ParamName).str(),
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index a579eaf..10b6101 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -380,7 +380,7 @@ if(LLVM_WITH_Z3)
)
endif()
-target_include_directories(LLVMSupport SYSTEM
+target_include_directories(LLVMSupport
PRIVATE
${LLVM_THIRD_PARTY_DIR}/siphash/include
- )
+)
diff --git a/llvm/lib/Support/StringMap.cpp b/llvm/lib/Support/StringMap.cpp
index 432e1fc..3432dc1 100644
--- a/llvm/lib/Support/StringMap.cpp
+++ b/llvm/lib/Support/StringMap.cpp
@@ -45,23 +45,15 @@ static inline unsigned *getHashTable(StringMapEntryBase **TheTable,
uint32_t StringMapImpl::hash(StringRef Key) { return xxh3_64bits(Key); }
-StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) {
- ItemSize = itemSize;
-
+StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize)
+ : ItemSize(itemSize) {
// If a size is specified, initialize the table with that many buckets.
if (InitSize) {
// The table will grow when the number of entries reach 3/4 of the number of
// buckets. To guarantee that "InitSize" number of entries can be inserted
// in the table without growing, we allocate just what is needed here.
init(getMinBucketToReserveForEntries(InitSize));
- return;
}
-
- // Otherwise, initialize it with zero buckets to avoid the allocation.
- TheTable = nullptr;
- NumBuckets = 0;
- NumItems = 0;
- NumTombstones = 0;
}
void StringMapImpl::init(unsigned InitSize) {
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 12fc976..201bfe0 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1205,32 +1205,36 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
Register DstReg = MI.getOperand(0).getReg();
if (DstReg == MI.getOperand(3).getReg()) {
// Expand to BIT
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8
- : AArch64::BITv16i8))
- .add(MI.getOperand(0))
- .add(MI.getOperand(3))
- .add(MI.getOperand(2))
- .add(MI.getOperand(1));
+ auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8
+ : AArch64::BITv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(1));
+ transferImpOps(MI, I, I);
} else if (DstReg == MI.getOperand(2).getReg()) {
// Expand to BIF
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8
- : AArch64::BIFv16i8))
- .add(MI.getOperand(0))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(1));
+ auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8
+ : AArch64::BIFv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(1));
+ transferImpOps(MI, I, I);
} else {
// Expand to BSL, use additional move if required
if (DstReg == MI.getOperand(1).getReg()) {
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
- : AArch64::BSLv16i8))
- .add(MI.getOperand(0))
- .add(MI.getOperand(1))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3));
+ auto I =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+ : AArch64::BSLv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ transferImpOps(MI, I, I);
} else {
BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8
@@ -1240,15 +1244,17 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
getRenamableRegState(MI.getOperand(0).isRenamable()))
.add(MI.getOperand(1))
.add(MI.getOperand(1));
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
- : AArch64::BSLv16i8))
- .add(MI.getOperand(0))
- .addReg(DstReg,
- RegState::Kill |
- getRenamableRegState(MI.getOperand(0).isRenamable()))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3));
+ auto I2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+ : AArch64::BSLv16i8))
+ .add(MI.getOperand(0))
+ .addReg(DstReg,
+ RegState::Kill | getRenamableRegState(
+ MI.getOperand(0).isRenamable()))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ transferImpOps(MI, I2, I2);
}
}
MI.eraseFromParent();
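The new transferImpOps calls keep the pseudo's implicit operands alive across the expansion. As a rough sketch of what such a helper does (the helper already defined in this file may differ in detail; the name and the split between use and def instructions here are assumptions), it copies every operand beyond the pseudo's explicit ones onto the replacement instructions before the pseudo is erased:

static void transferImpOpsSketch(MachineInstr &OldMI,
                                 MachineInstrBuilder &UseMI,
                                 MachineInstrBuilder &DefMI) {
  const MCInstrDesc &Desc = OldMI.getDesc();
  // Everything past the explicit operands of the pseudo is an implicit operand.
  for (const MachineOperand &MO :
       llvm::drop_begin(OldMI.operands(), Desc.getNumOperands())) {
    assert(MO.isReg() && MO.isImplicit() && "expected implicit register operand");
    if (MO.isUse())
      UseMI.add(MO); // implicit uses attach to the instruction reading inputs
    else
      DefMI.add(MO); // implicit defs attach to the instruction producing results
  }
}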
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6c46b18..9f8a257 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1053,13 +1053,6 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
def AArch64uaddlv : SDNode<"AArch64ISD::UADDLV", SDT_AArch64uaddlp>;
def AArch64saddlv : SDNode<"AArch64ISD::SADDLV", SDT_AArch64uaddlp>;
-def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs),
- [(abdu node:$lhs, node:$rhs),
- (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>;
-def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs),
- [(abds node:$lhs, node:$rhs),
- (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>;
-
// Add Pairwise of two vectors
def AArch64addp_n : SDNode<"AArch64ISD::ADDP", SDT_AArch64Zip>;
// Add Long Pairwise
@@ -5667,8 +5660,7 @@ let Predicates = [HasFullFP16] in {
// Advanced SIMD two vector instructions.
//===----------------------------------------------------------------------===//
-defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
- AArch64uabd>;
+defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", abdu>;
// Match UABDL in log2-shuffle patterns.
def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)),
(zext (v8i8 V64:$opB))))),
@@ -6018,8 +6010,8 @@ defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>;
defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
- TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >;
-defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>;
+ TriOpFrag<(add node:$LHS, (abds node:$MHS, node:$RHS))> >;
+defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", abds>;
defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", avgfloors>;
defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
@@ -6037,8 +6029,8 @@ defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
- TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >;
-defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>;
+ TriOpFrag<(add node:$LHS, (abdu node:$MHS, node:$RHS))> >;
+defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", abdu>;
defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", avgflooru>;
defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
@@ -6759,10 +6751,8 @@ defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>
defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>;
-defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
- AArch64sabd>;
-defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
- AArch64sabd>;
+defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>;
+defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>;
defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
@@ -6780,8 +6770,7 @@ defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
-defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
- AArch64uabd>;
+defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", abdu>;
defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>;
defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 0ddd17c..abcd550 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -8,8 +8,8 @@
//
// This pass performs below peephole optimizations on MIR level.
//
-// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
-// MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
+// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
//
// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
// MOVi64imm + ADDXrr ==> ANDXri + ANDXri
@@ -126,7 +126,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
template <typename T>
- bool visitAND(unsigned Opc, MachineInstr &MI);
+ bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0);
bool visitORR(MachineInstr &MI);
bool visitCSEL(MachineInstr &MI);
bool visitINSERT(MachineInstr &MI);
@@ -194,12 +194,12 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
}
template <typename T>
-bool AArch64MIPeepholeOpt::visitAND(
- unsigned Opc, MachineInstr &MI) {
+bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
+ unsigned OtherOpc) {
// Try below transformation.
//
- // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
- // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+ // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
+ // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
//
// The mov pseudo instruction could be expanded to multiple mov instructions
// later. Let's try to split the constant operand of mov instruction into two
@@ -208,10 +208,10 @@ bool AArch64MIPeepholeOpt::visitAND(
return splitTwoPartImm<T>(
MI,
- [Opc](T Imm, unsigned RegSize, T &Imm0,
- T &Imm1) -> std::optional<OpcodePair> {
+ [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
+ T &Imm1) -> std::optional<OpcodePair> {
if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
- return std::make_pair(Opc, Opc);
+ return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
return std::nullopt;
},
[&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
@@ -864,6 +864,12 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
case AArch64::ANDXrr:
Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
break;
+ case AArch64::ANDSWrr:
+ Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri);
+ break;
+ case AArch64::ANDSXrr:
+ Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri);
+ break;
case AArch64::ORRWrs:
Changed |= visitORR(MI);
break;
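A small standalone check (illustrative only, not part of the patch) of the invariant the new ANDS cases depend on: splitBitmaskImm must produce two encodable masks whose conjunction equals the original immediate, which is why the flag-setting opcode can be deferred to the second instruction and still set NZCV from the fully masked value.

static bool splitIsEquivalent(uint64_t Imm, uint64_t Imm0, uint64_t Imm1,
                              uint64_t Src) {
  uint64_t Once = Src & Imm;            // result of the original ANDS
  uint64_t Twice = (Src & Imm0) & Imm1; // ANDri followed by ANDSri
  return (Imm0 & Imm1) == Imm && Once == Twice;
}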
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 2409cc8..0f4f012 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -534,7 +534,7 @@ unsigned AArch64Subtarget::classifyGlobalFunctionReference(
}
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// LNT run (at least on Cyclone) showed reasonably significant gains for
// bi-directional scheduling. 253.perlbmk.
Policy.OnlyTopDown = false;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 154db3c..061ed61 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -343,7 +343,8 @@ public:
}
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
+
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
SDep &Dep,
const TargetSchedModel *SchedModel) const override;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 473ba5e..bb0f667b 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -287,6 +287,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.moreElementsToNextPow2(0)
.lower();
+ getActionDefinitionsBuilder({G_ABDS, G_ABDU})
+ .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
+ .lower();
+
getActionDefinitionsBuilder(
{G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
.legalFor({{s32, s32}, {s64, s32}})
@@ -1794,6 +1798,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerBinOp(AArch64::G_SMULL);
case Intrinsic::aarch64_neon_umull:
return LowerBinOp(AArch64::G_UMULL);
+ case Intrinsic::aarch64_neon_sabd:
+ return LowerBinOp(TargetOpcode::G_ABDS);
+ case Intrinsic::aarch64_neon_uabd:
+ return LowerBinOp(TargetOpcode::G_ABDU);
case Intrinsic::aarch64_neon_abs: {
// Lower the intrinsic to G_ABS.
MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
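The two new intrinsic cases reuse the surrounding LowerBinOp helper. For reference, a helper with that shape (inferred from the visible G_ABS case; the exact lambda in the file is assumed, not quoted) rebuilds the generic opcode from the intrinsic's value operands and drops the original instruction:

auto LowerBinOpSketch = [&MI, &MIB](unsigned Opcode) {
  // Operand 0 is the destination, operand 1 the intrinsic ID, and operands
  // 2 and 3 are the two source values of the intrinsic call.
  MIB.buildInstr(Opcode, {MI.getOperand(0)},
                 {MI.getOperand(2), MI.getOperand(3)});
  MI.eraseFromParent();
  return true;
};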
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 0e0e83b..6076ac4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1848,7 +1848,8 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureImageInsts,
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
- FeatureMemoryAtomicFAddF32DenormalSupport]>;
+ FeatureMemoryAtomicFAddF32DenormalSupport,
+ FeatureRealTrue16Insts]>;
// There are few workarounds that need to be
// added to all targets. This pessimizes codegen
@@ -1868,8 +1869,7 @@ def FeatureISAVersion11_0_Common : FeatureSet<
[FeatureMSAALoadDstSelBug,
FeatureVALUTransUseHazard,
FeatureMADIntraFwdBug,
- FeaturePrivEnabledTrap2NopBug,
- FeatureRealTrue16Insts])>;
+ FeaturePrivEnabledTrap2NopBug])>;
def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_0_Common.Features,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 749b9ef..4b3dc37 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1415,6 +1415,7 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
+ MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress);
if (AMDGPU::isCompute(CC)) {
MD->setHwStage(CC, ".trap_present",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 14101e5..3d8d274 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -374,8 +374,10 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
return true;
}
- unsigned ReturnOpc =
- IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
+ const bool IsWholeWave = MFI->isWholeWaveFunction();
+ unsigned ReturnOpc = IsWholeWave ? AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN
+ : IsShader ? AMDGPU::SI_RETURN_TO_EPILOG
+ : AMDGPU::SI_RETURN;
auto Ret = B.buildInstrNoInsert(ReturnOpc);
if (!FLI.CanLowerReturn)
@@ -383,6 +385,9 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
else if (!lowerReturnVal(B, Val, VRegs, Ret))
return false;
+ if (IsWholeWave)
+ addOriginalExecToReturn(B.getMF(), Ret);
+
// TODO: Handle CalleeSavedRegsViaCopy.
B.insertInstr(Ret);
@@ -632,6 +637,17 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (DL.getTypeStoreSize(Arg.getType()) == 0)
continue;
+ if (Info->isWholeWaveFunction() && Idx == 0) {
+ assert(VRegs[Idx].size() == 1 && "Expected only one register");
+
+ // The first argument for whole wave functions is the original EXEC value.
+ B.buildInstr(AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
+ .addDef(VRegs[Idx][0]);
+
+ ++Idx;
+ continue;
+ }
+
const bool InReg = Arg.hasAttribute(Attribute::InReg);
if (Arg.hasAttribute(Attribute::SwiftSelf) ||
@@ -1347,6 +1363,7 @@ bool AMDGPUCallLowering::lowerTailCall(
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+ Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave &&
!AMDGPU::isChainCC(Info.CallConv)) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
@@ -1524,7 +1541,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// after the ordinary user argument registers.
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
- if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
+ if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+ Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
return false;
@@ -1592,3 +1610,11 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return true;
}
+
+void AMDGPUCallLowering::addOriginalExecToReturn(
+ MachineFunction &MF, MachineInstrBuilder &Ret) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF);
+ Ret.addReg(Setup->getOperand(0).getReg());
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index a6e801f..e0033d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -37,6 +37,9 @@ class AMDGPUCallLowering final : public CallLowering {
bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;
+ void addOriginalExecToReturn(MachineFunction &MF,
+ MachineInstrBuilder &Ret) const;
+
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 7b5d4077e..891d362 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -137,6 +137,9 @@ def gi_global_offset :
def gi_global_saddr :
GIComplexOperandMatcher<s64, "selectGlobalSAddr">,
GIComplexPatternEquiv<GlobalSAddr>;
+def gi_global_saddr_glc :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
+ GIComplexPatternEquiv<GlobalSAddrGLC>;
def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
@@ -312,6 +315,10 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;
+def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
+// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
+// so we don't mark it as equivalent.
+
class GISelSop2Pat <
SDPatternOperator node,
Instruction inst,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 25672a5..3412bb5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1872,6 +1872,23 @@ static SDValue matchZExtFromI32(SDValue Op) {
return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}
+// If this matches *_extend i32:x, return x.
+// Otherwise, if the value is already i32, return it.
+static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
+ const SelectionDAG *DAG) {
+ if (Op.getValueType() == MVT::i32)
+ return Op;
+
+ if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
+ Op.getOpcode() != ISD::ANY_EXTEND &&
+ !(DAG->SignBitIsZero(Op) &&
+ Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
+ return SDValue();
+
+ SDValue ExtSrc = Op.getOperand(0);
+ return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
+}
+
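// Standalone illustration (not part of the patch) of why the SignBitIsZero
// escape hatch above is sound: when the top bit of the 32-bit value is clear,
// sign- and zero-extension to 64 bits agree, so either extend opcode can
// satisfy the caller's request for the other kind.
static bool extendsAgreeSketch(uint32_t X) {
  uint64_t ZExt = (uint64_t)X;
  uint64_t SExt = (uint64_t)(int64_t)(int32_t)X;
  return ZExt == SExt; // true exactly when the sign bit of X is 0
}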
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
SDValue Addr,
@@ -1968,6 +1985,29 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
+ SDValue &SAddr, SDValue &VOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
+ return false;
+
+ CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
+ SDValue &SAddr, SDValue &VOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
+ return false;
+
+ unsigned CPolVal = AMDGPU::CPol::GLC;
+ CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
+ return true;
+}
+
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
@@ -2136,17 +2176,59 @@ bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
return true;
}
+// Given \p Offset and load node \p N, check if \p Offset is a multiple of the
+// load byte size. If it is, update \p Offset to a pre-scaled value and return
+// true.
+bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
+ bool IsSigned) const {
+ bool ScaleOffset = false;
+ if (!Subtarget->hasScaleOffset() || !Offset)
+ return false;
+
+ unsigned Size =
+ (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
+
+ SDValue Off = Offset;
+ if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
+ Off = Ext;
+
+ if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
+ ScaleOffset = C->getZExtValue() == Log2_32(Size);
+ } else if (Offset.getOpcode() == ISD::MUL ||
+ (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
+ Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
+ (Offset.isMachineOpcode() &&
+ Offset.getMachineOpcode() ==
+ (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
+ : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
+ ScaleOffset = C->getZExtValue() == Size;
+ }
+
+ if (ScaleOffset)
+ Offset = Off.getOperand(0);
+
+ return ScaleOffset;
+}
+
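// Illustrative helper (not part of the patch) capturing the scaling rule used
// above: an offset of the form (x << log2(Size)) or (x * Size) is already a
// byte offset scaled by the access size, so the shift/multiply can be stripped
// and the hardware scale_offset mode used instead.
static bool isPreScaledSketch(uint64_t ShiftOrMulAmt, bool IsShift,
                              uint64_t Size) {
  if (IsShift)
    return isPowerOf2_64(Size) && ShiftOrMulAmt == Log2_64(Size);
  return ShiftOrMulAmt == Size;
}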
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
SDValue *SOffset, SDValue *Offset,
bool Imm32Only, bool IsBuffer,
- bool HasSOffset,
- int64_t ImmOffset) const {
+ bool HasSOffset, int64_t ImmOffset,
+ bool *ScaleOffset) const {
assert((!SOffset || !Offset) &&
"Cannot match both soffset and offset at the same time!");
+ if (ScaleOffset) {
+ assert(N && SOffset);
+
+ *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
+ }
+
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
if (!C) {
if (!SOffset)
@@ -2231,24 +2313,25 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
- SDValue *SOffset, SDValue *Offset,
- bool Imm32Only, bool IsBuffer,
- bool HasSOffset,
- int64_t ImmOffset) const {
+bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
+ SDValue &SBase, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only,
+ bool IsBuffer, bool HasSOffset,
+ int64_t ImmOffset,
+ bool *ScaleOffset) const {
if (SOffset && Offset) {
assert(!Imm32Only && !IsBuffer);
SDValue B;
- if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
+ if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
return false;
int64_t ImmOff = 0;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
ImmOff = C->getSExtValue();
- return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
- ImmOff);
+ return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
+ true, ImmOff, ScaleOffset);
}
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2268,23 +2351,25 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
if (!N0 || !N1)
return false;
- if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
- ImmOffset)) {
+ if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+ ImmOffset, ScaleOffset)) {
SBase = N0;
return true;
}
- if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
- ImmOffset)) {
+ if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+ ImmOffset, ScaleOffset)) {
SBase = N1;
return true;
}
return false;
}
-bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
SDValue *SOffset, SDValue *Offset,
- bool Imm32Only) const {
- if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
+ bool Imm32Only, bool *ScaleOffset) const {
+ if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
+ /* IsBuffer */ false, /* HasSOffset */ false,
+ /* ImmOffset */ 0, ScaleOffset)) {
SBase = Expand32BitAddress(SBase);
return true;
}
@@ -2300,36 +2385,51 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
+ return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
+ &Offset);
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
- /* Imm32Only */ true);
+ return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
+ &Offset, /* Imm32Only */ true);
}
-bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
- SDValue &SOffset) const {
- return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
+bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
+ SDValue &SOffset, SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
+ /* Imm32Only */ false, &ScaleOffset))
+ return false;
+
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(N), MVT::i32);
+ return true;
}
-bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
- SDValue &SOffset,
- SDValue &Offset) const {
- return SelectSMRD(Addr, SBase, &SOffset, &Offset);
+bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
+ SDValue &SBase, SDValue &SOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
+ return false;
+
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(N), MVT::i32);
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
- return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+ return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
/* Imm32Only */ false, /* IsBuffer */ true);
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
SDValue &Offset) const {
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+ return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
/* Imm32Only */ true, /* IsBuffer */ true);
}
@@ -2338,9 +2438,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
// Match the (soffset + offset) pair as a 32-bit register base and
// an immediate offset.
return N.getValueType() == MVT::i32 &&
- SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
- &Offset, /* Imm32Only */ false,
- /* IsBuffer */ true);
+ SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
+ /* SOffset*/ nullptr, &Offset,
+ /* Imm32Only */ false, /* IsBuffer */ true);
}
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 9967f46..f7c7b3e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -163,6 +163,12 @@ private:
SDValue &Offset) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset) const;
+ bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &Offset,
+ SDValue &CPol) const;
+ bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &Offset,
+ SDValue &CPol) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &Offset) const;
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
@@ -170,22 +176,28 @@ private:
bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &SAddr, SDValue &Offset) const;
- bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
+ bool SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false,
bool IsBuffer = false, bool HasSOffset = false,
- int64_t ImmOffset = 0) const;
+ int64_t ImmOffset = 0,
+ bool *ScaleOffset = nullptr) const;
SDValue Expand32BitAddress(SDValue Addr) const;
- bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
- SDValue *Offset, bool Imm32Only = false,
- bool IsBuffer = false, bool HasSOffset = false,
- int64_t ImmOffset = 0) const;
- bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
- SDValue *Offset, bool Imm32Only = false) const;
+ bool SelectSMRDBaseOffset(SDNode *N, SDValue Addr, SDValue &SBase,
+ SDValue *SOffset, SDValue *Offset,
+ bool Imm32Only = false, bool IsBuffer = false,
+ bool HasSOffset = false, int64_t ImmOffset = 0,
+ bool *ScaleOffset = nullptr) const;
+ bool SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only = false,
+ bool *ScaleOffset = nullptr) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
- bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const;
- bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset,
- SDValue &Offset) const;
+ bool SelectScaleOffset(SDNode *N, SDValue &Offset, bool IsSigned) const;
+ bool SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase, SDValue &SOffset,
+ SDValue &CPol) const;
+ bool SelectSMRDSgprImm(SDNode *N, SDValue Addr, SDValue &SBase,
+ SDValue &SOffset, SDValue &Offset,
+ SDValue &CPol) const;
bool SelectSMRDBufferImm(SDValue N, SDValue &Offset) const;
bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3d040fb..e3ca09e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -375,7 +375,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
- setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
@@ -1143,6 +1142,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::Cold:
return CC_AMDGPU_Func;
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return CC_SI_Gfx;
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
@@ -1168,6 +1168,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
case CallingConv::AMDGPU_LS:
return RetCC_SI_Shader;
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return RetCC_SI_Gfx;
case CallingConv::C:
case CallingConv::Fast:
@@ -5875,6 +5876,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
+ NODE_NAME_CASE(WHOLE_WAVE_SETUP)
+ NODE_NAME_CASE(WHOLE_WAVE_RETURN)
}
return nullptr;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 4e8c6c7..39bb0ad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -608,6 +608,12 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_FMAX,
BUFFER_ATOMIC_COND_SUB_U32,
LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32,
+
+ // Set up a whole wave function.
+ WHOLE_WAVE_SETUP,
+
+ // Return from a whole wave function.
+ WHOLE_WAVE_RETURN,
};
} // End namespace AMDGPUISD
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index e2c2e89..f2207ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1694,6 +1694,47 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
NewII->takeName(&II);
return IC.replaceInstUsesWith(II, NewII);
}
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
+ Value *Src0 = II.getArgOperand(1);
+ Value *Src1 = II.getArgOperand(3);
+ unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
+ uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
+ auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
+ auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
+
+ bool MadeChange = false;
+ unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
+ unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
+
+ // Depending on the format used, fewer registers may be required, so shrink
+ // the vector type.
+ if (Src0Ty->getNumElements() > Src0NumElts) {
+ Src0 = IC.Builder.CreateExtractVector(
+ FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
+ IC.Builder.getInt64(0));
+ MadeChange = true;
+ }
+
+ if (Src1Ty->getNumElements() > Src1NumElts) {
+ Src1 = IC.Builder.CreateExtractVector(
+ FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
+ IC.Builder.getInt64(0));
+ MadeChange = true;
+ }
+
+ if (!MadeChange)
+ return std::nullopt;
+
+ SmallVector<Value *, 13> Args(II.args());
+ Args[1] = Src0;
+ Args[3] = Src1;
+
+ CallInst *NewII = IC.Builder.CreateIntrinsic(
+ IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
+ Args, &II);
+ NewII->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewII);
+ }
}
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
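A condensed sketch of the operand-shrinking step above (the target lane count is a parameter here; in the patch it comes from wmmaScaleF8F6F4FormatToNumRegs, and the helper name is made up): keep only the leading lanes of an over-wide vector operand via a subvector extract.

static Value *shrinkToLeadingLanes(IRBuilderBase &B, Value *V,
                                   unsigned NumElts) {
  auto *VTy = cast<FixedVectorType>(V->getType());
  if (VTy->getNumElements() <= NumElts)
    return V; // already narrow enough, nothing to do
  auto *NarrowTy = FixedVectorType::get(VTy->getElementType(), NumElts);
  // Extract lanes [0, NumElts) as a narrower vector of the same element type.
  return B.CreateExtractVector(NarrowTy, V, B.getInt64(0));
}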
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index ce58e93..e305f08 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -348,6 +348,17 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+// Marks the entry into a whole wave function.
+def AMDGPUwhole_wave_setup : SDNode<
+ "AMDGPUISD::WHOLE_WAVE_SETUP", SDTypeProfile<1, 0, [SDTCisInt<0>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+// Marks the return from a whole wave function.
+def AMDGPUwhole_wave_return : SDNode<
+ "AMDGPUISD::WHOLE_WAVE_RETURN", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
SDTCisInt<0>, // i8 tgt
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 1a63c48..d2e718c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3494,25 +3494,74 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
}
/// Match a zero extend from a 32-bit value to 64-bits.
-static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
+Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
Register ZExtSrc;
- if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
- return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+ if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
+ return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
// Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
- const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
return Register();
assert(Def->getNumOperands() == 3 &&
- MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
- if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
+ MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+ if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
return Def->getOperand(1).getReg();
}
return Register();
}
+/// Match a sign extend from a 32-bit value to 64-bits.
+Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
+ Register SExtSrc;
+ if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
+ return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
+
+ // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (G_ASHR (s32 %x), 31)
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
+ if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+ return Register();
+
+ assert(Def->getNumOperands() == 3 &&
+ MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+ if (mi_match(Def->getOperand(2).getReg(), *MRI,
+ m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
+ m_SpecificICst(31))))
+ return Def->getOperand(1).getReg();
+
+ if (VT->signBitIsZero(Reg))
+ return matchZeroExtendFromS32(Reg);
+
+ return Register();
+}
+
+/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
+/// is 32-bit.
+Register
+AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
+ return MRI->getType(Reg) == LLT::scalar(32) ? Reg
+ : matchZeroExtendFromS32(Reg);
+}
+
+/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
+/// is 32-bit.
+Register
+AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
+ return MRI->getType(Reg) == LLT::scalar(32) ? Reg
+ : matchSignExtendFromS32(Reg);
+}
+
+Register
+AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
+ bool IsSigned) const {
+ if (IsSigned)
+ return matchSignExtendFromS32OrS32(Reg);
+
+ return matchZeroExtendFromS32OrS32(Reg);
+}
+
Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
Register AnyExtSrc;
if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
@@ -3581,7 +3630,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
if (isSGPR(SAddr)) {
Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
- if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
Addr = SAddr;
VOffset = Off;
}
@@ -4160,6 +4209,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return true;
case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
return selectWaveAddress(I);
+ case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
+ I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
+ return true;
+ }
case AMDGPU::G_STACKRESTORE:
return selectStackRestore(I);
case AMDGPU::G_PHI:
@@ -5219,7 +5272,7 @@ AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
unsigned Key = 0;
- Register S32 = matchZeroExtendFromS32(*MRI, Src);
+ Register S32 = matchZeroExtendFromS32(Src);
if (!S32)
S32 = matchAnyExtendFromS32(Src);
@@ -5292,10 +5345,68 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
}};
}
+// Given \p Offset and the load specified by the \p Root operand, check if
+// \p Offset is a multiple of the load byte size. If it is, update \p Offset to
+// a pre-scaled value and return true.
+bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
+ Register &Offset,
+ bool IsSigned) const {
+ if (!Subtarget->hasScaleOffset())
+ return false;
+
+ const MachineInstr &MI = *Root.getParent();
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ if (!MMO->getSize().hasValue())
+ return false;
+
+ uint64_t Size = MMO->getSize().getValue();
+
+ Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
+ if (!OffsetReg)
+ OffsetReg = Offset;
+
+ if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
+ OffsetReg = Def->Reg;
+
+ Register Op0;
+ MachineInstr *Mul;
+ bool ScaleOffset =
+ (isPowerOf2_64(Size) &&
+ mi_match(OffsetReg, *MRI,
+ m_GShl(m_Reg(Op0),
+ m_any_of(m_SpecificICst(Log2_64(Size)),
+ m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
+ mi_match(OffsetReg, *MRI,
+ m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
+ m_Copy(m_SpecificICst(Size))))) ||
+ mi_match(
+ OffsetReg, *MRI,
+ m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
+ m_Reg(Op0), m_SpecificICst(Size))) ||
+ // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
+ (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
+ (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
+ : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
+ (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
+ VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
+ mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
+ mi_match(Mul->getOperand(3).getReg(), *MRI,
+ m_GTrunc(m_any_of(m_SpecificICst(Size),
+ m_Copy(m_SpecificICst(Size))))) &&
+ mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
+
+ if (ScaleOffset)
+ Offset = Op0;
+
+ return ScaleOffset;
+}
+
bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
Register &Base,
Register *SOffset,
- int64_t *Offset) const {
+ int64_t *Offset,
+ bool *ScaleOffset) const {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
@@ -5310,6 +5421,9 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
const GEPInfo &GEPI = AddrInfo[0];
std::optional<int64_t> EncodedImm;
+ if (ScaleOffset)
+ *ScaleOffset = false;
+
if (SOffset && Offset) {
EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
/*HasSOffset=*/true);
@@ -5317,8 +5431,12 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
AddrInfo.size() > 1) {
const GEPInfo &GEPI2 = AddrInfo[1];
if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
- if (Register OffsetReg =
- matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
+ Register OffsetReg = GEPI2.SgprParts[1];
+ if (ScaleOffset)
+ *ScaleOffset =
+ selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
+ OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
+ if (OffsetReg) {
Base = GEPI2.SgprParts[0];
*SOffset = OffsetReg;
*Offset = *EncodedImm;
@@ -5363,7 +5481,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
}
if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
- if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
+ Register OffsetReg = GEPI.SgprParts[1];
+ if (ScaleOffset)
+ *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
+ OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
+ if (OffsetReg) {
Base = GEPI.SgprParts[0];
*SOffset = OffsetReg;
return true;
@@ -5377,7 +5499,8 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
Register Base;
int64_t Offset;
- if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
+ if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
+ /* ScaleOffset */ nullptr))
return std::nullopt;
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
@@ -5408,23 +5531,30 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
Register Base, SOffset;
- if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
+ bool ScaleOffset;
+ if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
+ &ScaleOffset))
return std::nullopt;
+ unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
- [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
Register Base, SOffset;
int64_t Offset;
- if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
+ bool ScaleOffset;
+ if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
return std::nullopt;
+ unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
}
std::pair<Register, int>
@@ -5485,7 +5615,8 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
+ unsigned CPolBits) const {
Register Addr = Root.getReg();
Register PtrBase;
int64_t ConstOffset;
@@ -5529,6 +5660,7 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
MIB.addReg(HighBits);
}, // voffset
[=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
}};
}
}
@@ -5559,7 +5691,7 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
// It's possible voffset is an SGPR here, but the copy to VGPR will be
// inserted later.
- if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ if (Register VOffset = matchZeroExtendFromS32(PtrBaseOffset)) {
return {{[=](MachineInstrBuilder &MIB) { // saddr
MIB.addReg(SAddr);
},
@@ -5568,6 +5700,9 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
},
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(ImmOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // cpol
+ MIB.addImm(CPolBits);
}}};
}
}
@@ -5591,11 +5726,22 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
[=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
}};
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
+ return selectGlobalSAddr(Root, 0);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
+ return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
Register Addr = Root.getReg();
Register PtrBase;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 2cb7904..e58fbb4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -232,8 +232,10 @@ private:
InstructionSelector::ComplexRendererFns
selectVINTERPModsHi(MachineOperand &Root) const;
+ bool selectScaleOffset(MachineOperand &Root, Register &Offset,
+ bool IsSigned) const;
bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
- int64_t *Offset) const;
+ int64_t *Offset, bool *ScaleOffset) const;
InstructionSelector::ComplexRendererFns
selectSmrdImm(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
@@ -254,7 +256,11 @@ private:
selectScratchOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits) const;
+ InstructionSelector::ComplexRendererFns
selectGlobalSAddr(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrGLC(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectScratchSAddr(MachineOperand &Root) const;
@@ -417,6 +423,19 @@ private:
// shift amount operand's `ShAmtBits` bits is unneeded.
bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const;
+ /// Match a zero extend from a 32-bit value to 64-bits.
+ Register matchZeroExtendFromS32(Register Reg) const;
+ /// Match a sign extend from a 32-bit value to 64-bits.
+ Register matchSignExtendFromS32(Register Reg) const;
+ /// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
+ /// is 32-bit.
+ Register matchZeroExtendFromS32OrS32(Register Reg) const;
+ /// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
+ /// is 32-bit.
+ Register matchSignExtendFromS32OrS32(Register Reg) const;
+ /// Match either sign or zero extend depending on the \p IsSigned from a
+ /// 32-bit value to 64-bits, or \p Reg itself if it is 32-bit.
+ Register matchExtendFromS32OrS32(Register Reg, bool IsSigned) const;
/// Match an any extend from a 32-bit value to 64-bit.
Register matchAnyExtendFromS32(Register Reg) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index fa8af68..304e91e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1583,15 +1583,13 @@ void SplitPtrStructs::killAndReplaceSplitInstructions(
if (!SplitUsers.contains(I))
continue;
- SmallVector<DbgValueInst *> Dbgs;
- findDbgValues(Dbgs, I);
- for (auto *Dbg : Dbgs) {
- IRB.SetInsertPoint(Dbg);
+ SmallVector<DbgVariableRecord *> Dbgs;
+ findDbgValues(I, Dbgs);
+ for (DbgVariableRecord *Dbg : Dbgs) {
auto &DL = I->getDataLayout();
assert(isSplitFatPtr(I->getType()) &&
"We should've RAUW'd away loads, stores, etc. at this point");
- auto *OffDbg = cast<DbgValueInst>(Dbg->clone());
- copyMetadata(OffDbg, Dbg);
+ DbgVariableRecord *OffDbg = Dbg->clone();
auto [Rsrc, Off] = getPtrParts(I);
int64_t RsrcSz = DL.getTypeSizeInBits(Rsrc->getType());
@@ -1606,9 +1604,9 @@ void SplitPtrStructs::killAndReplaceSplitInstructions(
if (OffExpr) {
OffDbg->setExpression(*OffExpr);
OffDbg->replaceVariableLocationOp(I, Off);
- IRB.Insert(OffDbg);
+ OffDbg->insertBefore(Dbg);
} else {
- OffDbg->deleteValue();
+ OffDbg->eraseFromParent();
}
if (RsrcExpr) {
Dbg->setExpression(*RsrcExpr);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index bf2f37b..f1caf24 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4714,6 +4714,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8:
case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8:
case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
@@ -5540,6 +5541,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_PREFETCH:
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
break;
+ case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP:
+ case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN:
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ break;
}
return getInstructionMapping(/*ID*/1, /*Cost*/1,
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 43d4e8db..421fc42 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -176,6 +176,8 @@ public:
ImmTyWaitVAVDst,
ImmTyWaitVMVSrc,
ImmTyBitOp3,
+ ImmTyMatrixAFMT,
+ ImmTyMatrixBFMT,
ImmTyMatrixAReuse,
ImmTyMatrixBReuse,
ImmTyByteSel,
@@ -423,6 +425,8 @@ public:
bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); }
bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); }
bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); }
+ bool isMatrixAFMT() const { return isImmTy(ImmTyMatrixAFMT); }
+ bool isMatrixBFMT() const { return isImmTy(ImmTyMatrixBFMT); }
bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); }
bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
@@ -1174,6 +1178,8 @@ public:
case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break;
case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;
case ImmTyBitOp3: OS << "BitOp3"; break;
+ case ImmTyMatrixAFMT: OS << "ImmTyMatrixAFMT"; break;
+ case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break;
case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break;
case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break;
case ImmTyByteSel: OS << "ByteSel" ; break;
@@ -1714,6 +1720,10 @@ public:
ParseStatus parseIndexKey8bit(OperandVector &Operands);
ParseStatus parseIndexKey16bit(OperandVector &Operands);
ParseStatus parseIndexKey32bit(OperandVector &Operands);
+ ParseStatus tryParseMatrixFMT(OperandVector &Operands, StringRef Name,
+ AMDGPUOperand::ImmTy Type);
+ ParseStatus parseMatrixAFMT(OperandVector &Operands);
+ ParseStatus parseMatrixBFMT(OperandVector &Operands);
ParseStatus parseDfmtNfmt(int64_t &Format);
ParseStatus parseUfmt(int64_t &Format);
@@ -1849,6 +1859,7 @@ private:
const unsigned CPol);
bool validateTFE(const MCInst &Inst, const OperandVector &Operands);
std::optional<StringRef> validateLdsDirect(const MCInst &Inst);
+ bool validateWMMA(const MCInst &Inst, const OperandVector &Operands);
unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@@ -5128,13 +5139,45 @@ bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const {
bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const {
auto FB = getFeatureBits();
+ if (!FB[AMDGPU::FeatureGFX90AInsts] && !FB[AMDGPU::FeatureGFX1250Insts])
+ return true;
+
unsigned Opc = Inst.getOpcode();
+ const MCRegisterInfo *MRI = getMRI();
// DS_READ_B96_TR_B6 is the only DS instruction in GFX950 that allows
// unaligned VGPR. All others only allow even aligned VGPRs.
- if (!(FB[AMDGPU::FeatureGFX90AInsts]) || Opc == AMDGPU::DS_READ_B96_TR_B6_vi)
+ if (FB[AMDGPU::FeatureGFX90AInsts] && Opc == AMDGPU::DS_READ_B96_TR_B6_vi)
return true;
- const MCRegisterInfo *MRI = getMRI();
+ if (FB[AMDGPU::FeatureGFX1250Insts]) {
+ switch (Opc) {
+ default:
+ break;
+ case AMDGPU::DS_LOAD_TR6_B96:
+ case AMDGPU::DS_LOAD_TR6_B96_gfx12:
+ // DS_LOAD_TR6_B96 is the only DS instruction in GFX1250 that
+ // allows unaligned VGPR. All others only allow even aligned VGPRs.
+ return true;
+ case AMDGPU::GLOBAL_LOAD_TR6_B96:
+ case AMDGPU::GLOBAL_LOAD_TR6_B96_gfx1250: {
+ // GLOBAL_LOAD_TR6_B96 is the only GLOBAL instruction in GFX1250 that
+ // allows unaligned VGPR for vdst, but other operands still only allow
+ // even aligned VGPRs.
+ int VAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
+ if (VAddrIdx != -1) {
+ const MCOperand &Op = Inst.getOperand(VAddrIdx);
+ MCRegister Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
+ if ((Sub - AMDGPU::VGPR0) & 1)
+ return false;
+ }
+ return true;
+ }
+ case AMDGPU::GLOBAL_LOAD_TR6_B96_SADDR:
+ case AMDGPU::GLOBAL_LOAD_TR6_B96_SADDR_gfx1250:
+ return true;
+ }
+ }
+
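// The vaddr alignment test above, in isolation (illustrative only): a VGPR
// tuple counts as even aligned when its first register has an even index
// relative to VGPR0, so a tuple starting at v2 passes and one starting at v3
// is rejected.
static bool isEvenAlignedVGPRSketch(unsigned FirstRegIndexFromVGPR0) {
  return (FirstRegIndexFromVGPR0 & 1) == 0;
}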
const MCRegisterClass &VGPR32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID);
const MCRegisterClass &AGPR32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID);
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
@@ -5280,6 +5323,28 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
unsigned CPol = Inst.getOperand(CPolPos).getImm();
+ if (!isGFX1250()) {
+ if (CPol & CPol::SCAL) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
+ Error(S, "scale_offset is not supported on this GPU");
+ }
+ if (CPol & CPol::NV) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("nv")]);
+ Error(S, "nv is not supported on this GPU");
+ }
+ }
+
+ if ((CPol & CPol::SCAL) && !supportsScaleOffset(MII, Inst.getOpcode())) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
+ Error(S, "scale_offset is not supported for this instruction");
+ }
+
if (isGFX12Plus())
return validateTHAndScopeBits(Inst, Operands, CPol);
@@ -5400,6 +5465,37 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst,
return true;
}
+bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,
+ const OperandVector &Operands) {
+ unsigned Opc = Inst.getOpcode();
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ auto validateFmt = [&](AMDGPU::OpName FmtOp, AMDGPU::OpName SrcOp) -> bool {
+ int FmtIdx = AMDGPU::getNamedOperandIdx(Opc, FmtOp);
+ if (FmtIdx == -1)
+ return true;
+ unsigned Fmt = Inst.getOperand(FmtIdx).getImm();
+ int SrcIdx = AMDGPU::getNamedOperandIdx(Opc, SrcOp);
+ unsigned RegSize =
+ TRI->getRegClass(Desc.operands()[SrcIdx].RegClass).getSizeInBits();
+
+ if (RegSize == AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt) * 32)
+ return true;
+
+ static const char *FmtNames[] = {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8",
+ "MATRIX_FMT_FP6", "MATRIX_FMT_BF6",
+ "MATRIX_FMT_FP4"};
+
+ Error(getRegLoc(mc2PseudoReg(Inst.getOperand(SrcIdx).getReg()), Operands),
+ "wrong register tuple size for " + Twine(FmtNames[Fmt]));
+ return false;
+ };
+
+ return validateFmt(AMDGPU::OpName::matrix_a_fmt, AMDGPU::OpName::src0) &&
+ validateFmt(AMDGPU::OpName::matrix_b_fmt, AMDGPU::OpName::src1);
+}
+
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
const SMLoc &IDLoc,
const OperandVector &Operands) {
@@ -5533,6 +5629,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateTFE(Inst, Operands)) {
return false;
}
+ if (!validateWMMA(Inst, Operands)) {
+ return false;
+ }
return true;
}
@@ -6916,6 +7015,8 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
int64_t CPolVal = 0;
ParseStatus ResTH = ParseStatus::NoMatch;
ParseStatus ResScope = ParseStatus::NoMatch;
+ ParseStatus ResNV = ParseStatus::NoMatch;
+ ParseStatus ResScal = ParseStatus::NoMatch;
for (;;) {
if (ResTH.isNoMatch()) {
@@ -6940,10 +7041,36 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
}
}
+ // The NV bit exists on GFX12+, but only has an effect starting with GFX1250.
+ // Allow parsing it on all GFX12 targets and reject it during validation for
+ // better diagnostics.
+ if (ResNV.isNoMatch()) {
+ if (trySkipId("nv")) {
+ ResNV = ParseStatus::Success;
+ CPolVal |= CPol::NV;
+ continue;
+ } else if (trySkipId("no", "nv")) {
+ ResNV = ParseStatus::Success;
+ continue;
+ }
+ }
+
+ if (ResScal.isNoMatch()) {
+ if (trySkipId("scale_offset")) {
+ ResScal = ParseStatus::Success;
+ CPolVal |= CPol::SCAL;
+ continue;
+ } else if (trySkipId("no", "scale_offset")) {
+ ResScal = ParseStatus::Success;
+ continue;
+ }
+ }
+
break;
}
- if (ResTH.isNoMatch() && ResScope.isNoMatch())
+ if (ResTH.isNoMatch() && ResScope.isNoMatch() && ResNV.isNoMatch() &&
+ ResScal.isNoMatch())
return ParseStatus::NoMatch;
Operands.push_back(AMDGPUOperand::CreateImm(this, CPolVal, StringLoc,
@@ -7191,6 +7318,26 @@ ParseStatus AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) {
return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey32bit);
}
+ParseStatus AMDGPUAsmParser::tryParseMatrixFMT(OperandVector &Operands,
+ StringRef Name,
+ AMDGPUOperand::ImmTy Type) {
+ return parseStringOrIntWithPrefix(Operands, Name,
+ {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8",
+ "MATRIX_FMT_FP6", "MATRIX_FMT_BF6",
+ "MATRIX_FMT_FP4"},
+ Type);
+}
+
+ParseStatus AMDGPUAsmParser::parseMatrixAFMT(OperandVector &Operands) {
+ return tryParseMatrixFMT(Operands, "matrix_a_fmt",
+ AMDGPUOperand::ImmTyMatrixAFMT);
+}
+
+ParseStatus AMDGPUAsmParser::parseMatrixBFMT(OperandVector &Operands) {
+ return tryParseMatrixFMT(Operands, "matrix_b_fmt",
+ AMDGPUOperand::ImmTyMatrixBFMT);
+}
+
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
// values to live in a joint format operand in the MCInst encoding.
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
@@ -9292,6 +9439,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
DefaultVal);
}
+ int MatrixAFMTIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_fmt);
+ if (MatrixAFMTIdx != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyMatrixAFMT, 0);
+ }
+
+ int MatrixBFMTIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_fmt);
+ if (MatrixBFMTIdx != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyMatrixBFMT, 0);
+ }
+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse))
addOptionalImmOperand(Inst, Operands, OptIdx,
AMDGPUOperand::ImmTyMatrixAReuse, 0);
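
As a rough, standalone illustration of the tuple-size check validateWMMA performs above (not taken from the patch): the register class bound to a matrix source must provide exactly 32 bits per register implied by the selected format. numRegsForFormat is a hypothetical stand-in for AMDGPU::wmmaScaleF8F6F4FormatToNumRegs, with register counts assumed from the 16/12/8-register cases the disassembler handles further down.

#include <cassert>

// Assumed register counts per matrix format (FP8/BF8 -> 16, FP6/BF6 -> 12,
// FP4 -> 8); a stand-in for AMDGPU::wmmaScaleF8F6F4FormatToNumRegs().
static unsigned numRegsForFormat(unsigned Fmt) {
  switch (Fmt) {
  case 0: case 1: return 16; // MATRIX_FMT_FP8, MATRIX_FMT_BF8
  case 2: case 3: return 12; // MATRIX_FMT_FP6, MATRIX_FMT_BF6
  case 4:         return 8;  // MATRIX_FMT_FP4
  }
  return 0;
}

// Mirrors the core condition in validateFmt: the source register class must
// be NumRegs x 32 bits wide, otherwise the "wrong register tuple size"
// diagnostic is emitted.
static bool tupleSizeMatchesFormat(unsigned RegSizeInBits, unsigned Fmt) {
  return RegSizeInBits == numRegsForFormat(Fmt) * 32;
}

int main() {
  assert(tupleSizeMatchesFormat(512, 0));  // 16-register tuple with FP8: ok
  assert(!tupleSizeMatchesFormat(512, 4)); // FP4 expects a 256-bit tuple
  return 0;
}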
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 0caabe4..f99e716 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1488,7 +1488,6 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3f32, "BUFFER_STORE_FORMAT_XYZ">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMAT_XYZ">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
@@ -2451,6 +2450,7 @@ class VBUFFER_Real <bits<8> op, BUF_Pseudo ps, string real_name> :
let Inst{62} = ps.offen;
let Inst{63} = ps.idxen;
+ let Inst{7} = cpol{5}; // nv
let Inst{54-53} = cpol{2-1}; // th{2-1}
let Inst{52} = !if(ps.IsAtomicRet, 1, cpol{0}); // th{0}
let Inst{51-50} = cpol{4-3}; // scope
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index e219fe0..319cc9d 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -886,7 +886,6 @@ defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "extloadi8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "zextloadi8_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
-defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">;
defm : DSReadPat_t16 <DS_READ_I8, i16, "sextloadi8_local">;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 98f7e17..5c1989b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -877,6 +877,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsMAI)
convertMAIInst(MI);
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsWMMA)
+ convertWMMAInst(MI);
+
int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdst_in);
if (VDstIn_Idx != -1) {
@@ -974,10 +977,23 @@ static void adjustMFMA_F8F6F4OpRegClass(const MCRegisterInfo &MRI,
return MO.setReg(
MRI.getSubReg(MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5));
case 8:
+ if (MCRegister NewReg = MRI.getSubReg(
+ MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7)) {
+ MO.setReg(NewReg);
+ }
+ return;
+ case 12: {
+ // There is no 384-bit subreg index defined.
+ MCRegister BaseReg = MRI.getSubReg(MO.getReg(), AMDGPU::sub0);
+ MCRegister NewReg = MRI.getMatchingSuperReg(
+ BaseReg, AMDGPU::sub0, &MRI.getRegClass(AMDGPU::VReg_384RegClassID));
+ return MO.setReg(NewReg);
+ }
+ case 16:
// No-op in cases where one operand is still f8/bf8.
return;
default:
- llvm_unreachable("Unexpected size for mfma f8f6f4 operand");
+ llvm_unreachable("Unexpected size for mfma/wmma f8f6f4 operand");
}
}
@@ -1015,6 +1031,35 @@ void AMDGPUDisassembler::convertMAIInst(MCInst &MI) const {
AdjustedRegClassOpcode->NumRegsSrcB);
}
+void AMDGPUDisassembler::convertWMMAInst(MCInst &MI) const {
+ int FmtAIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::matrix_a_fmt);
+ if (FmtAIdx == -1)
+ return;
+
+ int FmtBIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::matrix_b_fmt);
+
+ unsigned FmtA = MI.getOperand(FmtAIdx).getImm();
+ unsigned FmtB = MI.getOperand(FmtBIdx).getImm();
+
+ const AMDGPU::MFMA_F8F6F4_Info *AdjustedRegClassOpcode =
+ AMDGPU::getWMMA_F8F6F4_WithFormatArgs(FmtA, FmtB, MI.getOpcode());
+ if (!AdjustedRegClassOpcode ||
+ AdjustedRegClassOpcode->Opcode == MI.getOpcode())
+ return;
+
+ MI.setOpcode(AdjustedRegClassOpcode->Opcode);
+ int Src0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+ int Src1Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
+ adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(Src0Idx),
+ AdjustedRegClassOpcode->NumRegsSrcA);
+ adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(Src1Idx),
+ AdjustedRegClassOpcode->NumRegsSrcB);
+}
+
struct VOPModifiers {
unsigned OpSel = 0;
unsigned OpSelHi = 0;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 8404100..f4d164b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -161,6 +161,7 @@ public:
void convertFMAanyK(MCInst &MI) const;
void convertSDWAInst(MCInst &MI) const;
void convertMAIInst(MCInst &MI) const;
+ void convertWMMAInst(MCInst &MI) const;
void convertDPP8Inst(MCInst &MI) const;
void convertMIMGInst(MCInst &MI) const;
void convertVOP3DPPInst(MCInst &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c8a4e22..1cc717b 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -11,7 +11,8 @@ let WantsRoot = true in {
def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>;
def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>;
- def GlobalSAddr : ComplexPattern<iPTR, 3, "SelectGlobalSAddr", [], [], -10>;
+ def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
+ def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>;
}
@@ -182,7 +183,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
bits<7> saddr;
bits<8> vdst;
- bits<6> cpol;
+ bits<12> cpol;
bits<8> vdata; // vsrc
bits<8> vaddr;
bits<24> offset;
@@ -192,6 +193,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{31-26} = 0x3b;
let Inst{39-32} = !if(ps.has_vdst, vdst, ?);
let Inst{49} = ps.sve;
+ let Inst{7} = cpol{5}; // nv
let Inst{54-53} = cpol{2-1}; // th{2-1}
let Inst{52} = !if(ps.IsAtomicRet, 1, cpol{0}); // th{0}
let Inst{51-50} = cpol{4-3}; // scope
@@ -1252,13 +1254,13 @@ class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
>;
class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)),
- (inst $saddr, $voffset, $offset, (i32 0), $in)
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
+ (inst $saddr, $voffset, $offset, $cpol, $in)
>;
class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
- (inst $saddr, $voffset, $offset, (i32 0))
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (inst $saddr, $voffset, $offset, $cpol)
>;
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1272,26 +1274,26 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
>;
class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
- (inst $saddr, $voffset, $offset, 0)
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (inst $saddr, $voffset, $offset, $cpol)
>;
class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)),
- (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
+ (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)),
+ (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset, $cpol)
>;
-class GlobalAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
- ValueType vt, ValueType data_vt = vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), data_vt:$data)),
- (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset)
+class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPattern pat,
+ ValueType vt, ValueType data_vt = vt> : GCNPat <
+ (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)),
+ (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol)
>;
class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$data),
- (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
+ (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$data),
+ (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset, $cpol)
>;
class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1320,6 +1322,12 @@ multiclass FlatAtomicNoRtnPatBase <string inst, string node, ValueType vt,
let AddedComplexity = 1 in
def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+
+ def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node),
+ GlobalSAddr, vt, data_vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
}
multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix,
@@ -1338,6 +1346,11 @@ multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt,
def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+
+ def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt> {
+ let AddedComplexity = 8;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
}
multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix,
@@ -1507,7 +1520,8 @@ multiclass GlobalFLATAtomicPatsNoRtnBase<string inst, string node, ValueType vt,
def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<SDPatternOperator>(node), vt, data_vt>;
let AddedComplexity = 13 in
- def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node), vt, data_vt>;
+ def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node),
+ GlobalSAddr, vt, data_vt>;
}
multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt,
@@ -1518,7 +1532,7 @@ multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt,
def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>;
let AddedComplexity = 12 in
- def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>;
+ def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt>;
}
multiclass GlobalFLATAtomicPatsNoRtn<string inst, string node, ValueType vt,
@@ -1797,12 +1811,13 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
-let SubtargetPredicate = isGFX12Plus in {
- defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >;
+} // End OtherPredicates = [HasFlatAddressSpace]
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-}
+let OtherPredicates = [isGFX12Plus] in
+defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
+
+let OtherPredicates = [isGFX12Plus, HasAtomicCSubNoRtnInsts] in
+defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
let OtherPredicates = [HasD16LoadStore] in {
defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
@@ -1826,8 +1841,6 @@ defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
}
-} // End OtherPredicates = [HasFlatAddressSpace]
-
let OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i32>;
@@ -2928,6 +2941,7 @@ multiclass VFLAT_Real_gfx12 <bits<8> op, string name = get_FLAT_ps<NAME>.Mnemoni
let DecoderNamespace = "GFX12";
let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch};
+ let Inst{48} = cpol{CPolBit.SCAL}; // scale offset
}
}
@@ -3157,6 +3171,7 @@ multiclass VFLAT_Real_gfx1250<bits<8> op,
let DecoderNamespace = "GFX1250";
let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch};
+ let Inst{48} = cpol{CPolBit.SCAL}; // scale offset
}
}
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index bbed828..94886b0 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -520,8 +520,8 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
const MachineInstr *MI, IsExpiredFn IsExpired) {
DenseSet<const MachineBasicBlock *> Visited;
return getWaitStatesSince(IsHazard, MI->getParent(),
- std::next(MI->getReverseIterator()),
- 0, IsExpired, Visited);
+ std::next(MI->getReverseIterator()), 0, IsExpired,
+ Visited, SIInstrInfo::getNumWaitStates);
}
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
@@ -1190,7 +1190,8 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVALUPartialForwardingHazard(MI);
fixVALUTransUseHazard(MI);
fixVALUTransCoexecutionHazards(MI);
- fixWMMAHazards(MI);
+ fixWMMAHazards(MI); // fall-through if co-execution is enabled.
+ fixWMMACoexecutionHazards(MI);
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
fixRequiredExportPriority(MI);
@@ -1909,6 +1910,182 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
return true;
}
+static bool isCoexecutableVALUInst(const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isTRANS(MI) &&
+ !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
+}
+
+static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
+ const SIInstrInfo *TII, unsigned Latency,
+ unsigned Category) {
+ assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
+ "Handle me if the xdl wmma instruction latency changes");
+
+ switch (Category) {
+ case 0: // Dense WMMA Instructions:
+ // WMMA_*F16, WMMA_*BF16
+ // WMMA_*FP8FP8
+ // WMMA_*FP8BF8
+ // WMMA_*BF8FP8
+ // WMMA_*BF8BF8
+ // WMMA_*F8F6F4 if SRCA & SRCB != F8
+ return Latency == 8 && SIInstrInfo::isWMMA(MI);
+
+ case 1: // Dense WMMA Instructions:
+ // WMMA_IU8
+ // WMMA_IU4
+ // WMMA_*F8F6F4 if SRCA OR SRCB == F8
+ return Latency == 16 && SIInstrInfo::isWMMA(MI);
+
+ case 2: // Dense SWMMAC Instructions
+ // SWMMAC_*F16, SWMMAC_*BF16,
+ // SWMMAC_*FP8FP8
+ // SWMMAC_*BF8FP8
+ // SWMMAC_*FP8BF8
+ // SWMMAC_*BF8BF8
+ return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
+
+ case 3: // Sparse WMMA Instructions:
+ // SWMMAC_IU8
+ // SWMMAC_IU4
+ return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
+ default:
+ break;
+ } // end switch.
+
+ return false;
+}
+
+bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
+ if (!AMDGPU::isGFX1250(ST))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ // WaitStates here is the number of V_NOPs or unrelated VALU instructions that
+ // must be in between the first WMMA and the second instruction to cover the
+ // hazard (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the
+ // second is a VALU). Refer to SPG 4.6.12.1 "Requirements for WMMA data
+ // hazards" for the numbers, which depend on the category of the first WMMA.
+ const int WMMAWaitStates[] = {5, 9, 3, 5};
+ const int VALUWaitStates[] = {4, 8, 2, 4};
+ unsigned Category = 0;
+
+ auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+ if (!TII->isXDLWMMA(I))
+ return false;
+
+ unsigned Latency = TSchedModel.computeInstrLatency(&I);
+ if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
+ return false;
+
+ Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+ Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
+ Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
+
+ // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
+ if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
+ return true;
+
+ if (SIInstrInfo::isSWMMAC(*MI)) {
+ Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+ if (TRI->regsOverlap(D0, Idx1))
+ return true;
+ }
+
+ return false;
+ };
+
+ auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+ if (!TII->isXDLWMMA(I))
+ return false;
+
+ unsigned Latency = TSchedModel.computeInstrLatency(&I);
+ if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
+ return false;
+
+ // WMMA writes, VALU reads.
+ Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+ for (const MachineOperand &ValuUse : MI->explicit_uses()) {
+ if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
+ return true;
+ }
+
+ auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
+ if (!ValuDst || !ValuDst->isReg())
+ return false;
+ Register D1 = ValuDst->getReg();
+
+ // WMMA writes, VALU writes.
+ if (TRI->regsOverlap(D0, D1))
+ return true;
+
+ // WMMA reads, VALU writes.
+ Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
+ Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
+ if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
+ return true;
+
+ if (SIInstrInfo::isSWMMAC(I)) {
+ Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
+ if (TRI->regsOverlap(D1, Idx0))
+ return true;
+ }
+
+ return false;
+ };
+
+ int Limit = 0;
+ auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) {
+ return WaitStates >= Limit;
+ };
+
+ auto GetWaitStatesFn = [](const MachineInstr &I) {
+ return SIInstrInfo::isVALU(I) ? 1 : 0;
+ };
+
+ int WaitStatesNeeded = -1;
+ if (TII->isXDLWMMA(*MI)) {
+ for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
+ Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
+ DenseSet<const MachineBasicBlock *> Visited;
+ // '::getWaitStatesSince' returns the number of VALUs in between if a hazard
+ // exists, and INT_MAX if there is no hazard. As a result, a negative
+ // WaitStatesNeeded here means no hazard, and we will continue to search
+ // for other categories.
+ WaitStatesNeeded =
+ Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), 0,
+ IsExpiredFn, Visited, GetWaitStatesFn);
+ }
+ } else { // Must be a co-executable VALU.
+ for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
+ Limit = VALUWaitStates[Category]; // for IsExpiredFn.
+ DenseSet<const MachineBasicBlock *> Visited;
+ // '::getWaitStatesSince' returns the number of VALUs in between if a hazard
+ // exists, and INT_MAX if there is no hazard. As a result, a negative
+ // WaitStatesNeeded here means no hazard, and we will continue to search
+ // for other categories.
+ WaitStatesNeeded =
+ Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), 0,
+ IsExpiredFn, Visited, GetWaitStatesFn);
+ }
+ }
+
+ // WaitStatesNeeded is now the number of V_NOPs we need to insert; a negative
+ // value means none are needed.
+ for (int i = 0; i < WaitStatesNeeded; i++)
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_NOP_e32));
+
+ return true;
+}
+
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
if (!ST.hasShift64HighRegBug())
return false;
@@ -3206,7 +3383,7 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
// Check entry priority at each export (as there will only be a few).
// Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
bool Changed = false;
- if (CC != CallingConv::AMDGPU_Gfx)
+ if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
auto NextMI = std::next(It);
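
A small worked sketch (not from the patch) of the arithmetic at the end of fixWMMACoexecutionHazards: the required distance for the matched category comes from WMMAWaitStates or VALUWaitStates, the observed distance is the number of co-executable VALU instructions already in between (each counted as one by GetWaitStatesFn), and the difference is the number of V_NOPs to insert.

#include <algorithm>
#include <cassert>

// Required wait states per hazard category, as in fixWMMACoexecutionHazards.
static const int WMMAWaitStates[] = {5, 9, 3, 5}; // second instruction is WMMA
static const int VALUWaitStates[] = {4, 8, 2, 4}; // second instruction is VALU

// V_NOPs to insert for a given category and number of intervening VALUs.
// A non-positive result means the existing gap already covers the hazard.
static int nopsNeeded(int Category, int InterveningVALUs, bool SecondIsWMMA) {
  const int *Table = SecondIsWMMA ? WMMAWaitStates : VALUWaitStates;
  return Table[Category] - InterveningVALUs;
}

int main() {
  // Category 1 (16-cycle XDL WMMA) followed by a dependent VALU with two
  // unrelated VALUs in between: 8 - 2 = 6 V_NOPs are inserted.
  assert(nopsNeeded(1, 2, /*SecondIsWMMA=*/false) == 6);
  // Category 2 (8-cycle SWMMAC) followed by another WMMA four VALUs later:
  // 3 - 4 < 0, so nothing is inserted.
  assert(std::max(nopsNeeded(2, 4, /*SecondIsWMMA=*/true), 0) == 0);
  return 0;
}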
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index ef6ddd8..f796eeae 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -106,6 +106,7 @@ private:
bool fixVALUTransUseHazard(MachineInstr *MI);
bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
bool fixWMMAHazards(MachineInstr *MI);
+ bool fixWMMACoexecutionHazards(MachineInstr *MI);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 7b8f0f4..9a2bab1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -324,7 +324,7 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
}
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// Track register pressure so the scheduler can try to decrease
// pressure once register usage is above the threshold defined by
// SIRegisterInfo::getRegPressureSetLimit()
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 268162b..407d79a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1022,7 +1022,7 @@ public:
}
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
void mirFileLoaded(MachineFunction &MF) const override;
@@ -1162,6 +1162,9 @@ public:
bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
+ // Scalar and global loads support the scale_offset bit.
+ bool hasScaleOffset() const { return GFX1250Insts; }
+
bool hasFlatGVSMode() const { return FlatGVSMode; }
bool enableSIScheduler() const {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index ec9248b..11b072e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -157,9 +157,15 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
const int64_t TH = Imm & CPol::TH;
const int64_t Scope = Imm & CPol::SCOPE;
+ if (Imm & CPol::SCAL)
+ O << " scale_offset";
+
printTH(MI, TH, Scope, O);
printScope(Scope, O);
+ if (Imm & CPol::NV)
+ O << " nv";
+
return;
}
@@ -1342,6 +1348,48 @@ void AMDGPUInstPrinter::printIndexKey32bit(const MCInst *MI, unsigned OpNo,
O << " index_key:" << Imm;
}
+void AMDGPUInstPrinter::printMatrixFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O, char AorB) {
+ auto Imm = MI->getOperand(OpNo).getImm() & 0x7;
+ if (Imm == 0)
+ return;
+
+ O << " matrix_" << AorB << "_fmt:";
+ switch (Imm) {
+ default:
+ O << Imm;
+ break;
+ case WMMA::MatrixFMT::MATRIX_FMT_FP8:
+ O << "MATRIX_FMT_FP8";
+ break;
+ case WMMA::MatrixFMT::MATRIX_FMT_BF8:
+ O << "MATRIX_FMT_BF8";
+ break;
+ case WMMA::MatrixFMT::MATRIX_FMT_FP6:
+ O << "MATRIX_FMT_FP6";
+ break;
+ case WMMA::MatrixFMT::MATRIX_FMT_BF6:
+ O << "MATRIX_FMT_BF6";
+ break;
+ case WMMA::MatrixFMT::MATRIX_FMT_FP4:
+ O << "MATRIX_FMT_FP4";
+ break;
+ }
+}
+
+void AMDGPUInstPrinter::printMatrixAFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printMatrixFMT(MI, OpNo, STI, O, 'a');
+}
+
+void AMDGPUInstPrinter::printMatrixBFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printMatrixFMT(MI, OpNo, STI, O, 'b');
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index e3299a6..e0b7aa5 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -134,6 +134,12 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printIndexKey32bit(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMatrixFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O, char AorB);
+ void printMatrixAFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMatrixBFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpAttr(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index f48739f..c49ad79 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -384,6 +384,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
if (((Desc.TSFlags & SIInstrFlags::VOP3P) ||
Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi ||
Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) &&
+ // Matrix B format operand reuses op_sel_hi.
+ !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_fmt) &&
// Matrix B reuse operand reuses op_sel_hi.
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) {
Encoding |= getImplicitOpSelHiEncoding(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index a864997..3902d4c 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -398,8 +398,12 @@ enum CPol {
SCOPE_DEV = 2 << 3,
SCOPE_SYS = 3 << 3,
+ NV = 1 << 5, // Non-volatile bit
+
SWZ = 1 << 6, // Swizzle bit
+ SCAL = 1 << 11, // Scale offset bit
+
ALL = TH | SCOPE,
// Helper bits
@@ -1003,6 +1007,16 @@ enum Target : unsigned {
} // namespace Exp
+namespace WMMA {
+enum MatrixFMT : unsigned {
+ MATRIX_FMT_FP8 = 0,
+ MATRIX_FMT_BF8 = 1,
+ MATRIX_FMT_FP6 = 2,
+ MATRIX_FMT_BF6 = 3,
+ MATRIX_FMT_FP4 = 4
+};
+} // namespace WMMA
+
namespace VOP3PEncoding {
enum OpSel : uint64_t {
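
For orientation, a minimal sketch (not part of the patch) of how the new NV and SCAL cache-policy bits compose with the existing flags as plain bits on the cpol immediate, using the values defined above.

#include <cassert>
#include <cstdint>

// Cache-policy bits touched by this change, with the SIDefines.h values.
enum CPolBits : uint64_t {
  NV = 1ull << 5,    // non-volatile
  SWZ = 1ull << 6,   // swizzle (pre-existing)
  SCAL = 1ull << 11, // scale offset
};

int main() {
  uint64_t CPol = NV | SCAL; // a cpol immediate carrying both new modifiers
  assert(CPol & NV);         // printed as " nv" by printCPol
  assert(CPol & SCAL);       // printed as " scale_offset" by printCPol
  assert(!(CPol & SWZ));
  return 0;
}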
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index e172c0b..e5d1eaa 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1209,18 +1209,24 @@ void SIFoldOperandsImpl::foldOperand(
return;
}
- // A frame index will resolve to a positive constant, so it should always be
- // safe to fold the addressing mode, even pre-GFX9.
- UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
-
const unsigned Opc = UseMI->getOpcode();
if (TII->isFLATScratch(*UseMI) &&
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
+ unsigned CPol =
+ TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
+ if ((CPol & AMDGPU::CPol::SCAL) &&
+ !AMDGPU::supportsScaleOffset(*TII, NewOpc))
+ return;
+
UseMI->setDesc(TII->get(NewOpc));
}
+ // A frame index will resolve to a positive constant, so it should always be
+ // safe to fold the addressing mode, even pre-GFX9.
+ UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
+
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6a38679..11552b3 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -946,8 +946,18 @@ static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
- ScratchExecCopy = findScratchNonCalleeSaveRegister(
- MRI, LiveUnits, *TRI.getWaveMaskRegClass());
+ if (FuncInfo->isWholeWaveFunction()) {
+ // Whole wave functions already have a copy of the original EXEC mask that
+ // we can use.
+ assert(IsProlog && "Epilog should look at return, not setup");
+ ScratchExecCopy =
+ TII->getWholeWaveFunctionSetup(MF)->getOperand(0).getReg();
+ assert(ScratchExecCopy && "Couldn't find copy of EXEC");
+ } else {
+ ScratchExecCopy = findScratchNonCalleeSaveRegister(
+ MRI, LiveUnits, *TRI.getWaveMaskRegClass());
+ }
+
if (!ScratchExecCopy)
report_fatal_error("failed to find free scratch register");
@@ -996,10 +1006,15 @@ void SIFrameLowering::emitCSRSpillStores(
};
StoreWWMRegisters(WWMScratchRegs);
+
+ auto EnableAllLanes = [&]() {
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
+ };
+
if (!WWMCalleeSavedRegs.empty()) {
if (ScratchExecCopy) {
- unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
+ EnableAllLanes();
} else {
ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ true,
@@ -1008,7 +1023,18 @@ void SIFrameLowering::emitCSRSpillStores(
}
StoreWWMRegisters(WWMCalleeSavedRegs);
- if (ScratchExecCopy) {
+ if (FuncInfo->isWholeWaveFunction()) {
+ // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove
+ // it now. If we have already saved some WWM CSR registers, then the EXEC is
+ // already -1 and we don't need to do anything else. Otherwise, set EXEC to
+ // -1 here.
+ if (!ScratchExecCopy)
+ buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
+ /*EnableInactiveLanes*/ true);
+ else if (WWMCalleeSavedRegs.empty())
+ EnableAllLanes();
+ TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
+ } else if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
@@ -1083,11 +1109,6 @@ void SIFrameLowering::emitCSRSpillRestores(
Register ScratchExecCopy;
SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
- if (!WWMScratchRegs.empty())
- ScratchExecCopy =
- buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
- /*IsProlog*/ false, /*EnableInactiveLanes*/ true);
-
auto RestoreWWMRegisters =
[&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
for (const auto &Reg : WWMRegs) {
@@ -1098,6 +1119,36 @@ void SIFrameLowering::emitCSRSpillRestores(
}
};
+ if (FuncInfo->isWholeWaveFunction()) {
+ // For whole wave functions, the EXEC is already -1 at this point.
+ // Therefore, we can restore the CSR WWM registers right away.
+ RestoreWWMRegisters(WWMCalleeSavedRegs);
+
+ // The original EXEC is the first operand of the return instruction.
+ const MachineInstr &Return = MBB.instr_back();
+ assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN &&
+ "Unexpected return inst");
+ Register OrigExec = Return.getOperand(0).getReg();
+
+ if (!WWMScratchRegs.empty()) {
+ unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec())
+ .addReg(OrigExec)
+ .addImm(-1);
+ RestoreWWMRegisters(WWMScratchRegs);
+ }
+
+ // Restore original EXEC.
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
+ return;
+ }
+
+ if (!WWMScratchRegs.empty()) {
+ ScratchExecCopy =
+ buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
+ /*IsProlog=*/false, /*EnableInactiveLanes=*/true);
+ }
RestoreWWMRegisters(WWMScratchRegs);
if (!WWMCalleeSavedRegs.empty()) {
if (ScratchExecCopy) {
@@ -1634,6 +1685,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
NeedExecCopyReservedReg = true;
else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+ MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
(MFI->isChainFunction() &&
TII->isChainCallOpcode(MI.getOpcode()))) {
// We expect all return to be the same size.
@@ -1662,6 +1714,21 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MFI->isEntryFunction())
return;
+ if (MFI->isWholeWaveFunction()) {
+ // In practice, all the VGPRs are WWM registers, and we will need to save at
+ // least their inactive lanes. Add them to WWMReservedRegs.
+ assert(!NeedExecCopyReservedReg &&
+ "Whole wave functions can use the reg mapped for their i1 argument");
+
+ // FIXME: Be more efficient!
+ for (MCRegister Reg : AMDGPU::VGPR_32RegClass)
+ if (MF.getRegInfo().isPhysRegModified(Reg)) {
+ MFI->reserveWWMRegister(Reg);
+ MF.begin()->addLiveIn(Reg);
+ }
+ MF.begin()->sortUniqueLiveIns();
+ }
+
// Remove any VGPRs used in the return value because these do not need to be saved.
// This prevents CSR restore from clobbering return VGPRs.
if (ReturnMI) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0c76ff2..bc0fd8d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -618,6 +618,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::FSIN, ISD::FROUND},
MVT::f16, Custom);
+ // BF16 - VOP1 Actions.
+ if (Subtarget->hasBF16TransInsts())
+ setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
+
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
@@ -2260,7 +2264,8 @@ SDValue SITargetLowering::getPreloadedValue(
const ArgDescriptor WorkGroupIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
if (Subtarget->hasArchitectedSGPRs() &&
- (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
+ (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx ||
+ CC == CallingConv::AMDGPU_Gfx_WholeWave)) {
switch (PVID) {
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
Reg = &WorkGroupIDX;
@@ -2942,12 +2947,15 @@ SDValue SITargetLowering::LowerFormalArguments(
if (!Subtarget->enableFlatScratch())
assert(!UserSGPRInfo.hasFlatScratchInit());
if ((CallConv != CallingConv::AMDGPU_CS &&
- CallConv != CallingConv::AMDGPU_Gfx) ||
+ CallConv != CallingConv::AMDGPU_Gfx &&
+ CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
!Subtarget->hasArchitectedSGPRs())
assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
!Info->hasWorkGroupIDZ());
}
+ bool IsWholeWaveFunc = Info->isWholeWaveFunction();
+
if (CallConv == CallingConv::AMDGPU_PS) {
processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
@@ -2988,7 +2996,8 @@ SDValue SITargetLowering::LowerFormalArguments(
} else if (IsKernel) {
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
- Splits.append(Ins.begin(), Ins.end());
+ Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
+ Ins.end());
}
if (IsKernel)
@@ -3019,6 +3028,13 @@ SDValue SITargetLowering::LowerFormalArguments(
SmallVector<SDValue, 16> Chains;
+ if (IsWholeWaveFunc) {
+ SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
+ {MVT::i1, MVT::Other}, Chain);
+ InVals.push_back(Setup.getValue(0));
+ Chains.push_back(Setup.getValue(1));
+ }
+
// FIXME: This is the minimum kernel argument alignment. We should improve
// this to the maximum alignment of the arguments.
//
@@ -3026,7 +3042,8 @@ SDValue SITargetLowering::LowerFormalArguments(
// kern arg offset.
const Align KernelArgBaseAlign = Align(16);
- for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
+ ++i) {
const ISD::InputArg &Arg = Ins[i];
if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
InVals.push_back(DAG.getPOISON(Arg.VT));
@@ -3374,7 +3391,9 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
unsigned Opc = AMDGPUISD::ENDPGM;
if (!IsWaveEnd)
- Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
+ Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
+ : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
+ : AMDGPUISD::RET_GLUE;
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
@@ -3876,7 +3895,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
- if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
+ if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
+ CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
// With a fixed ABI, allocate fixed registers before user arguments.
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
@@ -5890,6 +5910,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent();
return SplitBB;
}
+ case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
+ assert(MFI->isWholeWaveFunction());
+
+ // During ISel, it's difficult to propagate the original EXEC mask to use as
+ // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
+ MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
+ assert(Setup && "Couldn't find SI_WHOLE_WAVE_FUNC_SETUP");
+ Register OriginalExec = Setup->getOperand(0).getReg();
+ MF->getRegInfo().clearKillFlags(OriginalExec);
+ MI.getOperand(0).setReg(OriginalExec);
+ return BB;
+ }
default:
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
if (!MI.mayStore())
@@ -11172,7 +11204,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
// Without !fpmath accuracy information, we can't do more because we don't
// know exactly whether rcp is accurate enough to meet !fpmath requirement.
// f16 is always accurate enough
- if (!AllowInaccurateRcp && VT != MVT::f16)
+ if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
return SDValue();
if (CLHS->isExactlyValue(1.0)) {
@@ -11199,9 +11231,10 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
}
}
- // For f16 require afn or arcp.
+ // For f16 and bf16 require afn or arcp.
// For f32 require afn.
- if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
+ if (!AllowInaccurateRcp &&
+ ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
return SDValue();
// Turn into multiply by the reciprocal.
@@ -11592,7 +11625,7 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::f64)
return LowerFDIV64(Op, DAG);
- if (VT == MVT::f16)
+ if (VT == MVT::f16 || VT == MVT::bf16)
return LowerFDIV16(Op, DAG);
llvm_unreachable("Unexpected type for fdiv");
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 2af0a57..9faf497 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1812,6 +1812,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
MI.getOpcode() == AMDGPU::SI_RETURN ||
+ MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index a368bc5..89d9b0d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -317,6 +317,8 @@ def CPolBit {
int SLC = 1;
int DLC = 2;
int SCC = 4;
+ int NV = 5;
+ int SCAL = 11;
}
class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c8935f0..571f3ef 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2472,6 +2472,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
+ case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
case AMDGPU::SI_RETURN: {
const MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -5481,6 +5482,19 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
+ if (CPol->getImm() & AMDGPU::CPol::SCAL) {
+ if (!ST.hasScaleOffset()) {
+ ErrInfo = "Subtarget does not support offset scaling";
+ return false;
+ }
+ if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
+ ErrInfo = "Instruction does not support offset scaling";
+ return false;
+ }
+ }
+ }
+
return true;
}
@@ -5757,6 +5771,19 @@ void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
}
+MachineInstr *
+SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
+ assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
+ "Not a whole wave func");
+ MachineBasicBlock &MBB = *MF.begin();
+ for (MachineInstr &MI : MBB)
+ if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
+ MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
+ return &MI;
+
+ llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
+}
+
static const TargetRegisterClass *
adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
const MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5e92921..800ea9a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1215,6 +1215,8 @@ public:
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
Register Reg, SlotIndexes *Indexes = nullptr) const;
+ MachineInstr *getWholeWaveFunctionSetup(MachineFunction &MF) const;
+
/// Return the correct register class for \p OpNo. For target-specific
/// instructions, this will return the register class that has been defined
/// in tablegen. For generic instructions, like REG_SEQUENCE it will return
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 9e1951e..bd4995b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1307,6 +1307,9 @@ let PrintMethod = "printBitOp3" in
def BitOp3 : NamedIntOperand<"bitop3">;
def bitop3_0 : DefaultOperand<BitOp3, 0>;
+def MatrixAFMT : CustomOperand<i32, 1, "MatrixAFMT">;
+def MatrixBFMT : CustomOperand<i32, 1, "MatrixBFMT">;
+
def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">;
def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">;
@@ -1882,6 +1885,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
!eq(VT, v4bf16) : AVSrc_64,
!eq(VT.Size, 1024) : VRegSrc_1024,
!eq(VT.Size, 512) : VRegSrc_512,
+ !eq(VT.Size, 384) : VRegSrc_384,
!eq(VT.Size, 256) : VRegSrc_256,
!eq(VT.Size, 192) : VRegSrc_192,
!eq(VT.Size, 128) : VRegSrc_128,
@@ -1894,6 +1898,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
class getVOP3VRegSrcForVT<ValueType VT> {
RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
!eq(VT.Size, 512) : VRegSrc_512,
+ !eq(VT.Size, 384) : VRegSrc_384,
!eq(VT.Size, 256) : VRegSrc_256,
!eq(VT.Size, 192) : VRegSrc_192,
!eq(VT.Size, 128) : VRegSrc_128,
@@ -2666,6 +2671,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
HasOMod);
field bit HasNeg = HasModifiers;
field bit HasMatrixReuse = 0;
+ field bit HasMatrixFMT = 0;
field bit HasSrc0Mods = HasModifiers;
field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 991d9f8..d05be8f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -644,6 +644,32 @@ def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
let isConvergent = 1;
}
+// Sets EXEC to all lanes and returns the previous EXEC.
+def SI_WHOLE_WAVE_FUNC_SETUP : SPseudoInstSI <
+ (outs SReg_1:$dst), (ins), [(set i1:$dst, (AMDGPUwhole_wave_setup))]> {
+ let Defs = [EXEC];
+ let Uses = [EXEC];
+
+ let isConvergent = 1;
+}
+
+// Restores the previous EXEC and otherwise behaves entirely like a SI_RETURN.
+def SI_WHOLE_WAVE_FUNC_RETURN : SPseudoInstSI <
+ (outs), (ins SReg_1:$orig_exec)> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+ let SchedRW = [WriteBranch];
+
+ // We're going to use custom handling to set the $orig_exec to the correct value.
+ let usesCustomInserter = 1;
+}
+
+// Generate a SI_WHOLE_WAVE_FUNC_RETURN pseudo with a placeholder for its
+// argument. It will be filled in by the custom inserter.
+def : GCNPat<
+ (AMDGPUwhole_wave_return), (SI_WHOLE_WAVE_FUNC_RETURN (i1 (IMPLICIT_DEF)))>;
+
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
@@ -2473,6 +2499,7 @@ def : AMDGPUPat <
>;
let True16Predicate = NotHasTrue16BitInsts in {
+let SubtargetPredicate = isNotGFX9Plus in {
def : ROTRPattern <V_ALIGNBIT_B32_e64>;
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
@@ -2482,6 +2509,35 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
+} // isNotGFX9Plus
+
+let SubtargetPredicate = isGFX9GFX10 in {
+def : GCNPat <
+ (rotr i32:$src0, i32:$src1),
+ (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
+ /* src1_modifiers */ 0, $src0,
+ /* src2_modifiers */ 0,
+ $src1, /* clamp */ 0, /* op_sel */ 0)
+>;
+
+foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
+ (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
+def : GCNPat<pat,
+ (V_ALIGNBIT_B32_opsel_e64 0, /* src0_modifiers */
+ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+ 0, /* src1_modifiers */
+ (i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
+ 0, /* src2_modifiers */
+ $src1, /* clamp */ 0, /* op_sel */ 0)
+>;
+
+def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
+ (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
+ /* src1_modifiers */ 0, $src1,
+ /* src2_modifiers */ 0,
+ $src2, /* clamp */ 0, /* op_sel */ 0)
+>;
+} // isGFX9GFX10
} // end True16Predicate = NotHasTrue16BitInsts
let True16Predicate = UseRealTrue16Insts in {
@@ -3082,6 +3138,8 @@ def : GCNPat <
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;
+// This pattern for bswap is used for pre-GFX8. For GFX8+, bswap is mapped
+// to V_PERM_B32.
let True16Predicate = NotHasTrue16BitInsts in
def : GCNPat <
(i32 (bswap i32:$a)),
@@ -3559,15 +3617,20 @@ def : GCNPat <
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
-let True16Predicate = NotHasTrue16BitInsts in
-def : GCNPat <
+let True16Predicate = NotHasTrue16BitInsts in {
+defvar BuildVectorToAlignBitPat =
(vecTy (DivergentBinFrag<build_vector>
(Ty !if(!eq(Ty, i16),
(Ty (trunc (srl VGPR_32:$a, (i32 16)))),
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))),
- (Ty VGPR_32:$b))),
- (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))
->;
+ (Ty VGPR_32:$b)));
+
+let SubtargetPredicate = isNotGFX9Plus in
+def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))>;
+
+let SubtargetPredicate = isGFX9GFX10 in
+def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_opsel_e64 0, VGPR_32:$b, 0, VGPR_32:$a, 0, (i32 16), 0, 0)>;
+} //True16Predicate = NotHasTrue16BitInsts
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
@@ -4300,6 +4363,20 @@ def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}
+def G_AMDGPU_WHOLE_WAVE_FUNC_SETUP : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$origExec);
+ let InOperandList = (ins);
+ let isConvergent = 1;
+}
+
+def G_AMDGPU_WHOLE_WAVE_FUNC_RETURN : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins type0:$origExec);
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+}
+
// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 5097ac03..b49c5a9 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -61,6 +61,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -1078,7 +1079,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
if (EltOffset0 + CI.Width != EltOffset1 &&
EltOffset1 + Paired.Width != EltOffset0)
return false;
- if (CI.CPol != Paired.CPol)
+ // Instructions with the scale_offset modifier cannot be combined unless we
+ // also generate code to scale the offset and reset that bit.
+ if (CI.CPol != Paired.CPol || (CI.CPol & AMDGPU::CPol::SCAL))
return false;
if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 75ce67c..f0be204 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -29,6 +29,16 @@ enum { MAX_LANES = 64 };
using namespace llvm;
+// TODO -- delete this flag once we have more robust mechanisms to allocate the
+// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
+// where it is better to produce the VGPR form (e.g. if there are VGPR users
+// of the MFMA result).
+cl::opt<bool> MFMAVGPRForm(
+ "amdgpu-mfma-vgpr-form", cl::Hidden,
+ cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
+ "unspecified, default to compiler heuristics"),
+ cl::init(false));
+
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
const SITargetLowering *TLI = STI->getTargetLowering();
return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
@@ -41,7 +51,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
- GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
+ GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
+ IsWholeWaveFunction(F.getCallingConv() ==
+ CallingConv::AMDGPU_Gfx_WholeWave) {
const GCNSubtarget &ST = *STI;
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
@@ -69,8 +81,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
- MayNeedAGPRs = ST.hasMAIInsts();
- if (ST.hasGFX90AInsts() &&
+ MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
+ if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
!mayUseAGPRs(F))
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
@@ -89,7 +101,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
ImplicitArgPtr = false;
} else if (!isEntryFunction()) {
- if (CC != CallingConv::AMDGPU_Gfx)
+ if (CC != CallingConv::AMDGPU_Gfx &&
+ CC != CallingConv::AMDGPU_Gfx_WholeWave)
ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
FrameOffsetReg = AMDGPU::SGPR33;
@@ -722,6 +735,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
+ IsWholeWaveFunction(MFI.isWholeWaveFunction()),
DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
@@ -768,6 +782,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
ReturnsVoid = YamlMFI.ReturnsVoid;
+ IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
if (YamlMFI.ScavengeFI) {
auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 274a60ad..08b0206 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -298,6 +298,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
StringValue LongBranchReservedReg;
bool HasInitWholeWave = false;
+ bool IsWholeWaveFunction = false;
unsigned DynamicVGPRBlockSize = 0;
unsigned ScratchReservedForDynamicVGPRs = 0;
@@ -356,6 +357,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
MFI.ScratchReservedForDynamicVGPRs, 0);
+ YamlIO.mapOptional("isWholeWaveFunction", MFI.IsWholeWaveFunction, false);
}
};
@@ -565,6 +567,8 @@ private:
// the serialization easier.
ReservedRegSet WWMReservedRegs;
+ bool IsWholeWaveFunction = false;
+
using PrologEpilogSGPRSpill =
std::pair<Register, PrologEpilogSGPRSaveRestoreInfo>;
// To track the SGPR spill method used for a CSR SGPR register during
@@ -670,6 +674,8 @@ public:
return WWMReservedRegs.contains(Reg);
}
+ bool isWholeWaveFunction() const { return IsWholeWaveFunction; }
+
ArrayRef<PrologEpilogSGPRSpill> getPrologEpilogSGPRSpills() const {
assert(is_sorted(PrologEpilogSGPRSpills, llvm::less_first()));
return PrologEpilogSGPRSpills;
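
A minimal sketch, not part of the patch, of how a later codegen pass could consult the new whole-wave flag. SIMachineFunctionInfo, MachineFunction and getInfo<> are existing LLVM types; only isWholeWaveFunction() is introduced above, and the wrapper itself is hypothetical.

#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"

// Hypothetical helper: true when the function uses the
// CallingConv::AMDGPU_Gfx_WholeWave ABI recorded by this patch.
static bool usesWholeWaveABI(const llvm::MachineFunction &MF) {
  const llvm::SIMachineFunctionInfo *MFI =
      MF.getInfo<llvm::SIMachineFunctionInfo>();
  return MFI->isWholeWaveFunction();
}
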
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 7093fe6..5940f45 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -85,7 +85,8 @@ static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo,
S_00B848_PRIV(ProgInfo.Priv) |
S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
S_00B848_WGP_MODE(ProgInfo.WgpMode) |
- S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);
+ S_00B848_MEM_ORDERED(ProgInfo.MemOrdered) |
+ S_00B848_FWD_PROGRESS(ProgInfo.FwdProgress);
if (ST.hasDX10ClampMode())
Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
@@ -93,10 +94,6 @@ static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo,
if (ST.hasIEEEMode())
Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
- // TODO: in the long run we will want to enable this unconditionally.
- if (ST.getTargetTriple().getOS() == Triple::OSType::AMDHSA)
- Reg |= S_00B848_FWD_PROGRESS(ProgInfo.FwdProgress);
-
if (ST.hasRrWGMode())
Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index fa2b8db..84cfa87 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -407,6 +407,7 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
: CSR_AMDGPU_SaveList;
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
: CSR_AMDGPU_SI_Gfx_SaveList;
case CallingConv::AMDGPU_CS_ChainPreserve:
@@ -433,6 +434,7 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
: CSR_AMDGPU_RegMask;
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
: CSR_AMDGPU_SI_Gfx_RegMask;
case CallingConv::AMDGPU_CS_Chain:
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index c194e5c..0039d2f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1207,6 +1207,7 @@ def VRegSrc_96 : SrcReg9<VReg_96>;
def VRegSrc_128: SrcReg9<VReg_128>;
def VRegSrc_192: SrcReg9<VReg_192>;
def VRegSrc_256: SrcReg9<VReg_256>;
+def VRegSrc_384: SrcReg9<VReg_384>;
def VRegSrc_512: SrcReg9<VReg_512>;
def VRegSrc_1024: SrcReg9<VReg_1024>;
def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>;
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index ef8faff..8eecb1c 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -464,6 +464,20 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
} // End SchedModel = GFX12SpeedModel
+// Check if any matrix inputs are interpreted as f8 in an f8f6f4
+// wmma instruction.
+def PredIsF8_WMMA_SCALE : SchedPredicate<[{
+ TII->getNamedOperand(*MI, AMDGPU::OpName::matrix_a_fmt)->getImm() <= AMDGPU::WMMA::MATRIX_FMT_BF8 ||
+ TII->getNamedOperand(*MI, AMDGPU::OpName::matrix_b_fmt)->getImm() <= AMDGPU::WMMA::MATRIX_FMT_BF8
+}]>;
+
+// If either matrix format is f8, the instruction takes 2x as many
+// cycles. TODO: This isn't reflected in MCA.
+def WriteWMMAScale_16X16X128_F8F6F4 : SchedWriteVariant<[
+ SchedVar<PredIsF8_WMMA_SCALE, [WriteXDL4PassWMMA]>,
+ SchedVar<NoSchedPred, [WriteXDL2PassWMMA]>
+]>;
+
multiclass GFX125xCommonWriteRes {
let ReleaseAtCycles = [8] in
@@ -495,6 +509,7 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>;
def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>;
+def : InstRW<[WriteWMMAScale_16X16X128_F8F6F4], (instregex "^V_WMMA_.*_16X16X128_F8F6F4.*_w32")>;
def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F32_16X16X4_F32_w32")>;
def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>;
} // End GFX125xCommonWriteRes
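
For intuition, a scalar restatement of PredIsF8_WMMA_SCALE follows. It assumes the WMMA::MATRIX_FMT_* enumerators are ordered FP8, BF8, FP6, BF6, FP4 (consistent with the format-to-register-count mapping added later in this patch); that ordering is an assumption here, and the local enum is purely illustrative.

// Assumed enumerator order; FP8 and BF8 are the two smallest values, so the
// "<= MATRIX_FMT_BF8" test in the SchedPredicate above means "is an f8 format".
enum MatrixFmt { MATRIX_FMT_FP8, MATRIX_FMT_BF8, MATRIX_FMT_FP6,
                 MATRIX_FMT_BF6, MATRIX_FMT_FP4 };

// True when either operand selects an f8 format, i.e. the case that takes the
// slower 4-pass write latency in the model above.
constexpr bool eitherOperandIsF8(MatrixFmt A, MatrixFmt B) {
  return A <= MATRIX_FMT_BF8 || B <= MATRIX_FMT_BF8;
}

static_assert(eitherOperandIsF8(MATRIX_FMT_BF8, MATRIX_FMT_FP4), "f8 x f4");
static_assert(!eitherOperandIsF8(MATRIX_FMT_FP6, MATRIX_FMT_FP4), "f6 x f4");
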
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 37dcc100..38cc51b 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -87,7 +87,7 @@ class SM_Real <SM_Pseudo ps, string opName = ps.Mnemonic>
bits<7> sdst;
bits<32> offset;
bits<8> soffset;
- bits<5> cpol;
+ bits<12> cpol;
}
class OffsetMode<bit hasOffset, bit hasSOffset, string variant,
@@ -864,8 +864,10 @@ def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
-def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
-def SMRDSgprImm : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">;
+let WantsRoot = true in {
+ def SMRDSgpr : ComplexPattern<iPTR, 3, "SelectSMRDSgpr", [], [], -3>;
+ def SMRDSgprImm : ComplexPattern<iPTR, 4, "SelectSMRDSgprImm", [], []>;
+}
def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
@@ -906,15 +908,15 @@ multiclass SMRD_Patterns <string Instr, ValueType vt, PatFrag frag,
let SubtargetPredicate = isNotGFX9Plus;
}
def : GCNPat <
- (frag (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> {
+ (frag (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, $cpol))> {
let SubtargetPredicate = isGFX9Plus;
}
// 4. SGPR+IMM offset
def : GCNPat <
- (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> {
+ (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, $cpol))> {
let SubtargetPredicate = isGFX9Plus;
}
@@ -989,15 +991,15 @@ multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, Val
// 2. SGPR offset
def : GCNPat <
- (node (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{
+ (node (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, $cpol))>{
let SubtargetPredicate = isGFX12Plus;
}
// 3. SGPR+IMM offset
def : GCNPat <
- (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{
+ (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, $cpol))>{
let SubtargetPredicate = isGFX12Plus;
}
@@ -1485,8 +1487,10 @@ class SMEM_Real_Load_gfx12<bits<6> op, string ps, string opName, OffsetMode offs
RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
+ let Inst{20} = cpol{CPolBit.NV}; // non-volatile
let Inst{22-21} = cpol{4-3}; // scope
let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported
+ let Inst{56} = cpol{CPolBit.SCAL}; // scale offset
}
multiclass SM_Real_Loads_gfx12<bits<6> op, string ps = NAME> {
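
To make the cache-policy placement in SMEM_Real_Load_gfx12 easier to follow, here is a plain C++ model of where the fields land in the instruction word (th in bits 24:23, scope in 22:21, non-volatile in 20, scale_offset in 56). It is a restatement of the let statements above, not code from the patch.

#include <cstdint>

// Illustrative only: bit positions mirror the Inst{...} assignments above.
constexpr uint64_t placeSmemCPol(uint64_t Inst, bool NonVolatile, unsigned Scope,
                                 unsigned TH, bool ScaleOffset) {
  Inst |= uint64_t(NonVolatile) << 20;    // Inst{20}    = cpol{NV}
  Inst |= uint64_t(Scope & 0x3) << 21;    // Inst{22-21} = cpol{4-3}
  Inst |= uint64_t(TH & 0x3) << 23;       // Inst{24-23} = cpol{1-0}
  Inst |= uint64_t(ScaleOffset) << 56;    // Inst{56}    = cpol{SCAL}
  return Inst;
}
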
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 7725881..b5b3cc9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -598,6 +598,29 @@ const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ,
return getMFMA_F8F6F4_InstWithNumRegs(SrcANumRegs, SrcBNumRegs, F8F8Opcode);
}
+uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt) {
+ switch (Fmt) {
+ case WMMA::MATRIX_FMT_FP8:
+ case WMMA::MATRIX_FMT_BF8:
+ return 16;
+ case WMMA::MATRIX_FMT_FP6:
+ case WMMA::MATRIX_FMT_BF6:
+ return 12;
+ case WMMA::MATRIX_FMT_FP4:
+ return 8;
+ }
+
+ llvm_unreachable("covered switch over wmma scale formats");
+}
+
+const MFMA_F8F6F4_Info *getWMMA_F8F6F4_WithFormatArgs(unsigned FmtA,
+ unsigned FmtB,
+ unsigned F8F8Opcode) {
+ uint8_t SrcANumRegs = wmmaScaleF8F6F4FormatToNumRegs(FmtA);
+ uint8_t SrcBNumRegs = wmmaScaleF8F6F4FormatToNumRegs(FmtB);
+ return getMFMA_F8F6F4_InstWithNumRegs(SrcANumRegs, SrcBNumRegs, F8F8Opcode);
+}
+
unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST) {
if (ST.hasFeature(AMDGPU::FeatureGFX1250Insts))
return SIEncodingFamily::GFX1250;
@@ -3205,6 +3228,25 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
: getGfx9BufferFormatInfo(Format);
}
+bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) {
+ uint64_t TSFlags = MII.get(Opcode).TSFlags;
+
+ if (TSFlags & SIInstrFlags::SMRD)
+ return !getSMEMIsBuffer(Opcode);
+ if (!(TSFlags & SIInstrFlags::FLAT))
+ return false;
+
+ // Only SV and SVS modes are supported.
+ if (TSFlags & SIInstrFlags::FlatScratch)
+ return hasNamedOperand(Opcode, OpName::vaddr);
+
+ // Only GVS mode is supported.
+ return hasNamedOperand(Opcode, OpName::vaddr) &&
+ hasNamedOperand(Opcode, OpName::saddr);
+}
+
bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) {
for (auto OpName : {OpName::vdst, OpName::src0, OpName::src1, OpName::src2}) {
int Idx = getNamedOperandIdx(OpDesc.getOpcode(), OpName);
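
A minimal sketch of a caller for the new supportsScaleOffset helper; the helper comes from this patch and the CPol::SCAL bit is the one used elsewhere in this diff, while the wrapper, its name and the include set are illustrative assumptions.

#include "SIDefines.h"              // AMDGPU::CPol::SCAL (target-internal)
#include "Utils/AMDGPUBaseInfo.h"   // AMDGPU::supportsScaleOffset
#include "llvm/MC/MCInstrInfo.h"

// Hypothetical check: a scale_offset request is only acceptable on opcodes the
// helper admits (SMEM non-buffer, scratch SV/SVS, or global GVS forms).
static bool scaleOffsetIsLegal(const llvm::MCInstrInfo &MII, unsigned Opcode,
                               unsigned CPolValue) {
  return !(CPolValue & llvm::AMDGPU::CPol::SCAL) ||
         llvm::AMDGPU::supportsScaleOffset(MII, Opcode);
}
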
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index c9d2c28..c09a9d6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -627,6 +627,14 @@ const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ,
unsigned BLGP,
unsigned F8F8Opcode);
+LLVM_READNONE
+uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt);
+
+LLVM_READONLY
+const MFMA_F8F6F4_Info *getWMMA_F8F6F4_WithFormatArgs(unsigned FmtA,
+ unsigned FmtB,
+ unsigned F8F8Opcode);
+
LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
@@ -1423,7 +1431,8 @@ constexpr bool isShader(CallingConv::ID CC) {
LLVM_READNONE
constexpr bool isGraphics(CallingConv::ID CC) {
- return isShader(CC) || CC == CallingConv::AMDGPU_Gfx;
+ return isShader(CC) || CC == CallingConv::AMDGPU_Gfx ||
+ CC == CallingConv::AMDGPU_Gfx_WholeWave;
}
LLVM_READNONE
@@ -1748,6 +1757,9 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID);
/// \returns true if the intrinsic is uniform
bool isIntrinsicAlwaysUniform(unsigned IntrID);
+/// \returns true if a memory instruction supports the scale_offset modifier.
+bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode);
+
/// \returns lds block size in terms of dwords. \p
/// This is used to calculate the lds size encoded for PAL metadata 3.0+ which
/// must be defined in terms of bytes.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index e464470..fd6253d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -44,6 +44,7 @@ static const char *getStageName(CallingConv::ID CC) {
case CallingConv::AMDGPU_LS:
return ".ls";
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
llvm_unreachable("Callable shader has no hardware stage");
default:
return ".cs";
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 2e7f25b..aee2f2c 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -224,6 +224,12 @@ defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32",
fshr, null_frag>;
defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
+
+// In gfx9 and 10, opsel is allowed for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32.
+// Hardware uses opsel[1:0] to byte-select src2. Other opsel bits are ignored.
+defm V_ALIGNBIT_B32_opsel : VOP3Inst <"v_alignbit_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>;
+defm V_ALIGNBYTE_B32_opsel : VOP3Inst <"v_alignbyte_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>;
+
let True16Predicate = UseRealTrue16Insts in
defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>>;
let True16Predicate = UseFakeTrue16Insts in
@@ -265,6 +271,16 @@ let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
} // End isReMaterializable = 1
+let SubtargetPredicate = isGFX9GFX10 in
+def : GCNPat <
+(i32 (int_amdgcn_alignbyte (i32 (VOP3OpSelMods i32:$src0, i32:$src0_modifiers)),
+ (i32 (VOP3OpSelMods i32:$src1, i32:$src1_modifiers)),
+ (i32 (VOP3OpSelMods i32:$src2, i32:$src2_modifiers)))),
+(V_ALIGNBYTE_B32_opsel_e64 i32:$src0_modifiers, VSrc_b32:$src0,
+ i32:$src1_modifiers, VSrc_b32:$src1,
+ i32:$src2_modifiers, VGPR_32:$src2)
+>;
+
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
(i32 (int_amdgcn_alignbyte (i32 (VOP3OpSelMods i32:$src0, i32:$src0_modifiers)),
@@ -1954,6 +1970,9 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
+defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14e, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">;
+defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14f, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">;
+
defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>;
let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
@@ -2104,8 +2123,8 @@ defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>;
defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>;
defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>;
defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>;
-defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>;
-defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>;
+defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7<0x14e>;
+defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7<0x14f>;
defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>;
defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>;
defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>;
@@ -2248,6 +2267,17 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0>
}
}
+// Instructions such as v_alignbyte_b32 allow op_sel in gfx9, but not in VI.
+// The following is created to support that.
+multiclass VOP3OpSel_Real_gfx9_with_name<bits<10> op, string opName, string AsmName> {
+ defvar psName = opName#"_e64";
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(psName), SIEncodingFamily.VI>, // note: encoding family is VI
+ VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(psName).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(psName);
+ let AsmString = AsmName # ps.AsmOperands;
+ }
+}
+
} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
@@ -2267,8 +2297,10 @@ defm V_BFI_B32 : VOP3_Real_vi <0x1ca>;
defm V_FMA_F32 : VOP3_Real_vi <0x1cb>;
defm V_FMA_F64 : VOP3_Real_vi <0x1cc>;
defm V_LERP_U8 : VOP3_Real_vi <0x1cd>;
+let SubtargetPredicate = isGFX8Only in {
defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>;
defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>;
+}
defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>;
defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>;
defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>;
@@ -2313,6 +2345,9 @@ defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16"
defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;
+defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1ce, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">;
+defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1cf, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">;
+
defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index e51e957..9feea36 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1318,13 +1318,15 @@ let WaveSizePredicate = isWave64 in {
class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0,
- bit _HasMatrixReuse = 0, bit _IsF4 = 0>
+ bit _HasMatrixFMT = 0, bit _HasMatrixReuse = 0,
+ bit _IsF4 = 0>
: VOP3P_Profile<VOPProfile<ArgTy>> {
bit IsIU = _IsIU;
bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B
bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32));
int IndexType = _IndexType;
+ let HasMatrixFMT = _HasMatrixFMT;
let HasMatrixReuse = _HasMatrixReuse;
bit HasIModOp = _Has_ImodOp;
@@ -1422,7 +1424,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit),
!eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit),
!eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit));
-
+ dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt),
+ (ins));
dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins));
dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins));
dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi),
@@ -1436,7 +1439,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
(ins VRegSrc_64:$src2),
(ins VRegSrc_32:$src2)),
IndexKey)),
- MatrixReuse, Clamp, Neg);
+ MatrixFMT, MatrixReuse, Clamp, Neg);
// asm
@@ -1444,13 +1447,14 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 8) : "$index_key_8bit",
!eq(IndexType, 16) : "$index_key_16bit",
!eq(IndexType, 32) : "$index_key_32bit");
+  string MatrixFMTAsm = !if(HasMatrixFMT, "$matrix_a_fmt$matrix_b_fmt", "");
string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", "");
string ClampAsm = !if(HasClamp, "$clamp", "");
string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi",
!and(NegLoAny, !not(NegHiAny)) : "$neg_lo",
!and(!not(NegLoAny), !not(NegHiAny)) : "");
- let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixReuseAsm#NegAsm#ClampAsm;
+  let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixFMTAsm#MatrixReuseAsm#NegAsm#ClampAsm;
// isel patterns
bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp));
@@ -1462,6 +1466,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
IsAB_BF16_IMod0 : (ins Src0VT:$src0),
IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
+ HasMatrixFMT : (ins timm:$matrix_a_fmt, Src0VT:$src0),
NoABMods : (ins Src0VT:$src0));
dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
@@ -1474,6 +1479,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
IsAB_BF16_IMod0 : (ins Src1VT:$src1),
IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
+ HasMatrixFMT : (ins timm:$matrix_b_fmt, Src1VT:$src1),
NoABMods : (ins Src1VT:$src1));
dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
@@ -1499,7 +1505,6 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
IsIUXF32 : (ins Src2VT:$src2),
IsSWMMAC : (ins));
dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins));
-
dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
!eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))),
!eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))),
@@ -1508,6 +1513,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit),
!eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit),
!eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit));
+ dag MatrixFMTOutPat = !if(HasMatrixFMT, (ins i32:$matrix_a_fmt, i32:$matrix_b_fmt), (ins));
dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2))));
dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2));
@@ -1515,7 +1521,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins));
dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat);
- dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixReuseOutModPat, ClampPat);
+ dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat);
dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat);
dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat);
@@ -1523,7 +1529,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
// wmma pattern where src2 is inline imm uses _threeaddr pseudo,
// can't use _twoaddr since it would violate src2 tied to vdst constraint.
dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat);
- dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixReuseOutModPat, ClampPat);
+ dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat);
}
def WMMAInstInfoTable : GenericTable {
@@ -1632,26 +1638,45 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1,
// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored
// for matrix A, index is i16; Matrix B uses all lanes
-def F64_F64X4_WMMA_w32 : VOP3PWMMA_Profile<[v8f64, v2f64, v2f64, v8f64], 0, 0, 0, 0, 1>;
-def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 1>;
-def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>;
-def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 1>;
-def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 1>;
-def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 1>;
-def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>;
-def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1>;
-def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1>;
-def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 1>;
-def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 1>;
-def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 1>;
-def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 1>;
-def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 1>;
-def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 1>;
-def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 1>;
-def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 1>;
-def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 1>;
-def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 1>;
-def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 1>;
+def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 1>;
+def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
+def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
+def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 1>;
+def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 1>;
+def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
+def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 1>;
+def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 1>;
+def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 1>;
+def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 1>;
+def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 1>;
+def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 1>;
+def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 1>;
+def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 1>;
+def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 1>;
+def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 1>;
+def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 1>;
+def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 1>;
+def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 1>;
+
+multiclass WMMA_F8F6F4_Profiles<bit HasMatrixReuse> {
+ def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+}
+
+defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0>;
+
+multiclass WMMAInst_SrcFormats_mc<string OpName, string Profile> {
+ foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
+ defm _#I#_w32 : WMMAInstGFX12<OpName # "_" # I # "_w32", !cast<VOP3PWMMA_Profile>(Profile # "_" # I # "_w32"), "_w32">;
+ }
+}
let WaveSizePredicate = isWave32 in {
let SubtargetPredicate = isGFX125xOnly in {
@@ -1697,6 +1722,8 @@ defm V_SWMMAC_I32_16X16X128_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x12
defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_f16", F32_F16X64_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">;
+defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">;
+
} // End is_wmma_xdl = 1.
} // End SubtargetPredicate = isGFX125xOnly
@@ -1854,6 +1881,10 @@ let SubtargetPredicate = isGFX125xOnly in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>;
+ foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
+ defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>;
+ }
+
def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x64_bf16, BF16_BF16X64_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
@@ -1912,17 +1943,22 @@ multiclass VOP3P_Real_Base<GFXGen Gen, bits<8> op, string backing_ps_name = NAME
class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
: VOP3Pe_gfx11_gfx12<op, P>{
+
// opsel
- let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0,
+ let Inst{11} = !cond(WMMAP.HasMatrixFMT : matrix_a_fmt{0},
+ !eq(WMMAP.IndexType, 0) : 0,
!eq(WMMAP.IndexType, 8) : index_key_8bit{0},
!eq(WMMAP.IndexType, 16) : index_key_16bit{0},
!eq(WMMAP.IndexType, 32) : index_key_32bit{0});
- let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0);
- let Inst{13} = !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0);
+ let Inst{12} = !if(WMMAP.HasMatrixFMT, matrix_a_fmt{1},
+ !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0));
+ let Inst{13} = !if (WMMAP.HasMatrixFMT, matrix_a_fmt{2},
+ !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0));
// opsel_hi
- let Inst{59} = 1;
- let Inst{60} = 1;
- let Inst{14} = !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1);
+ let Inst{59} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{0}, 1);
+ let Inst{60} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{1}, 1);
+ let Inst{14} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{2},
+ !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1));
// neg_lo
let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
@@ -1961,6 +1997,24 @@ multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
}
}
+multiclass VOP3P_Real_WMMA_F8F6F4_gfx1250<bits<8> op, VOP3PWMMA_Profile WMMAP> {
+ defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
+ defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
+ defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
+ let AsmString = asmName # PS.AsmOperands in
+ defm NAME : VOP3P_Real_WMMA_gfx1250<op, WMMAP>,
+ MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_twoaddr_gfx1250">;
+}
+
+multiclass VOP3P_Real_WMMA_gfx1250_SrcFormats<bits<8> op, string WMMAP> {
+ defm _f8_f8_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>;
+ foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
+ let isAsmParserOnly = true in { // Disable ambiguous disassembly.
+ defm _#I#_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>;
+ }
+ }
+}
+
defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
@@ -2035,6 +2089,8 @@ defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8B
defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>;
defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>;
+defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x033, "F32_16X16X128_F8F6F4">;
+
defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index a25ebdf..c21e2d3 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -453,6 +453,8 @@ class VOP3Pe_Base {
bits<2> index_key_8bit;
bits<1> index_key_16bit;
bits<1> index_key_32bit;
+ bits<3> matrix_a_fmt;
+ bits<3> matrix_b_fmt;
bits<1> matrix_a_reuse;
bits<1> matrix_b_reuse;
}
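
To make the opsel/opsel_hi reuse in VOP3PeWmma concrete, the following is a plain C++ restatement of where the three bits of each format operand are placed when HasMatrixFMT is set; it models the let statements above and is not code from the patch.

#include <cstdint>

// matrix_a_fmt{2:0} occupies the opsel bits {11, 12, 13}; matrix_b_fmt{2:0}
// occupies the opsel_hi bits {59, 60, 14}, as spelled out above.
constexpr uint64_t packWmmaFormats(uint64_t Inst, unsigned FmtA, unsigned FmtB) {
  Inst |= uint64_t(FmtA & 0x1) << 11;
  Inst |= uint64_t((FmtA >> 1) & 0x1) << 12;
  Inst |= uint64_t((FmtA >> 2) & 0x1) << 13;
  Inst |= uint64_t(FmtB & 0x1) << 59;
  Inst |= uint64_t((FmtB >> 1) & 0x1) << 60;
  Inst |= uint64_t((FmtB >> 2) & 0x1) << 14;
  return Inst;
}
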
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index eaba6fe..a7a9911 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -593,7 +593,7 @@ public:
getContext().reportError(Loc, "relocated expression must be 32-bit");
return;
}
- getOrCreateDataFragment();
+ getCurrentFragment();
}
emitDataMappingSymbol();
@@ -1207,7 +1207,7 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
}
void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
- MCFragment *Frag = getOrCreateDataFragment();
+ MCFragment *Frag = getCurrentFragment();
Frag->addFixup(MCFixup::create(Frag->getContents().size(), Expr, Kind));
}
@@ -1295,7 +1295,7 @@ void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext());
visitUsedExpr(*PersonalityRef);
- MCFragment *DF = getOrCreateDataFragment();
+ MCFragment *DF = getCurrentFragment();
DF->addFixup(
MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4));
}
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index db09738..128cc0b 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -514,19 +514,7 @@ bool AVRAsmBackend::forceRelocation(const MCFragment &F, const MCFixup &Fixup,
return false;
case AVR::fixup_7_pcrel:
- case AVR::fixup_13_pcrel: {
- uint64_t Offset = Target.getConstant();
- uint64_t Size = AVRAsmBackend::getFixupKindInfo(Fixup.getKind()).TargetSize;
-
- // If the jump is too large to encode it, fall back to a relocation.
- //
- // Note that trying to actually link that relocation *would* fail, but the
- // hopes are that the module we're currently compiling won't be actually
- // linked to the final binary.
- return !adjust::adjustRelativeBranch(Size, Fixup, Offset,
- getContext().getSubtargetInfo());
- }
-
+ case AVR::fixup_13_pcrel:
case AVR::fixup_call:
return true;
}
diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp
index 703a9e5..c8866bf 100644
--- a/llvm/lib/Target/DirectX/DXILPrepare.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp
@@ -24,7 +24,6 @@
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -240,11 +239,6 @@ public:
for (size_t Idx = 0, End = F.arg_size(); Idx < End; ++Idx)
F.removeParamAttrs(Idx, AttrMask);
- // Lifetime intrinsics in LLVM 3.7 do not have the memory FnAttr
- if (Intrinsic::ID IID = F.getIntrinsicID();
- IID == Intrinsic::lifetime_start || IID == Intrinsic::lifetime_end)
- F.removeFnAttr(Attribute::Memory);
-
for (auto &BB : F) {
IRBuilder<> Builder(&BB);
for (auto &I : make_early_inc_range(BB)) {
@@ -253,7 +247,7 @@ public:
        // Emitting NoOp bitcast instructions allows the ValueEnumerator to be
        // unmodified as it reserves instruction IDs during construction.
- if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ if (auto LI = dyn_cast<LoadInst>(&I)) {
if (Value *NoOpBitcast = maybeGenerateBitcast(
Builder, PointerTypes, I, LI->getPointerOperand(),
LI->getType())) {
@@ -263,7 +257,7 @@ public:
}
continue;
}
- if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (auto SI = dyn_cast<StoreInst>(&I)) {
if (Value *NoOpBitcast = maybeGenerateBitcast(
Builder, PointerTypes, I, SI->getPointerOperand(),
SI->getValueOperand()->getType())) {
@@ -274,7 +268,7 @@ public:
}
continue;
}
- if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (auto GEP = dyn_cast<GetElementPtrInst>(&I)) {
if (Value *NoOpBitcast = maybeGenerateBitcast(
Builder, PointerTypes, I, GEP->getPointerOperand(),
GEP->getSourceElementType()))
@@ -286,17 +280,6 @@ public:
CB->removeRetAttrs(AttrMask);
for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx)
CB->removeParamAttrs(Idx, AttrMask);
- // LLVM 3.7 Lifetime intrinics require an i8* pointer operand, so we
- // insert a bitcast here to ensure that is the case
- if (isa<LifetimeIntrinsic>(CB)) {
- Value *PtrOperand = CB->getArgOperand(1);
- Builder.SetInsertPoint(CB);
- PointerType *PtrTy = cast<PointerType>(PtrOperand->getType());
- Value *NoOpBitcast = Builder.Insert(
- CastInst::Create(Instruction::BitCast, PtrOperand,
- Builder.getPtrTy(PtrTy->getAddressSpace())));
- CB->setArgOperand(1, NoOpBitcast);
- }
continue;
}
}
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
index eb4adfe..bd3349d 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
@@ -152,7 +152,7 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF,
if (!CSF.Int64Ops)
CSF.Int64Ops = I.getType()->isIntegerTy(64);
- if (!CSF.Int64Ops && !isa<LifetimeIntrinsic>(&I)) {
+ if (!CSF.Int64Ops) {
for (const Value *Op : I.operands()) {
if (Op->getType()->isIntegerTy(64)) {
CSF.Int64Ops = true;
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 46d5d71..1d79c30 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -2545,25 +2545,6 @@ void DXILBitcodeWriter::writeInstruction(const Instruction &I, unsigned InstID,
Vals.clear();
}
-// HLSL Change
-namespace {
-struct ValueNameCreator {
- MallocAllocator Allocator;
- SmallVector<ValueName *, 2>
- ValueNames; // SmallVector N = 2 because we currently only expect this
- // to hold ValueNames for Lifetime intrinsics
- ~ValueNameCreator() {
- for (auto *VN : ValueNames)
- VN->Destroy(Allocator);
- }
- ValueName *create(StringRef Name, Value *V) {
- ValueName *VN = ValueName::create(Name, Allocator, V);
- ValueNames.push_back(VN);
- return VN;
- }
-};
-} // anonymous namespace
-
// Emit names for globals/functions etc.
void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable(
const ValueSymbolTable &VST) {
@@ -2578,24 +2559,9 @@ void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable(
// to ensure the binary is the same no matter what values ever existed.
SmallVector<const ValueName *, 16> SortedTable;
- // HLSL Change
- ValueNameCreator VNC;
for (auto &VI : VST) {
- ValueName *VN = VI.second->getValueName();
- // Clang mangles lifetime intrinsic names by appending '.p0' to the end,
- // making them invalid lifetime intrinsics in LLVM 3.7. We can't
- // demangle in dxil-prepare because it would result in invalid IR.
- // Therefore we have to do this in the bitcode writer while writing its
- // name to the symbol table.
- if (const Function *Fn = dyn_cast<Function>(VI.getValue());
- Fn && Fn->isIntrinsic()) {
- Intrinsic::ID IID = Fn->getIntrinsicID();
- if (IID == Intrinsic::lifetime_start || IID == Intrinsic::lifetime_end)
- VN = VNC.create(Intrinsic::getBaseName(IID), VI.second);
- }
- SortedTable.push_back(VN);
+ SortedTable.push_back(VI.second->getValueName());
}
-
// The keys are unique, so there shouldn't be stability issues.
llvm::sort(SortedTable, [](const ValueName *A, const ValueName *B) {
return A->first() < B->first();
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 2378664..e915a3c4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2514,8 +2514,9 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
assert(ResTy.isVector());
unsigned NumElts = ResTy.getVectorNumElements();
- SDValue Vector = DAG.getUNDEF(ResTy);
- for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Vector =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Node->getOperand(0));
+ for (unsigned i = 1; i < NumElts; ++i) {
Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector,
Node->getOperand(i),
DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
@@ -4560,6 +4561,80 @@ static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
llvm_unreachable("Unexpected node type for vXi1 sign extension");
}
+static SDValue
+performSETCC_BITCASTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const LoongArchSubtarget &Subtarget) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ if (Src.getOpcode() != ISD::SETCC || !Src.hasOneUse())
+ return SDValue();
+
+ bool UseLASX;
+ unsigned Opc = ISD::DELETED_NODE;
+ EVT CmpVT = Src.getOperand(0).getValueType();
+ EVT EltVT = CmpVT.getVectorElementType();
+
+ if (Subtarget.hasExtLSX() && CmpVT.getSizeInBits() == 128)
+ UseLASX = false;
+ else if (Subtarget.has32S() && Subtarget.hasExtLASX() &&
+ CmpVT.getSizeInBits() == 256)
+ UseLASX = true;
+ else
+ return SDValue();
+
+ SDValue SrcN1 = Src.getOperand(1);
+ switch (cast<CondCodeSDNode>(Src.getOperand(2))->get()) {
+ default:
+ break;
+ case ISD::SETEQ:
+ // x == 0 => not (vmsknez.b x)
+ if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
+ Opc = UseLASX ? LoongArchISD::XVMSKEQZ : LoongArchISD::VMSKEQZ;
+ break;
+ case ISD::SETGT:
+ // x > -1 => vmskgez.b x
+ if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) && EltVT == MVT::i8)
+ Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
+ break;
+ case ISD::SETGE:
+ // x >= 0 => vmskgez.b x
+ if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
+ Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
+ break;
+ case ISD::SETLT:
+ // x < 0 => vmskltz.{b,h,w,d} x
+ if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
+ EltVT == MVT::i64))
+ Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
+ break;
+ case ISD::SETLE:
+ // x <= -1 => vmskltz.{b,h,w,d} x
+ if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
+ EltVT == MVT::i64))
+ Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
+ break;
+ case ISD::SETNE:
+ // x != 0 => vmsknez.b x
+ if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
+ Opc = UseLASX ? LoongArchISD::XVMSKNEZ : LoongArchISD::VMSKNEZ;
+ break;
+ }
+
+ if (Opc == ISD::DELETED_NODE)
+ return SDValue();
+
+ SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src.getOperand(0));
+ EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
+ V = DAG.getZExtOrTrunc(V, DL, T);
+ return DAG.getBitcast(VT, V);
+}
+
static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const LoongArchSubtarget &Subtarget) {
@@ -4574,110 +4649,63 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
- unsigned Opc = ISD::DELETED_NODE;
// Combine SETCC and BITCAST into [X]VMSK{LT,GE,NE} when possible
+ SDValue Res = performSETCC_BITCASTCombine(N, DAG, DCI, Subtarget);
+ if (Res)
+ return Res;
+
+ // Generate vXi1 using [X]VMSKLTZ
+ MVT SExtVT;
+ unsigned Opc;
+ bool UseLASX = false;
+ bool PropagateSExt = false;
+
if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse()) {
- bool UseLASX;
EVT CmpVT = Src.getOperand(0).getValueType();
- EVT EltVT = CmpVT.getVectorElementType();
-
- if (Subtarget.hasExtLSX() && CmpVT.getSizeInBits() <= 128)
- UseLASX = false;
- else if (Subtarget.has32S() && Subtarget.hasExtLASX() &&
- CmpVT.getSizeInBits() <= 256)
- UseLASX = true;
- else
+ if (CmpVT.getSizeInBits() > 256)
return SDValue();
-
- SDValue SrcN1 = Src.getOperand(1);
- switch (cast<CondCodeSDNode>(Src.getOperand(2))->get()) {
- default:
- break;
- case ISD::SETEQ:
- // x == 0 => not (vmsknez.b x)
- if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
- Opc = UseLASX ? LoongArchISD::XVMSKEQZ : LoongArchISD::VMSKEQZ;
- break;
- case ISD::SETGT:
- // x > -1 => vmskgez.b x
- if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) && EltVT == MVT::i8)
- Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
- break;
- case ISD::SETGE:
- // x >= 0 => vmskgez.b x
- if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
- Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
- break;
- case ISD::SETLT:
- // x < 0 => vmskltz.{b,h,w,d} x
- if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) &&
- (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
- EltVT == MVT::i64))
- Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
- break;
- case ISD::SETLE:
- // x <= -1 => vmskltz.{b,h,w,d} x
- if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) &&
- (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
- EltVT == MVT::i64))
- Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
- break;
- case ISD::SETNE:
- // x != 0 => vmsknez.b x
- if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
- Opc = UseLASX ? LoongArchISD::XVMSKNEZ : LoongArchISD::VMSKNEZ;
- break;
- }
}
- // Generate vXi1 using [X]VMSKLTZ
- if (Opc == ISD::DELETED_NODE) {
- MVT SExtVT;
- bool UseLASX = false;
- bool PropagateSExt = false;
- switch (SrcVT.getSimpleVT().SimpleTy) {
- default:
- return SDValue();
- case MVT::v2i1:
- SExtVT = MVT::v2i64;
- break;
- case MVT::v4i1:
- SExtVT = MVT::v4i32;
- if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
- SExtVT = MVT::v4i64;
- UseLASX = true;
- PropagateSExt = true;
- }
- break;
- case MVT::v8i1:
- SExtVT = MVT::v8i16;
- if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
- SExtVT = MVT::v8i32;
- UseLASX = true;
- PropagateSExt = true;
- }
- break;
- case MVT::v16i1:
- SExtVT = MVT::v16i8;
- if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
- SExtVT = MVT::v16i16;
- UseLASX = true;
- PropagateSExt = true;
- }
- break;
- case MVT::v32i1:
- SExtVT = MVT::v32i8;
+ switch (SrcVT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::v2i1:
+ SExtVT = MVT::v2i64;
+ break;
+ case MVT::v4i1:
+ SExtVT = MVT::v4i32;
+ if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
+ SExtVT = MVT::v4i64;
UseLASX = true;
- break;
- };
- if (UseLASX && !Subtarget.has32S() && !Subtarget.hasExtLASX())
- return SDValue();
- Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
- : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
- Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
- } else {
- Src = Src.getOperand(0);
- }
+ PropagateSExt = true;
+ }
+ break;
+ case MVT::v8i1:
+ SExtVT = MVT::v8i16;
+ if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
+ SExtVT = MVT::v8i32;
+ UseLASX = true;
+ PropagateSExt = true;
+ }
+ break;
+ case MVT::v16i1:
+ SExtVT = MVT::v16i8;
+ if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
+ SExtVT = MVT::v16i16;
+ UseLASX = true;
+ PropagateSExt = true;
+ }
+ break;
+ case MVT::v32i1:
+ SExtVT = MVT::v32i8;
+ UseLASX = true;
+ break;
+ };
+ if (UseLASX && !(Subtarget.has32S() && Subtarget.hasExtLASX()))
+ return SDValue();
+ Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
+ : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
+ Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src);
EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
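
A scalar model, for intuition only, of the mask the new combine produces for the v16i8 "x < 0" case ("x < 0 => vmskltz.b x"): each result bit is the sign bit of one byte lane, which matches bitcasting the vXi1 setcc result to an i16. The function is illustrative and not part of the patch.

#include <cstdint>

// Model of VMSKLTZ.B: bit I of the result is set when byte lane I is negative.
static uint16_t vmskltzByteModel(const int8_t Lanes[16]) {
  uint16_t Mask = 0;
  for (int I = 0; I < 16; ++I)
    Mask |= uint16_t(Lanes[I] < 0) << I;
  return Mask;
}
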
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index a0107e4..5096a8f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1651,18 +1651,20 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
(XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
(XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;
-def : Pat<(vector_insert v8f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm),
- (XVINSGR2VR_W $vd, $rj, uimm3:$imm)>;
-def : Pat<(vector_insert v4f64:$vd, (f64 (bitconvert i64:$rj)), uimm2:$imm),
- (XVINSGR2VR_D $vd, $rj, uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$xd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm),
+ (XVINSGR2VR_W $xd, $rj, uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$xd, (f64 (bitconvert i64:$rj)), uimm2:$imm),
+ (XVINSGR2VR_D $xd, $rj, uimm2:$imm)>;
def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2),
(XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>;
def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2),
(XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>;
+
+// XVINSVE0_{W/D}
def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm),
- (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
+ (XVINSVE0_W $xd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm),
- (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
+ (XVINSVE0_D $xd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>;
// scalar_to_vector
def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 962e7c2..3c9defb 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1842,10 +1842,19 @@ def : Pat<(vector_insert v4f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm2:$
(VINSGR2VR_W $vd, $rj, uimm2:$imm)>;
def : Pat<(vector_insert v2f64:$vd, (f64 (bitconvert i64:$rj)), uimm1:$imm),
(VINSGR2VR_D $vd, $rj, uimm1:$imm)>;
-def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm),
- (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>;
-def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm),
- (VINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm1:$imm)>;
+
+// VEXTRINS_{W/D}
+foreach imm = 0...3 in {
+ defvar Imm = !shl(imm, 4);
+ def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, imm),
+ (VEXTRINS_W $vd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), Imm)>;
+}
+
+foreach imm = 0...1 in {
+ defvar Imm = !shl(imm, 4);
+ def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, imm),
+ (VEXTRINS_D $vd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), Imm)>;
+}
// scalar_to_vector
def : Pat<(v4f32 (scalar_to_vector FPR32:$fj)),
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index 7b9f115..8fa72bc 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -177,74 +177,6 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
}
}
-// Linker relaxation may change code size. We have to insert Nops
-// for .align directive when linker relaxation enabled. So then Linker
-// could satisfy alignment by removing Nops.
-// The function returns the total Nops Size we need to insert.
-bool LoongArchAsmBackend::shouldInsertExtraNopBytesForCodeAlign(
- const MCAlignFragment &AF, unsigned &Size) {
- // Calculate Nops Size only when linker relaxation enabled.
- if (!AF.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax))
- return false;
-
- // Ignore alignment if MaxBytesToEmit is less than the minimum Nop size.
- const unsigned MinNopLen = 4;
- if (AF.getMaxBytesToEmit() < MinNopLen)
- return false;
- Size = AF.getAlignment().value() - MinNopLen;
- return AF.getAlignment() > MinNopLen;
-}
-
-// We need to insert R_LARCH_ALIGN relocation type to indicate the
-// position of Nops and the total bytes of the Nops have been inserted
-// when linker relaxation enabled.
-// The function inserts fixup_loongarch_align fixup which eventually will
-// transfer to R_LARCH_ALIGN relocation type.
-// The improved R_LARCH_ALIGN requires symbol index. The lowest 8 bits of
-// addend represent alignment and the other bits of addend represent the
-// maximum number of bytes to emit. The maximum number of bytes is zero
-// means ignore the emit limit.
-bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm,
- MCAlignFragment &AF) {
- // Insert the fixup only when linker relaxation enabled.
- if (!AF.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax))
- return false;
-
- // Calculate total Nops we need to insert. If there are none to insert
- // then simply return.
- unsigned InsertedNopBytes;
- if (!shouldInsertExtraNopBytesForCodeAlign(AF, InsertedNopBytes))
- return false;
-
- MCSection *Sec = AF.getParent();
- MCContext &Ctx = getContext();
- const MCExpr *Dummy = MCConstantExpr::create(0, Ctx);
- MCFixup Fixup = MCFixup::create(0, Dummy, ELF::R_LARCH_ALIGN);
- unsigned MaxBytesToEmit = AF.getMaxBytesToEmit();
-
- auto createExtendedValue = [&]() {
- const MCSymbolRefExpr *MCSym = getSecToAlignSym()[Sec];
- if (MCSym == nullptr) {
- // Define a marker symbol at the section with an offset of 0.
- MCSymbol *Sym = Ctx.createNamedTempSymbol("la-relax-align");
- Sym->setFragment(&*Sec->getBeginSymbol()->getFragment());
- Asm.registerSymbol(*Sym);
- MCSym = MCSymbolRefExpr::create(Sym, Ctx);
- getSecToAlignSym()[Sec] = MCSym;
- }
- return MCValue::get(&MCSym->getSymbol(), nullptr,
- MaxBytesToEmit << 8 | Log2(AF.getAlignment()));
- };
-
- uint64_t FixedValue = 0;
- MCValue Value = MaxBytesToEmit >= InsertedNopBytes
- ? MCValue::get(InsertedNopBytes)
- : createExtendedValue();
- Asm.getWriter().recordRelocation(AF, Fixup, Value, FixedValue);
-
- return true;
-}
-
bool LoongArchAsmBackend::shouldForceRelocation(const MCFixup &Fixup,
const MCValue &Target) {
switch (Fixup.getKind()) {
@@ -279,6 +211,53 @@ getRelocPairForSize(unsigned Size) {
}
}
+// Check if an R_LARCH_ALIGN relocation is needed for an alignment directive.
+// If conditions are met, compute the padding size and create a fixup encoding
+// the padding size in the addend. If MaxBytesToEmit is smaller than the padding
+// size, the fixup encodes MaxBytesToEmit in the higher bits and references a
+// per-section marker symbol.
+bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
+ // Use default handling unless linker relaxation is enabled and the
+ // MaxBytesToEmit >= the nop size.
+ if (!F.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax))
+ return false;
+ const unsigned MinNopLen = 4;
+ unsigned MaxBytesToEmit = F.getAlignMaxBytesToEmit();
+ if (MaxBytesToEmit < MinNopLen)
+ return false;
+
+ Size = F.getAlignment().value() - MinNopLen;
+ if (F.getAlignment() <= MinNopLen)
+ return false;
+
+ MCContext &Ctx = getContext();
+ const MCExpr *Expr = nullptr;
+ if (MaxBytesToEmit >= Size) {
+ Expr = MCConstantExpr::create(Size, getContext());
+ } else {
+ MCSection *Sec = F.getParent();
+ const MCSymbolRefExpr *SymRef = getSecToAlignSym()[Sec];
+ if (SymRef == nullptr) {
+ // Define a marker symbol at the section with an offset of 0.
+ MCSymbol *Sym = Ctx.createNamedTempSymbol("la-relax-align");
+ Sym->setFragment(&*Sec->getBeginSymbol()->getFragment());
+ Asm->registerSymbol(*Sym);
+ SymRef = MCSymbolRefExpr::create(Sym, Ctx);
+ getSecToAlignSym()[Sec] = SymRef;
+ }
+ Expr = MCBinaryExpr::createAdd(
+ SymRef,
+ MCConstantExpr::create((MaxBytesToEmit << 8) | Log2(F.getAlignment()),
+ Ctx),
+ Ctx);
+ }
+ MCFixup Fixup =
+ MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN);
+ F.setVarFixups({Fixup});
+ F.getParent()->setLinkerRelaxable();
+ return true;
+}
+
std::pair<bool, bool> LoongArchAsmBackend::relaxLEB128(MCFragment &F,
int64_t &Value) const {
const MCExpr &Expr = F.getLEBValue();
@@ -434,7 +413,7 @@ bool LoongArchAsmBackend::isPCRelFixupResolved(const MCSymbol *SymA,
// Otherwise, check if the offset between the symbol and fragment is fully
// resolved, unaffected by linker-relaxable fragments (e.g. instructions or
- // offset-affected MCAlignFragment). Complements the generic
+ // offset-affected FT_Align fragments). Complements the generic
// isSymbolRefDifferenceFullyResolvedImpl.
if (!PCRelTemp)
PCRelTemp = getContext().createTempSymbol();
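
For illustration only (not part of the patch): a minimal standalone C++ sketch of the addend layout used by the extended R_LARCH_ALIGN form above, where the low 8 bits hold log2 of the alignment and the upper bits hold the byte-emit limit. The encodeAlignAddend helper and the concrete numbers are assumptions for the example, not LLVM API.

#include <cassert>
#include <cstdint>

// Hypothetical helper mirroring the extended R_LARCH_ALIGN addend layout:
// low 8 bits = log2(alignment), remaining bits = max bytes to emit
// (0 meaning "no emit limit").
constexpr uint64_t encodeAlignAddend(unsigned Log2Align, unsigned MaxBytesToEmit) {
  return (static_cast<uint64_t>(MaxBytesToEmit) << 8) | Log2Align;
}

int main() {
  const unsigned NopSize = 4;             // a LoongArch nop is 4 bytes
  const unsigned Alignment = 32;          // e.g. .p2align 5, , 12
  const unsigned MaxBytesToEmit = 12;
  unsigned Padding = Alignment - NopSize; // up to 28 bytes of nops emitted

  if (MaxBytesToEmit >= Padding) {
    // Simple form: the fixup value is just the padding size (not taken here).
  } else {
    // Extended form: this addend sits next to a per-section marker symbol.
    assert(encodeAlignAddend(/*Log2Align=*/5, MaxBytesToEmit) == 0xC05);
  }
  return 0;
}
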
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
index b32ba06..3d929fc 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
@@ -45,20 +45,13 @@ public:
MutableArrayRef<char> Data, uint64_t Value,
bool IsResolved) override;
- // Return Size with extra Nop Bytes for alignment directive in code section.
- bool shouldInsertExtraNopBytesForCodeAlign(const MCAlignFragment &AF,
- unsigned &Size) override;
-
- // Insert target specific fixup type for alignment directive in code section.
- bool shouldInsertFixupForCodeAlign(MCAssembler &Asm,
- MCAlignFragment &AF) override;
-
bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target);
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
+ bool relaxAlign(MCFragment &F, unsigned &Size) override;
bool relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const override;
bool relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const override;
std::pair<bool, bool> relaxLEB128(MCFragment &F,
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
index 03ce004..7cefb3f 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
@@ -52,6 +52,9 @@ static ABI getTripleABI(const Triple &TT) {
bool Is64Bit = TT.isArch64Bit();
ABI TripleABI;
switch (TT.getEnvironment()) {
+ case llvm::Triple::EnvironmentType::UnknownEnvironment:
+ TripleABI = ABI_Unknown;
+ break;
case llvm::Triple::EnvironmentType::GNUSF:
case llvm::Triple::EnvironmentType::MuslSF:
TripleABI = Is64Bit ? ABI_LP64S : ABI_ILP32S;
@@ -96,7 +99,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
// 1. If the '-target-abi' is valid, use it.
if (IsABIValidForFeature(ArgProvidedABI)) {
- if (TT.hasEnvironment() && ArgProvidedABI != TripleABI)
+ if (IsABIValidForFeature(TripleABI) && ArgProvidedABI != TripleABI)
errs()
<< "warning: triple-implied ABI conflicts with provided target-abi '"
<< ABIName << "', using target-abi\n";
@@ -164,10 +167,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
return Is64Bit ? ABI_LP64F : ABI_ILP32F;
return Is64Bit ? ABI_LP64S : ABI_ILP32S;
};
- if (ABIName.empty())
- errs() << "warning: the triple-implied ABI is invalid, ignoring and using "
- "feature-implied ABI\n";
- else
+ if (!ABIName.empty())
errs() << "warning: both target-abi and the triple-implied ABI are "
"invalid, ignoring and using feature-implied ABI\n";
return checkABIStandardized(GetFeatureABI());
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index ad8f5f0..7abe9c9 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -385,11 +385,12 @@ void MipsELFObjectWriter::sortRelocs(std::vector<ELFRelocationEntry> &Relocs) {
if (hasRelocationAddend())
return;
- // Sort relocations by the address they are applied to.
- llvm::sort(Relocs,
- [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
- return A.Offset < B.Offset;
- });
+ // Sort relocations by r_offset. There might be more than one relocation at
+ // a given offset when composed relocations or .reloc directives are used.
+ llvm::stable_sort(
+ Relocs, [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
+ return A.Offset < B.Offset;
+ });
// Place relocations in a list for reorder convenience. Hi16 contains the
// iterators of high-part relocations.
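
A small self-contained C++ sketch of why a stable sort matters here: entries that share an r_offset (composed relocations, .reloc) must keep their emission order. The Reloc struct is a deliberately minimal stand-in for ELFRelocationEntry, not the real type.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Simplified stand-in for ELFRelocationEntry: only the fields needed here.
struct Reloc {
  uint64_t Offset;
  unsigned Type; // relocation type, used to tell entries apart
};

int main() {
  // Two composed relocations at offset 8 must keep their original order
  // (e.g. R_MIPS_GPREL32 followed by R_MIPS_64).
  std::vector<Reloc> Relocs = {{16, 1}, {8, 2}, {8, 3}};
  std::stable_sort(Relocs.begin(), Relocs.end(),
                   [](const Reloc &A, const Reloc &B) { return A.Offset < B.Offset; });
  assert(Relocs[0].Type == 2 && Relocs[1].Type == 3 && Relocs[2].Type == 1);
  return 0;
}
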
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index b89d689..feb4eb3 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1033,45 +1033,40 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
}
void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) {
- MCFragment *DF = getStreamer().getOrCreateDataFragment();
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- Mips::fixup_Mips_GPREL32));
- DF->appendContents(4, 0);
+ auto &S = getStreamer();
+ S.addFixup(Value, Mips::fixup_Mips_GPREL32);
+ S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
- MCFragment *DF = getStreamer().getOrCreateDataFragment();
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- Mips::fixup_Mips_GPREL32));
- DF->appendContents(8, 0);
+ auto &S = getStreamer();
+ // fixup_Mips_GPREL32 designates R_MIPS_GPREL32+R_MIPS_64 on MIPS64.
+ S.addFixup(Value, Mips::fixup_Mips_GPREL32);
+ S.appendContents(8, 0);
}
void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) {
- MCFragment *DF = getStreamer().getOrCreateDataFragment();
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- Mips::fixup_Mips_DTPREL32));
- DF->appendContents(4, 0);
+ auto &S = getStreamer();
+ S.addFixup(Value, Mips::fixup_Mips_DTPREL32);
+ S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) {
- MCFragment *DF = getStreamer().getOrCreateDataFragment();
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- Mips::fixup_Mips_DTPREL64));
- DF->appendContents(8, 0);
+ auto &S = getStreamer();
+ S.addFixup(Value, Mips::fixup_Mips_DTPREL64);
+ S.appendContents(8, 0);
}
void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) {
- MCFragment *DF = getStreamer().getOrCreateDataFragment();
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- Mips::fixup_Mips_TPREL32));
- DF->appendContents(4, 0);
+ auto &S = getStreamer();
+ S.addFixup(Value, Mips::fixup_Mips_TPREL32);
+ S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) {
- MCFragment *DF = getStreamer().getOrCreateDataFragment();
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- Mips::fixup_Mips_TPREL64));
- DF->appendContents(8, 0);
+ auto &S = getStreamer();
+ S.addFixup(Value, Mips::fixup_Mips_TPREL64);
+ S.appendContents(8, 0);
}
void MipsTargetELFStreamer::emitDirectiveSetMicroMips() {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7aa06f9..7883acc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -731,6 +731,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
// PTX does not support load / store predicate registers
setOperationAction(ISD::LOAD, MVT::i1, Custom);
@@ -4004,7 +4006,10 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
- case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
+ case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::v2i32;
Info.ptrVal = I.getArgOperand(0);
@@ -4027,6 +4032,30 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOStore;
+ Info.align = Align(4);
+ return true;
+ }
+
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::v4i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOStore;
+ Info.align = Align(16);
+ return true;
+ }
+
case Intrinsic::nvvm_atomic_add_gen_f_cta:
case Intrinsic::nvvm_atomic_add_gen_f_sys:
case Intrinsic::nvvm_atomic_add_gen_i_cta:
@@ -5060,12 +5089,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return !U.getUser()->use_empty();
}
- // Handle CopyToReg nodes that will become dead after our replacement
- if (U.getUser()->getOpcode() == ISD::CopyToReg) {
- DeadCopyToRegs.push_back(U.getUser());
- return true;
- }
-
// Otherwise, this use prevents us from splitting a value.
return false;
}))
@@ -5132,10 +5155,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
Results.push_back(NewLoad.getValue(NewNumOutputs + I));
- // Remove dead CopyToReg nodes by folding them into the chain they reference
- for (SDNode *CTR : DeadCopyToRegs)
- DCI.CombineTo(CTR, CTR->getOperand(0));
-
return DCI.DAG.getMergeValues(Results, DL);
}
@@ -6544,4 +6563,4 @@ void NVPTXTargetLowering::computeKnownBitsForTargetNode(
default:
break;
}
-} \ No newline at end of file
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index a5bb83d..b5df4c6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -131,6 +131,7 @@ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
+def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">;
def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">;
class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 70150bd..0a00220 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -600,12 +600,23 @@ defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 1>;
// TMA Async Bulk Tensor Copy Functions
//-------------------------------------
-class TMA_DIMS_UTIL<int dim> {
+class TMA_DIMS_UTIL<int dim, string mode = ""> {
// For example, when 'dim' is 3, this generates:
// an ins_dag: B32:$d0, B32:$d1, B32:$d2
// with base_str: $d0, $d1, $d2
dag ins_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i));
string base_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
+
+ // The tile_gather4/tile_scatter4 modes actually operate on a 2D tensor,
+ // though they take 5 coordinates.
+ //
+ // The scatter/gather happens over 4 rows with a fixed column index:
+ // the first coordinate is the column index and the remaining four are
+ // row indices.
+ int num_dims = !cond(
+ !eq(mode, "tile_scatter4") : 2,
+ !eq(mode, "tile_gather4") : 2,
+ true : dim); // for all other modes
}
class TMA_IM2COL_UTIL<int dim, string mode> {
@@ -692,14 +703,138 @@ foreach dim = [1, 2, 3, 4, 5] in {
}
}
+multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> {
+ defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
+ defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
+ defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
+
+ defvar im2col_dag = TMA_IM2COL_UTIL<dim, mode>.ins_dag;
+ defvar im2col_str = TMA_IM2COL_UTIL<dim, mode>.base_str;
+ defvar asm_str = !if(!empty(im2col_str),
+ asm_str_base,
+ asm_str_base # ", {{" # im2col_str # "}}");
+
+ defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims;
+ defvar inst_name = "cp.async.bulk.tensor"
+ # "." # dim_val # "d"
+ # "." # "shared::cluster.global"
+ # "." # !subst("_", "::", mode)
+ # "." # "mbarrier::complete_tx::bytes";
+ defvar intr = !cast<Intrinsic>(
+ "int_nvvm_cp_async_bulk_tensor_g2s_" # mode # "_" # dim_val # "d");
+
+ defvar ins_dag = !con(
+ (ins ADDR:$dst, ADDR:$mbar, B64:$tmap),
+ dims_dag, im2col_dag,
+ (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg));
+
+ defvar intr_dag_base = !con(
+ (intr addr:$dst, addr:$mbar, B64:$tmap),
+ !setdagop(dims_dag, intr),
+ !setdagop(im2col_dag, intr),
+ (intr B16:$mc, B64:$ch));
+ defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, timm:$cg));
+ defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, timm:$cg));
+ defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, timm:$cg));
+ defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg));
+
+ def "" : NVPTXInst<(outs), ins_dag,
+ inst_name # asm_str # ";",
+ [intr_dag_no_hints]>,
+ Requires<pred>;
+ def _MC : NVPTXInst<(outs), ins_dag,
+ inst_name # ".multicast::cluster" # asm_str # ", $mc;",
+ [intr_dag_with_mc]>,
+ Requires<pred>;
+ def _CH : NVPTXInst<(outs), ins_dag,
+ inst_name # ".L2::cache_hint" # asm_str # ", $ch;",
+ [intr_dag_with_ch]>,
+ Requires<pred>;
+ def _MC_CH : NVPTXInst<(outs), ins_dag,
+ inst_name # ".multicast::cluster.L2::cache_hint" # asm_str # ", $mc, $ch;",
+ [intr_dag_with_mc_ch]>,
+ Requires<pred>;
+}
+foreach dim = 3...5 in {
+ foreach mode = ["im2col_w", "im2col_w_128"] in {
+ defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D"
+ : TMA_TENSOR_G2S_INTR<dim, mode, [hasTMACTAGroupSupport]>;
+ }
+}
+defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4",
+ [hasTMACTAGroupSupport]>;
+
+multiclass TMA_TENSOR_G2S_CTA_INTR<int dim, string mode, list<Predicate> pred = []> {
+ defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
+ defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
+ defvar asm_str_base = " [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
+
+ defvar im2col_dag = TMA_IM2COL_UTIL<dim, mode>.ins_dag;
+ defvar im2col_str = TMA_IM2COL_UTIL<dim, mode>.base_str;
+ defvar asm_str = !if(!empty(im2col_str),
+ asm_str_base,
+ asm_str_base # ", {{" # im2col_str # "}}");
+
+ defvar ins_dag = !con(
+ (ins ADDR:$dst, ADDR:$mbar, B64:$tmap),
+ dims_dag, im2col_dag,
+ (ins B64:$ch));
+
+ defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims;
+ defvar intr = !cast<Intrinsic>(
+ "int_nvvm_cp_async_bulk_tensor_g2s_cta_" # mode # "_" # dim_val # "d");
+ defvar intr_dag = !con(
+ (intr addr:$dst, addr:$mbar, B64:$tmap),
+ !setdagop(dims_dag, intr),
+ !setdagop(im2col_dag, intr),
+ (intr B64:$ch, 0));
+ defvar intr_dag_with_ch = !con(
+ (intr addr:$dst, addr:$mbar, B64:$tmap),
+ !setdagop(dims_dag, intr),
+ !setdagop(im2col_dag, intr),
+ (intr B64:$ch, -1));
+ defvar inst_name = "cp.async.bulk.tensor"
+ # "." # dim_val # "d"
+ # "." # "shared::cta.global"
+ # "." # !subst("_", "::", mode)
+ # "." # "mbarrier::complete_tx::bytes";
+
+ def "" : NVPTXInst<(outs), ins_dag,
+ inst_name # asm_str # ";",
+ [intr_dag]>,
+ Requires<pred>;
+ def _CH : NVPTXInst<(outs), ins_dag,
+ inst_name # ".L2::cache_hint" # asm_str # ", $ch;",
+ [intr_dag_with_ch]>,
+ Requires<pred>;
+}
+foreach dim = 1...5 in {
+ defm TMA_G2S_CTA_TILE_ # dim # "D"
+ : TMA_TENSOR_G2S_CTA_INTR<dim, "tile", [hasPTX<86>, hasSM<90>]>;
+}
+foreach dim = 3...5 in {
+ defm TMA_G2S_CTA_IM2COL_ # dim # "D"
+ : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col", [hasPTX<86>, hasSM<90>]>;
+
+ defm TMA_G2S_CTA_IM2COL_W_ # dim # "D"
+ : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w", [hasPTX<86>, hasSM<100>]>;
+
+ defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D"
+ : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", [hasTMACTAGroupSupport]>;
+}
+defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4",
+ [hasPTX<86>, hasSM<100>]>;
+
multiclass TMA_TENSOR_S2G_INTR<int dim, string mode,
list<Predicate> pred = [hasPTX<80>, hasSM<90>]> {
defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]";
+ defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims;
defvar intr = !cast<Intrinsic>(
- "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim # d);
+ "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim_val # "d");
+
defvar intr_dag = !con((intr addr:$src, B64:$tmap),
!setdagop(dims_dag, intr),
(intr B64:$ch, 0));
@@ -707,11 +842,13 @@ multiclass TMA_TENSOR_S2G_INTR<int dim, string mode,
!setdagop(dims_dag, intr),
(intr B64:$ch, -1));
- // For im2col mode, the actual asm_str is "im2col_no_offs"
- defvar mode_asm_str = !if(!eq(mode, "im2col"),
- "im2col_no_offs", mode);
+ // Fix up the asm_str for the im2col and tile_scatter4 modes.
+ defvar mode_asm_str = !cond(
+ !eq(mode, "im2col") : "im2col_no_offs",
+ !eq(mode, "tile_scatter4") : "tile::scatter4",
+ true : mode);
defvar prefix = "cp.async.bulk.tensor"
- # "." # dim # "d"
+ # "." # dim_val # "d"
# ".global.shared::cta"
# "." # mode_asm_str
# ".bulk_group";
@@ -729,10 +866,12 @@ multiclass TMA_TENSOR_S2G_INTR<int dim, string mode,
}
foreach dim = 1...5 in {
foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
- defvar suffix = !toupper(mode) # "_" # dim # D;
+ defvar suffix = !toupper(mode) # "_" # dim # "D";
defm TMA_TENSOR_S2G_ # suffix : TMA_TENSOR_S2G_INTR<dim, mode>;
}
}
+defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4",
+ [hasTMACTAGroupSupport]>;
def TMAReductionFlags : Operand<i32> {
let PrintMethod = "printTmaReductionMode";
@@ -786,13 +925,14 @@ multiclass TMA_TENSOR_PREFETCH_INTR<int dim, string mode,
asm_str_base,
asm_str_base # ", {{" # im2col_str # "}}");
+ defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims;
defvar inst_name = "cp.async.bulk.prefetch.tensor"
- # "." # dim # "d"
+ # "." # dim_val # "d"
# "." # "L2.global"
- # "." # mode;
+ # "." # !subst("_", "::", mode);
defvar intr = !cast<Intrinsic>(
- "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # d);
+ "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim_val # "d");
defvar ins_dag = !con((ins B64:$tmap),
dims_dag,
@@ -818,10 +958,19 @@ multiclass TMA_TENSOR_PREFETCH_INTR<int dim, string mode,
}
foreach dim = 1...5 in {
foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
- defvar suffix = !toupper(mode) # "_" # dim # D;
+ defvar suffix = !toupper(mode) # "_" # dim # "D";
defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode>;
}
}
+foreach dim = 3...5 in {
+ foreach mode = ["im2col_w", "im2col_w_128"] in {
+ defvar suffix = !toupper(mode) # "_" # dim # "D";
+ defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode,
+ [hasTMACTAGroupSupport]>;
+ }
+}
+defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4",
+ [hasTMACTAGroupSupport]>;
//Prefetch and Prefetchu
@@ -4609,7 +4758,14 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
!and(!eq(op, "ldmatrix"),
!eq(ptx_elt_type, "b8x16.b4x16_p64"),
- !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>]);
+ !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
+
+ !and(!eq(op, "stmatrix"), !eq(ptx_elt_type, "b16"),
+ !eq(geom, "m8n8")) : [hasSM<90>, hasPTX<78>],
+
+ !and(!eq(op, "stmatrix"),
+ !eq(ptx_elt_type, "b8"),
+ !eq(geom, "m16n8")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>]);
// template DAGs for instruction inputs/output.
dag Outs = !dag(outs, ptx_regs, reg_names);
@@ -4890,6 +5046,42 @@ defset list<WMMA_INSTR> LDMATRIXs = {
} // transposed
} // defset
+//
+// stmatrix.sync.aligned.m8n8[|.trans][|.shared].b16
+//
+class STMATRIX<WMMA_REGINFO Frag, bit Transposed, string Space>
+ : WMMA_INSTR<STMATRIX_NAME<Frag, Transposed>.record, [!con((ins ADDR:$dst), Frag.Ins)]>,
+ Requires<Frag.Predicates> {
+ // Build PatFrag that only matches particular address space.
+ dag PFOperands = !con((ops node:$dst),
+ !dag(ops, !listsplat(node, !size(Frag.regs)), Frag.reg_names));
+ PatFrag IntrFrag = PatFrag<PFOperands,
+ !foreach(tmp, PFOperands, !subst(ops, Intr, tmp)),
+ !cond(!eq(Space, ".shared"): AS_match.shared,
+ true: AS_match.generic)>;
+ // Build AS-constrained pattern.
+ let IntrinsicPattern = BuildPatternPF<IntrFrag, Args>.ret;
+ let OutOperandList = (outs);
+ let InOperandList = !con(Args, (ins MmaCode:$ptx));
+ let AsmString = "stmatrix.sync.aligned."
+ # Frag.geom
+ # "." # Frag.frag
+ # !if(Transposed, ".trans", "")
+ # Space
+ # "." # Frag.ptx_elt_type
+ # " [$dst], " # Frag.regstring # ";";
+}
+
+// Create all stmatrix variants
+defset list<WMMA_INSTR> STMATRIXs = {
+ foreach transposed = [false, true] in {foreach space = [".shared", ""] in {
+ foreach frag = NVVM_MMA_OPS.all_stmatrix_ops in
+ if NVVM_STMATRIX_SUPPORTED<frag, transposed>.ret then
+ def : STMATRIX<WMMA_REGINFO<frag, "stmatrix">, transposed, space>;
+ } // space
+ } // transposed
+} // defset
+
// Constructing non-flat DAGs is still a pain. I can't !subst a dag node with a
// dag, so the ptx.version must be appended *after* foreach replaces 'ins' with
// the instruction record.
@@ -4900,7 +5092,7 @@ class MMA_PAT<WMMA_INSTR wi>
Requires<wi.Predicates>;
// Build intrinsic->instruction patterns for all MMA instructions.
-foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs, LDMATRIXs) in
+foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs, LDMATRIXs, STMATRIXs) in
def : MMA_PAT<mma>;
multiclass MAPA<string suffix, Intrinsic Intr> {
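
The name mangling driven by TMA_DIMS_UTIL and !subst above can be mirrored in plain C++ for clarity. A rough sketch follows; numDims and g2sInstName are illustrative helpers, not anything in the tree.

#include <cassert>
#include <string>

// tile_gather4/tile_scatter4 address a 2D tensor even though they take five
// coordinates, so the "<N>d" part of the instruction name collapses to 2d.
static int numDims(int Dim, const std::string &Mode) {
  return (Mode == "tile_gather4" || Mode == "tile_scatter4") ? 2 : Dim;
}

static std::string g2sInstName(int Dim, const std::string &Mode) {
  int Dims = numDims(Dim, Mode);
  std::string PtxMode = Mode; // apply !subst("_", "::", mode)
  for (std::string::size_type Pos = PtxMode.find('_');
       Pos != std::string::npos; Pos = PtxMode.find('_'))
    PtxMode.replace(Pos, 1, "::");
  return "cp.async.bulk.tensor." + std::to_string(Dims) +
         "d.shared::cluster.global." + PtxMode +
         ".mbarrier::complete_tx::bytes";
}

int main() {
  assert(g2sInstName(5, "tile_gather4") ==
         "cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4"
         ".mbarrier::complete_tx::bytes");
  assert(g2sInstName(3, "im2col_w") ==
         "cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w"
         ".mbarrier::complete_tx::bytes");
  return 0;
}
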
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 1ac91fa..80fac18 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -53,34 +53,30 @@ let Predicates = [IsISAFuture] in {
let Predicates = [HasVSX, IsISAFuture] in {
let mayLoad = 1 in {
- def LXVRL : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
- "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
-
- def LXVRLL : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
- "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
-
- def LXVPRL : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp),
- (ins memr:$RA, g8rc:$RB),
- "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
-
- def LXVPRLL : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp),
- (ins memr:$RA, g8rc:$RB),
- "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
+ def LXVRL
+ : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
+ "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
+ def LXVRLL
+ : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
+ "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
+ def LXVPRL
+ : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB),
+ "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
+ def LXVPRLL
+ : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB),
+ "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
}
let mayStore = 1 in {
- def STXVRL : XX1Form_memOp<31, 653, (outs),
- (ins vsrc:$XT, memr:$RA, g8rc:$RB),
- "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
-
- def STXVRLL : XX1Form_memOp<31, 685, (outs),
- (ins vsrc:$XT, memr:$RA, g8rc:$RB),
- "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
-
+ def STXVRL
+ : XX1Form_memOp<31, 653, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB),
+ "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
+ def STXVRLL
+ : XX1Form_memOp<31, 685, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB),
+ "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
def STXVPRL : XForm_XTp5_XAB5<31, 717, (outs),
(ins vsrprc:$XTp, memr:$RA, g8rc:$RB),
"stxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
-
def STXVPRLL : XForm_XTp5_XAB5<31, 749, (outs),
(ins vsrprc:$XTp, memr:$RA, g8rc:$RB),
"stxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 75a0272..996b6ef 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -171,7 +171,7 @@ void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
}
void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// The GenericScheduler that we use defaults to scheduling bottom up only.
// We want to schedule from both the top and the bottom and so we set
// OnlyBottomUp to false.
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index 9a97d1a..3c59a47 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -240,7 +240,8 @@ public:
void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
+
bool useAA() const override;
bool enableSubRegLiveness() const override;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index f76f8b3..2c37c3b 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -302,6 +302,28 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst,
Inst = std::move(Res);
}
+// Check if an R_RISCV_ALIGN relocation is needed for an alignment directive.
+// If conditions are met, compute the padding size and create a fixup encoding
+// the padding size in the addend.
+bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
+ // Use default handling unless linker relaxation is enabled and the alignment
+ // is larger than the nop size.
+ const MCSubtargetInfo *STI = F.getSubtargetInfo();
+ if (!STI->hasFeature(RISCV::FeatureRelax))
+ return false;
+ unsigned MinNopLen = STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4;
+ if (F.getAlignment() <= MinNopLen)
+ return false;
+
+ Size = F.getAlignment().value() - MinNopLen;
+ auto *Expr = MCConstantExpr::create(Size, getContext());
+ MCFixup Fixup =
+ MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_RISCV_ALIGN);
+ F.setVarFixups({Fixup});
+ F.getParent()->setLinkerRelaxable();
+ return true;
+}
+
bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F,
bool &WasRelaxed) const {
MCContext &C = getContext();
@@ -637,7 +659,7 @@ bool RISCVAsmBackend::isPCRelFixupResolved(const MCSymbol *SymA,
// Otherwise, check if the offset between the symbol and fragment is fully
// resolved, unaffected by linker-relaxable fragments (e.g. instructions or
- // offset-affected MCAlignFragment). Complements the generic
+ // offset-affected FT_Align fragments). Complements the generic
// isSymbolRefDifferenceFullyResolvedImpl.
if (!PCRelTemp)
PCRelTemp = getContext().createTempSymbol();
@@ -887,55 +909,6 @@ void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
}
}
-// Linker relaxation may change code size. We have to insert Nops
-// for .align directive when linker relaxation enabled. So then Linker
-// could satisfy alignment by removing Nops.
-// The function return the total Nops Size we need to insert.
-bool RISCVAsmBackend::shouldInsertExtraNopBytesForCodeAlign(
- const MCAlignFragment &AF, unsigned &Size) {
- // Calculate Nops Size only when linker relaxation enabled.
- const MCSubtargetInfo *STI = AF.getSubtargetInfo();
- if (!STI->hasFeature(RISCV::FeatureRelax))
- return false;
-
- unsigned MinNopLen = STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4;
-
- if (AF.getAlignment() <= MinNopLen) {
- return false;
- } else {
- Size = AF.getAlignment().value() - MinNopLen;
- return true;
- }
-}
-
-// We need to insert R_RISCV_ALIGN relocation type to indicate the
-// position of Nops and the total bytes of the Nops have been inserted
-// when linker relaxation enabled.
-// The function insert fixup_riscv_align fixup which eventually will
-// transfer to R_RISCV_ALIGN relocation type.
-bool RISCVAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm,
- MCAlignFragment &AF) {
- // Insert the fixup only when linker relaxation enabled.
- const MCSubtargetInfo *STI = AF.getSubtargetInfo();
- if (!STI->hasFeature(RISCV::FeatureRelax))
- return false;
-
- // Calculate total Nops we need to insert. If there are none to insert
- // then simply return.
- unsigned Count;
- if (!shouldInsertExtraNopBytesForCodeAlign(AF, Count) || (Count == 0))
- return false;
-
- MCContext &Ctx = getContext();
- const MCExpr *Dummy = MCConstantExpr::create(0, Ctx);
- MCFixup Fixup = MCFixup::create(0, Dummy, ELF::R_RISCV_ALIGN);
-
- uint64_t FixedValue = 0;
- MCValue NopBytes = MCValue::get(Count);
- Asm.getWriter().recordRelocation(AF, Fixup, NopBytes, FixedValue);
- return true;
-}
-
std::unique_ptr<MCObjectTargetWriter>
RISCVAsmBackend::createObjectTargetWriter() const {
return createRISCVELFObjectWriter(OSABI, Is64Bit);
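
A hedged sketch of the padding-size rule that relaxAlign implements above, factored into a free function so it can be checked in isolation; alignPadding is not LLVM API, only an illustration of the rule.

#include <cassert>
#include <optional>

// Returns the R_RISCV_ALIGN addend (worst-case nop bytes the linker may
// delete), or nullopt when the default, non-relaxed handling applies.
static std::optional<unsigned> alignPadding(unsigned Alignment, bool HasRelax,
                                            bool HasZca) {
  if (!HasRelax)
    return std::nullopt;
  unsigned MinNopLen = HasZca ? 2 : 4; // smallest nop: c.nop vs. nop
  if (Alignment <= MinNopLen)
    return std::nullopt;
  return Alignment - MinNopLen;
}

int main() {
  assert(alignPadding(16, /*HasRelax=*/true, /*HasZca=*/false) == 12u);
  assert(alignPadding(16, true, /*HasZca=*/true) == 14u);
  assert(!alignPadding(4, true, false));  // one nop slot already suffices
  assert(!alignPadding(16, /*HasRelax=*/false, false));
  return 0;
}
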
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index 8c10fbe..d97d632 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -38,14 +38,6 @@ public:
const MCTargetOptions &Options);
~RISCVAsmBackend() override = default;
- // Return Size with extra Nop Bytes for alignment directive in code section.
- bool shouldInsertExtraNopBytesForCodeAlign(const MCAlignFragment &AF,
- unsigned &Size) override;
-
- // Insert target specific fixup type for alignment directive in code section.
- bool shouldInsertFixupForCodeAlign(MCAssembler &Asm,
- MCAlignFragment &AF) override;
-
std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &,
uint64_t &) override;
bool addReloc(const MCFragment &, const MCFixup &, const MCValue &,
@@ -73,6 +65,7 @@ public:
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override;
+ bool relaxAlign(MCFragment &F, unsigned &Size) override;
bool relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const override;
bool relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const override;
std::pair<bool, bool> relaxLEB128(MCFragment &LF,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
index aeda5ac..5abb546 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
@@ -52,15 +52,6 @@ namespace RISCV {
#include "RISCVGenSearchableTables.inc"
} // namespace RISCV
-// Report an error but don't ask the user to report a bug.
-// TODO: Remove these wrappers.
-[[noreturn]] static void reportError(const char *Reason) {
- reportFatalUsageError(Reason);
-}
-[[noreturn]] static void reportError(Error Err) {
- reportFatalUsageError(std::move(Err));
-}
-
namespace RISCVABI {
ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
StringRef ABIName) {
@@ -97,7 +88,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
if ((TargetABI == RISCVABI::ABI::ABI_ILP32E ||
(TargetABI == ABI_Unknown && IsRVE && !IsRV64)) &&
FeatureBits[RISCV::FeatureStdExtD])
- reportError("ILP32E cannot be used with the D ISA extension");
+ reportFatalUsageError("ILP32E cannot be used with the D ISA extension");
if (TargetABI != ABI_Unknown)
return TargetABI;
@@ -105,7 +96,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
// If no explicit ABI is given, try to compute the default ABI.
auto ISAInfo = RISCVFeatures::parseFeatureBits(IsRV64, FeatureBits);
if (!ISAInfo)
- reportError(ISAInfo.takeError());
+ reportFatalUsageError(ISAInfo.takeError());
return getTargetABI((*ISAInfo)->computeDefaultABI());
}
@@ -137,12 +128,12 @@ namespace RISCVFeatures {
void validate(const Triple &TT, const FeatureBitset &FeatureBits) {
if (TT.isArch64Bit() && !FeatureBits[RISCV::Feature64Bit])
- reportError("RV64 target requires an RV64 CPU");
+ reportFatalUsageError("RV64 target requires an RV64 CPU");
if (!TT.isArch64Bit() && !FeatureBits[RISCV::Feature32Bit])
- reportError("RV32 target requires an RV32 CPU");
+ reportFatalUsageError("RV32 target requires an RV32 CPU");
if (FeatureBits[RISCV::Feature32Bit] &&
FeatureBits[RISCV::Feature64Bit])
- reportError("RV32 and RV64 can't be combined");
+ reportFatalUsageError("RV32 and RV64 can't be combined");
}
llvm::Expected<std::unique_ptr<RISCVISAInfo>>
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td
index cbf039e..4c303a9 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.td
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td
@@ -56,19 +56,21 @@ def CSR_XLEN_F32_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
(sequence "F%u_D", 0, 31))>;
+defvar VREGS = (add (sequence "V%u", 0, 31),
+ (sequence "V%uM2", 0, 31, 2),
+ (sequence "V%uM4", 0, 31, 4),
+ (sequence "V%uM8", 0, 31, 8));
+
// Same as CSR_Interrupt, but including all vector registers.
-def CSR_XLEN_V_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
- (sequence "V%u", 0, 31))>;
+def CSR_XLEN_V_Interrupt: CalleeSavedRegs<(add CSR_Interrupt, VREGS)>;
// Same as CSR_Interrupt, but including all 32-bit FP registers and all vector
// registers.
-def CSR_XLEN_F32_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F32_Interrupt,
- (sequence "V%u", 0, 31))>;
+def CSR_XLEN_F32_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F32_Interrupt, VREGS)>;
// Same as CSR_Interrupt, but including all 64-bit FP registers and all vector
// registers.
-def CSR_XLEN_F64_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F64_Interrupt,
- (sequence "V%u", 0, 31))>;
+def CSR_XLEN_F64_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F64_Interrupt, VREGS)>;
// Same as CSR_Interrupt, but excluding X16-X31.
def CSR_Interrupt_RVE : CalleeSavedRegs<(sub CSR_Interrupt,
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 23b4554..b1ab76a 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -1544,10 +1544,53 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
return Offset;
}
+static MCRegister getRVVBaseRegister(const RISCVRegisterInfo &TRI,
+ const Register &Reg) {
+ MCRegister BaseReg = TRI.getSubReg(Reg, RISCV::sub_vrm1_0);
+ // If it's not a grouped vector register, it doesn't have subregister, so
+ // the base register is just itself.
+ if (BaseReg == RISCV::NoRegister)
+ BaseReg = Reg;
+ return BaseReg;
+}
+
void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+ // In TargetFrameLowering::determineCalleeSaves, a vector register is marked
+ // as saved if any of its subregisters is clobbered; this is not correct for
+ // vector registers. We only want a vector register to be marked as saved
+ // if all of its subregisters are clobbered.
+ // For example:
+ // Original behavior: If v24 is marked, v24m2, v24m4, v24m8 are also marked.
+ // Correct behavior: v24m2 is marked only if v24 and v25 are marked.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+ const RISCVRegisterInfo &TRI = *STI.getRegisterInfo();
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned CSReg = CSRegs[i];
+ // Only vector registers need special care.
+ if (!RISCV::VRRegClass.contains(getRVVBaseRegister(TRI, CSReg)))
+ continue;
+
+ SavedRegs.reset(CSReg);
+
+ auto SubRegs = TRI.subregs(CSReg);
+ // Set the register and all its subregisters.
+ if (!MRI.def_empty(CSReg) || MRI.getUsedPhysRegsMask().test(CSReg)) {
+ SavedRegs.set(CSReg);
+ llvm::for_each(SubRegs, [&](unsigned Reg) { return SavedRegs.set(Reg); });
+ }
+
+ // Combine to super register if all of its subregisters are marked.
+ if (!SubRegs.empty() && llvm::all_of(SubRegs, [&](unsigned Reg) {
+ return SavedRegs.test(Reg);
+ }))
+ SavedRegs.set(CSReg);
+ }
+
// Unconditionally spill RA and FP only if the function uses a frame
// pointer.
if (hasFP(MF)) {
@@ -2137,16 +2180,6 @@ static unsigned getCalleeSavedRVVNumRegs(const Register &BaseReg) {
: 8;
}
-static MCRegister getRVVBaseRegister(const RISCVRegisterInfo &TRI,
- const Register &Reg) {
- MCRegister BaseReg = TRI.getSubReg(Reg, RISCV::sub_vrm1_0);
- // If it's not a grouped vector register, it doesn't have subregister, so
- // the base register is just itself.
- if (BaseReg == RISCV::NoRegister)
- BaseReg = Reg;
- return BaseReg;
-}
-
void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, bool HasFP) const {
MachineFunction *MF = MBB.getParent();
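
A toy model of the marking rule above, with register names as strings purely for illustration (the real code works on MCPhysReg numbers and a BitVector): a grouped register such as v24m2 counts as saved only when every component register is clobbered.

#include <cassert>
#include <set>
#include <string>
#include <vector>

// A grouped register is treated as callee-saved only when all of its
// single-register parts are clobbered.
static bool groupIsSaved(const std::vector<std::string> &SubRegs,
                         const std::set<std::string> &Clobbered) {
  for (const std::string &Sub : SubRegs)
    if (!Clobbered.count(Sub))
      return false;
  return !SubRegs.empty();
}

int main() {
  std::set<std::string> Clobbered = {"v24"};        // only v24 is written
  assert(!groupIsSaved({"v24", "v25"}, Clobbered)); // v24m2 is not saved
  Clobbered.insert("v25");
  assert(groupIsSaved({"v24", "v25"}, Clobbered));  // now v24m2 is saved
  return 0;
}
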
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index cfec46d2..a541c2f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3106,6 +3106,25 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
return true;
}
+bool RISCVDAGToDAGISel::SelectAddrRegZextRegScale(SDValue Addr,
+ unsigned MaxShiftAmount,
+ unsigned Bits, SDValue &Base,
+ SDValue &Index,
+ SDValue &Scale) {
+ if (!SelectAddrRegRegScale(Addr, MaxShiftAmount, Base, Index, Scale))
+ return false;
+
+ if (Index.getOpcode() == ISD::AND) {
+ auto *C = dyn_cast<ConstantSDNode>(Index.getOperand(1));
+ if (C && C->getZExtValue() == maskTrailingOnes<uint64_t>(Bits)) {
+ Index = Index.getOperand(0);
+ return true;
+ }
+ }
+
+ return false;
+}
+
bool RISCVDAGToDAGISel::SelectAddrRegReg(SDValue Addr, SDValue &Base,
SDValue &Offset) {
if (Addr.getOpcode() != ISD::ADD)
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index 72e2f96..ee3a86e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -59,19 +59,14 @@ public:
return SelectAddrRegRegScale(Addr, MaxShift, Base, Index, Scale);
}
+ bool SelectAddrRegZextRegScale(SDValue Addr, unsigned MaxShiftAmount,
+ unsigned Bits, SDValue &Base, SDValue &Index,
+ SDValue &Scale);
+
template <unsigned MaxShift, unsigned Bits>
bool SelectAddrRegZextRegScale(SDValue Addr, SDValue &Base, SDValue &Index,
SDValue &Scale) {
- if (SelectAddrRegRegScale(Addr, MaxShift, Base, Index, Scale)) {
- if (Index.getOpcode() == ISD::AND) {
- auto *C = dyn_cast<ConstantSDNode>(Index.getOperand(1));
- if (C && C->getZExtValue() == maskTrailingOnes<uint64_t>(Bits)) {
- Index = Index.getOperand(0);
- return true;
- }
- }
- }
- return false;
+ return SelectAddrRegZextRegScale(Addr, MaxShift, Bits, Base, Index, Scale);
}
bool SelectAddrRegReg(SDValue Addr, SDValue &Base, SDValue &Offset);
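
The zero-extension test that the hoisted SelectAddrRegZextRegScale relies on can be shown in isolation. The maskTrailingOnes below only mimics llvm::maskTrailingOnes for 0 < Bits <= 64 and is a sketch, not the library routine.

#include <cassert>
#include <cstdint>

// An AND with a mask of `Bits` trailing ones is equivalent to a zero-extend
// from `Bits` bits, so the AND node can be dropped and its operand used as
// the index directly.
static uint64_t maskTrailingOnes(unsigned Bits) {
  return ~0ULL >> (64 - Bits);
}

int main() {
  assert(maskTrailingOnes(32) == 0xFFFFFFFFULL);
  assert(maskTrailingOnes(8) == 0xFFULL);
  // So (and %idx, 0xFFFFFFFF) with Bits == 32 passes the check and %idx is
  // used as the zero-extended index operand.
  return 0;
}
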
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4845a9c..d859db3a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2319,6 +2319,10 @@ bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
if (getLegalZfaFPImm(Imm, VT) >= 0)
return true;
+ // Some constants can be produced by fli+fneg.
+ if (Imm.isNegative() && getLegalZfaFPImm(-Imm, VT) >= 0)
+ return true;
+
// Cannot create a 64 bit floating-point immediate value for rv32.
if (Subtarget.getXLen() < VT.getScalarSizeInBits()) {
// td can handle +0.0 or -0.0 already.
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index e23001a..d9c6101 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -174,6 +174,7 @@ class EltDeps<bit vl, bit mask> {
def EltDepsNone : EltDeps<vl=0, mask=0>;
def EltDepsVL : EltDeps<vl=1, mask=0>;
+def EltDepsMask : EltDeps<vl=0, mask=1>;
def EltDepsVLMask : EltDeps<vl=1, mask=1>;
class EEW <bits<2> val> {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index aef410f..17067220 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -44,45 +44,48 @@ def simm10_unsigned : RISCVOp {
//===----------------------------------------------------------------------===//
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnaryImm10<bits<7> funct7, string opcodestr,
- DAGOperand TyImm10 = simm10>
- : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins TyImm10:$imm10),
- opcodestr, "$rd, $imm10"> {
+class RVPLoadImm10<bits<7> funct7, string opcodestr,
+ DAGOperand TyImm10 = simm10>
+ : RVInst<(outs GPR:$rd), (ins TyImm10:$imm10), opcodestr, "$rd, $imm10", [],
+ InstFormatOther> {
bits<10> imm10;
+ bits<5> rd;
let Inst{31-25} = funct7;
let Inst{24-16} = imm10{8-0};
let Inst{15} = imm10{9};
+ let Inst{14-12} = 0b010;
+ let Inst{11-7} = rd;
+ let Inst{6-0} = OPC_OP_IMM_32.Value;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnaryImm8<bits<8> funct8, string opcodestr>
- : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins uimm8:$uimm8),
- opcodestr, "$rd, $uimm8"> {
+class RVPLoadImm8<bits<8> funct8, string opcodestr>
+ : RVInst<(outs GPR:$rd), (ins uimm8:$uimm8), opcodestr, "$rd, $uimm8", [],
+ InstFormatOther> {
bits<8> uimm8;
+ bits<5> rd;
let Inst{31-24} = funct8;
let Inst{23-16} = uimm8;
let Inst{15} = 0b0;
+ let Inst{14-12} = 0b010;
+ let Inst{11-7} = rd;
+ let Inst{6-0} = OPC_OP_IMM_32.Value;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVPUnary<bits<3> f, string opcodestr, dag operands, string argstr>
: RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), operands, opcodestr, argstr> {
- bits<5> imm;
- bits<5> rs1;
-
let Inst{31} = 0b1;
let Inst{30-28} = f;
let Inst{27} = 0b0;
- let Inst{19-15} = rs1;
}
class RVPUnaryImm5<bits<3> f, string opcodestr>
: RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm5:$uimm5), "$rd, $rs1, $uimm5"> {
bits<5> uimm5;
- let imm = uimm5;
let Inst{26-25} = 0b01;
let Inst{24-20} = uimm5;
}
@@ -145,11 +148,11 @@ def PSSLAI_W : RVPUnaryImm5<0b101, "psslai.w">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
-def PLI_H : RVPUnaryImm10<0b1011000, "pli.h">;
+def PLI_H : RVPLoadImm10<0b1011000, "pli.h">;
let Predicates = [HasStdExtP, IsRV64] in
-def PLI_W : RVPUnaryImm10<0b1011001, "pli.w">;
+def PLI_W : RVPLoadImm10<0b1011001, "pli.w">;
let Predicates = [HasStdExtP] in
-def PLI_B : RVPUnaryImm8<0b10110100, "pli.b">;
+def PLI_B : RVPLoadImm8<0b10110100, "pli.b">;
let Predicates = [HasStdExtP] in {
def PSEXT_H_B : RVPUnaryWUF<0b00, 0b00100, "psext.h.b">;
@@ -162,6 +165,6 @@ def PSEXT_W_H : RVPUnaryWUF<0b01, 0b00101, "psext.w.h">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
-def PLUI_H : RVPUnaryImm10<0b1111000, "plui.h", simm10_unsigned>;
+def PLUI_H : RVPLoadImm10<0b1111000, "plui.h", simm10_unsigned>;
let Predicates = [HasStdExtP, IsRV64] in
-def PLUI_W : RVPUnaryImm10<0b1111001, "plui.w", simm10_unsigned>;
+def PLUI_W : RVPLoadImm10<0b1111001, "plui.w", simm10_unsigned>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 5d13a87..33c7138 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -1642,7 +1642,7 @@ def VFIRST_M : RVInstV<0b010000, 0b10001, OPMVV, (outs GPR:$vd),
def : MnemonicAlias<"vpopc.m", "vcpop.m">;
-let Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsVLMask in {
+let Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsMask in {
let DestEEW = EEW1 in {
// vmsbf.m set-before-first mask bit
@@ -1655,7 +1655,7 @@ defm VMSOF_M : VMSFS_MV_V<"vmsof.m", 0b010100, 0b00010>;
// Vector Iota Instruction
defm VIOTA_M : VIOTA_MV_V<"viota.m", 0b010100, 0b10000>;
-} // Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsVLMask
+} // Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsMask
// Vector Element Index Instruction
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index c7cb6e2..f391300 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1377,9 +1377,9 @@ let Predicates = [HasVendorXqciac, IsRV32] in {
def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12:$imm12))),
(QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12:$imm12)>;
def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, uimm5gt3:$imm), GPRNoX0:$rs2)),
- (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>;
+ (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, uimm5gt3:$imm, GPRNoX0:$rs2)),
- (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>;
+ (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
} // Predicates = [HasVendorXqciac, IsRV32]
/// Simple arithmetic operations
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index dd68a55..878401e 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -131,25 +131,56 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
: Constant::getAllOnesValue(XLenTy);
return true;
}
- auto *VPLdSt = cast<VPIntrinsic>(I);
- assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load ||
- VPLdSt->getIntrinsicID() == Intrinsic::vp_store) &&
- "Unexpected intrinsic");
- Ptr = VPLdSt->getMemoryPointerParam();
- Alignment = VPLdSt->getPointerAlignment().value_or(
- DL.getABITypeAlign(VTy->getElementType()));
- assert(Mask && "vp.load and vp.store needs a mask!");
+ auto *II = cast<IntrinsicInst>(I);
+ switch (II->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unsupported intrinsic type");
+ case Intrinsic::vp_load:
+ case Intrinsic::vp_store: {
+ auto *VPLdSt = cast<VPIntrinsic>(I);
+ Ptr = VPLdSt->getMemoryPointerParam();
+ Alignment = VPLdSt->getPointerAlignment().value_or(
+ DL.getABITypeAlign(VTy->getElementType()));
+
+ assert(Mask && "vp.load and vp.store needs a mask!");
+
+ Value *WideEVL = VPLdSt->getVectorLengthParam();
+ // Conservatively check if EVL is a multiple of factor, otherwise some
+ // (trailing) elements might be lost after the transformation.
+ if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
+ return false;
- Value *WideEVL = VPLdSt->getVectorLengthParam();
- // Conservatively check if EVL is a multiple of factor, otherwise some
- // (trailing) elements might be lost after the transformation.
- if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
- return false;
+ auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+ VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+ return true;
+ }
+ case Intrinsic::masked_load: {
+ Ptr = II->getOperand(0);
+ Alignment = cast<ConstantInt>(II->getArgOperand(1))->getAlignValue();
- auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
- VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
- return true;
+ if (!isa<UndefValue>(II->getOperand(3)))
+ return false;
+
+ assert(Mask && "masked.load needs a mask!");
+
+ VL = isa<FixedVectorType>(VTy)
+ ? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
+ : Constant::getAllOnesValue(XLenTy);
+ return true;
+ }
+ case Intrinsic::masked_store: {
+ Ptr = II->getOperand(1);
+ Alignment = cast<ConstantInt>(II->getArgOperand(2))->getAlignValue();
+
+ assert(Mask && "masked.store needs a mask!");
+
+ VL = isa<FixedVectorType>(VTy)
+ ? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
+ : Constant::getAllOnesValue(XLenTy);
+ return true;
+ }
+ }
}
/// Lower an interleaved load into a vlsegN intrinsic.
@@ -201,6 +232,7 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
{VTy, BasePtr->getType(), Stride->getType()},
{BasePtr, Stride, Mask, VL});
+ Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes);
CI->addParamAttr(0,
Attribute::getWithAlignment(CI->getContext(), Alignment));
Shuffles[0]->replaceAllUsesWith(CI);
@@ -272,8 +304,9 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
Intrinsic::experimental_vp_strided_store,
{Data->getType(), BasePtr->getType(), Stride->getType()},
{Data, BasePtr, Stride, Mask, VL});
+ Align Alignment = commonAlignment(SI->getAlign(), Index * ScalarSizeInBytes);
CI->addParamAttr(
- 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign()));
+ 1, Attribute::getWithAlignment(CI->getContext(), Alignment));
return true;
}
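
Two of the checks added above lend themselves to a tiny worked example: the vp.load/vp.store EVL must divide evenly by the interleave factor, and the per-field strided access gets the common alignment of the base alignment and the field's byte offset. The commonAlignment below imitates llvm::commonAlignment and is only a sketch.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Largest power of two dividing both the base alignment and the offset.
static uint64_t commonAlignment(uint64_t Align, uint64_t Offset) {
  return Offset ? std::min(Align, Offset & (0 - Offset)) : Align;
}

int main() {
  // 1) vp.load/vp.store: the wide EVL must be an exact multiple of the
  //    interleave factor; each segment then uses EVL / Factor lanes.
  unsigned Factor = 4, WideEVL = 32;
  assert(WideEVL % Factor == 0);
  unsigned VL = WideEVL / Factor; // 8 lanes per field
  assert(VL == 8);

  // 2) A strided access to field `Index` starts Index * ScalarSize bytes past
  //    the base pointer, so its alignment is the common alignment of the two.
  uint64_t BaseAlign = 16, ScalarSize = 4, Index = 1;
  assert(commonAlignment(BaseAlign, Index * ScalarSize) == 4); // 16 vs +4 -> 4
  assert(commonAlignment(BaseAlign, 0 * ScalarSize) == 16);    // field 0 keeps 16
  return 0;
}
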
diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
index 28d6403..3b19c34 100644
--- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
+++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
@@ -48,6 +48,8 @@ using namespace llvm;
STATISTIC(NumRemovedSExtW, "Number of removed sign-extensions");
STATISTIC(NumTransformedToWInstrs,
"Number of instructions transformed to W-ops");
+STATISTIC(NumTransformedToNonWInstrs,
+ "Number of instructions transformed to non-W-ops");
static cl::opt<bool> DisableSExtWRemoval("riscv-disable-sextw-removal",
cl::desc("Disable removal of sext.w"),
@@ -67,10 +69,9 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
bool removeSExtWInstrs(MachineFunction &MF, const RISCVInstrInfo &TII,
const RISCVSubtarget &ST, MachineRegisterInfo &MRI);
- bool stripWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII,
- const RISCVSubtarget &ST, MachineRegisterInfo &MRI);
- bool appendWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII,
- const RISCVSubtarget &ST, MachineRegisterInfo &MRI);
+ bool canonicalizeWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII,
+ const RISCVSubtarget &ST,
+ MachineRegisterInfo &MRI);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -721,45 +722,39 @@ bool RISCVOptWInstrs::removeSExtWInstrs(MachineFunction &MF,
return MadeChange;
}
-bool RISCVOptWInstrs::stripWSuffixes(MachineFunction &MF,
- const RISCVInstrInfo &TII,
- const RISCVSubtarget &ST,
- MachineRegisterInfo &MRI) {
+// Strips or adds W suffixes to eligible instructions depending on the
+// subtarget preferences.
+bool RISCVOptWInstrs::canonicalizeWSuffixes(MachineFunction &MF,
+ const RISCVInstrInfo &TII,
+ const RISCVSubtarget &ST,
+ MachineRegisterInfo &MRI) {
+ bool ShouldStripW = !(DisableStripWSuffix || ST.preferWInst());
+ bool ShouldPreferW = ST.preferWInst();
bool MadeChange = false;
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- unsigned Opc;
- switch (MI.getOpcode()) {
- default:
- continue;
- case RISCV::ADDW: Opc = RISCV::ADD; break;
- case RISCV::ADDIW: Opc = RISCV::ADDI; break;
- case RISCV::MULW: Opc = RISCV::MUL; break;
- case RISCV::SLLIW: Opc = RISCV::SLLI; break;
- }
- if (hasAllWUsers(MI, ST, MRI)) {
- MI.setDesc(TII.get(Opc));
- MadeChange = true;
- }
- }
- }
-
- return MadeChange;
-}
-
-bool RISCVOptWInstrs::appendWSuffixes(MachineFunction &MF,
- const RISCVInstrInfo &TII,
- const RISCVSubtarget &ST,
- MachineRegisterInfo &MRI) {
- bool MadeChange = false;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- unsigned WOpc;
- // TODO: Add more?
- switch (MI.getOpcode()) {
+ std::optional<unsigned> WOpc;
+ std::optional<unsigned> NonWOpc;
+ unsigned OrigOpc = MI.getOpcode();
+ switch (OrigOpc) {
default:
continue;
+ case RISCV::ADDW:
+ NonWOpc = RISCV::ADD;
+ break;
+ case RISCV::ADDIW:
+ NonWOpc = RISCV::ADDI;
+ break;
+ case RISCV::MULW:
+ NonWOpc = RISCV::MUL;
+ break;
+ case RISCV::SLLIW:
+ NonWOpc = RISCV::SLLI;
+ break;
+ case RISCV::SUBW:
+ NonWOpc = RISCV::SUB;
+ break;
case RISCV::ADD:
WOpc = RISCV::ADDW;
break;
@@ -773,7 +768,7 @@ bool RISCVOptWInstrs::appendWSuffixes(MachineFunction &MF,
WOpc = RISCV::MULW;
break;
case RISCV::SLLI:
- // SLLIW reads the lowest 5 bits, while SLLI reads lowest 6 bits
+ // SLLIW reads the lowest 5 bits, while SLLI reads lowest 6 bits.
if (MI.getOperand(2).getImm() >= 32)
continue;
WOpc = RISCV::SLLIW;
@@ -784,19 +779,30 @@ bool RISCVOptWInstrs::appendWSuffixes(MachineFunction &MF,
break;
}
- if (hasAllWUsers(MI, ST, MRI)) {
+ if (ShouldStripW && NonWOpc.has_value() && hasAllWUsers(MI, ST, MRI)) {
+ LLVM_DEBUG(dbgs() << "Replacing " << MI);
+ MI.setDesc(TII.get(NonWOpc.value()));
+ LLVM_DEBUG(dbgs() << " with " << MI);
+ ++NumTransformedToNonWInstrs;
+ MadeChange = true;
+ continue;
+ }
+ // LWU is always converted to LW when possible as 1) LW is compressible
+ // and 2) it helps minimise differences vs RV32.
+ if ((ShouldPreferW || OrigOpc == RISCV::LWU) && WOpc.has_value() &&
+ hasAllWUsers(MI, ST, MRI)) {
LLVM_DEBUG(dbgs() << "Replacing " << MI);
- MI.setDesc(TII.get(WOpc));
+ MI.setDesc(TII.get(WOpc.value()));
MI.clearFlag(MachineInstr::MIFlag::NoSWrap);
MI.clearFlag(MachineInstr::MIFlag::NoUWrap);
MI.clearFlag(MachineInstr::MIFlag::IsExact);
LLVM_DEBUG(dbgs() << " with " << MI);
++NumTransformedToWInstrs;
MadeChange = true;
+ continue;
}
}
}
-
return MadeChange;
}
@@ -813,12 +819,6 @@ bool RISCVOptWInstrs::runOnMachineFunction(MachineFunction &MF) {
bool MadeChange = false;
MadeChange |= removeSExtWInstrs(MF, TII, ST, MRI);
-
- if (!(DisableStripWSuffix || ST.preferWInst()))
- MadeChange |= stripWSuffixes(MF, TII, ST, MRI);
-
- if (ST.preferWInst())
- MadeChange |= appendWSuffixes(MF, TII, ST, MRI);
-
+ MadeChange |= canonicalizeWSuffixes(MF, TII, ST, MRI);
return MadeChange;
}
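
A toy decision table for the merged canonicalization pass above, assuming hasAllWUsers has already been computed; opcode names are plain strings here rather than the RISCV::* opcode enums, and only one representative opcode per direction is modeled.

#include <cassert>
#include <optional>
#include <string>

// Returns the replacement opcode name, or nullopt if the instruction is left
// alone.
static std::optional<std::string>
canonicalize(const std::string &Opc, bool AllUsersAreW, bool PreferW,
             bool StripEnabled) {
  if (!AllUsersAreW)
    return std::nullopt;                 // users need the full 64-bit result
  if (Opc == "LWU")
    return std::string("LW");            // always: LW is compressible
  bool IsWForm = Opc == "ADDW";          // toy stand-in for the W opcodes
  if (IsWForm && StripEnabled && !PreferW)
    return std::string("ADD");           // strip the W suffix
  if (!IsWForm && PreferW && Opc == "ADD")
    return std::string("ADDW");          // append the W suffix
  return std::nullopt;
}

int main() {
  assert(canonicalize("ADDW", true, false, true) == std::string("ADD"));
  assert(canonicalize("LWU", true, false, true) == std::string("LW"));
  assert(canonicalize("ADD", true, true, true) == std::string("ADDW"));
  assert(!canonicalize("ADD", false, true, true));
  return 0;
}
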
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index c754de4..e35ffaf 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -216,7 +216,7 @@ unsigned RISCVSubtarget::getMinimumJumpTableEntries() const {
}
void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// Do bidirectional scheduling since it provides a more balanced scheduling
// leading to better performance. This will increase compile time.
Policy.OnlyTopDown = false;
@@ -231,8 +231,8 @@ void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackPressure = true;
}
-void RISCVSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+void RISCVSubtarget::overridePostRASchedPolicy(
+ MachineSchedPolicy &Policy, const SchedRegion &Region) const {
MISched::Direction PostRASchedDirection = getPostRASchedDirection();
if (PostRASchedDirection == MISched::TopDown) {
Policy.OnlyTopDown = true;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 4f560cc..fd57e02 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -395,11 +395,11 @@ public:
}
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
};
-} // End llvm namespace
+} // namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 12bf8c1..d62d99c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -116,8 +116,8 @@ public:
}
TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override {
- return ST->hasVInstructions() ? TailFoldingStyle::Data
- : TailFoldingStyle::DataWithoutLaneMask;
+ return ST->hasVInstructions() ? TailFoldingStyle::DataWithEVL
+ : TailFoldingStyle::None;
}
std::optional<unsigned> getMaxVScale() const override;
std::optional<unsigned> getVScaleForTuning() const override;
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index e656e8b..b53d919 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -33,6 +33,7 @@ namespace {
class RISCVVLOptimizer : public MachineFunctionPass {
const MachineRegisterInfo *MRI;
const MachineDominatorTree *MDT;
+ const TargetInstrInfo *TII;
public:
static char ID;
@@ -1291,7 +1292,8 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
return false;
}
- assert(!RISCVII::elementsDependOnVL(RISCV::getRVVMCOpcode(MI.getOpcode())) &&
+ assert(!RISCVII::elementsDependOnVL(
+ TII->get(RISCV::getRVVMCOpcode(MI.getOpcode())).TSFlags) &&
"Instruction shouldn't be supported if elements depend on VL");
assert(MI.getOperand(0).isReg() &&
@@ -1484,7 +1486,6 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const {
}
bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) {
- assert(DemandedVLs.size() == 0);
if (skipFunction(MF.getFunction()))
return false;
@@ -1495,6 +1496,10 @@ bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (!ST.hasVInstructions())
return false;
+ TII = ST.getInstrInfo();
+
+ assert(DemandedVLs.empty());
+
// For each instruction that defines a vector, compute what VL its
// downstream users demand.
for (MachineBasicBlock *MBB : post_order(&MF)) {
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index 84ef539..c1cc19b 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -434,6 +434,15 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) {
if (!isKnownSameDefs(TrueMask.getReg(), MIMask.getReg()))
return false;
+ // Masked off lanes past TrueVL will come from False, and converting to vmv
+ // will lose these lanes unless MIVL <= TrueVL.
+ // TODO: We could relax this for False == Passthru and True policy == TU
+ const MachineOperand &MIVL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+ const MachineOperand &TrueVL =
+ True->getOperand(RISCVII::getVLOpNum(True->getDesc()));
+ if (!RISCV::isVLKnownLE(MIVL, TrueVL))
+ return false;
+
// True's passthru needs to be equivalent to False
Register TruePassthruReg = True->getOperand(1).getReg();
Register FalseReg = MI.getOperand(2).getReg();
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 6608b3f..d4fa62a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -296,6 +296,8 @@ private:
bool selectImageWriteIntrinsic(MachineInstr &I) const;
bool selectResourceGetPointer(Register &ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
+ bool selectModf(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
// Utilities
std::pair<Register, bool>
@@ -3235,6 +3237,9 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
case Intrinsic::spv_discard: {
return selectDiscard(ResVReg, ResType, I);
}
+ case Intrinsic::modf: {
+ return selectModf(ResVReg, ResType, I);
+ }
default: {
std::string DiagMsg;
raw_string_ostream OS(DiagMsg);
@@ -4018,6 +4023,83 @@ bool SPIRVInstructionSelector::selectLog10(Register ResVReg,
.constrainAllUses(TII, TRI, RBI);
}
+bool SPIRVInstructionSelector::selectModf(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+  // llvm.modf takes a single argument (the number to be decomposed) and
+  // returns a struct { restype, restype }, while OpenCLLIB::modf takes two
+  // arguments (the number to be decomposed and a pointer): it returns the
+  // fractional part and stores the integral part through the pointer
+  // argument. Therefore, we can't use the OpenCLLIB::modf instruction
+  // directly; instead we build some scaffolding around it. The idea is to
+  // create an alloca to get a pointer, pass that pointer to OpenCLLIB::modf,
+  // and then load the value back from the pointer to place it in the struct.
+  // llvm.modf returns the fractional part as the first element of the result
+  // and the integral part as the second element.
+
+ // At this point, the return type is not a struct anymore, but rather two
+ // independent elements of SPIRVResType. We can get each independent element
+ // from I.getDefs() or I.getOperands().
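+  // For example (illustrative), at the source level this scaffolding mirrors
+  // the usual libm idiom:
+  //   float IntPart;
+  //   float Frac = modff(X, &IntPart); // {Frac, IntPart} ~ llvm.modf(X)
+  // where the alloca below plays the role of &IntPart.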
+ if (STI.canUseExtInstSet(SPIRV::InstructionSet::OpenCL_std)) {
+ MachineIRBuilder MIRBuilder(I);
+ // Get pointer type for alloca variable.
+ const SPIRVType *PtrType = GR.getOrCreateSPIRVPointerType(
+ ResType, MIRBuilder, SPIRV::StorageClass::Function);
+ // Create new register for the pointer type of alloca variable.
+ Register PtrTyReg =
+ MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::iIDRegClass);
+ MIRBuilder.getMRI()->setType(
+ PtrTyReg,
+ LLT::pointer(storageClassToAddressSpace(SPIRV::StorageClass::Function),
+ GR.getPointerSize()));
+ // Assign SPIR-V type of the pointer type of the alloca variable to the
+ // new register.
+ GR.assignSPIRVTypeToVReg(PtrType, PtrTyReg, MIRBuilder.getMF());
+ MachineBasicBlock &EntryBB = I.getMF()->front();
+ MachineBasicBlock::iterator VarPos =
+ getFirstValidInstructionInsertPoint(EntryBB);
+ auto AllocaMIB =
+ BuildMI(EntryBB, VarPos, I.getDebugLoc(), TII.get(SPIRV::OpVariable))
+ .addDef(PtrTyReg)
+ .addUse(GR.getSPIRVTypeID(PtrType))
+ .addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function));
+ Register Variable = AllocaMIB->getOperand(0).getReg();
+    // Modf must have 4 operands: the first two are the two parts of the
+    // result, the third is the intrinsic ID, and the last one is the floating
+    // point value.
+ assert(I.getNumOperands() == 4 &&
+ "Expected 4 operands for modf instruction");
+ MachineBasicBlock &BB = *I.getParent();
+ // Create the OpenCLLIB::modf instruction.
+ auto MIB =
+ BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::OpenCL_std))
+ .addImm(CL::modf)
+ .setMIFlags(I.getFlags())
+ .add(I.getOperand(3)) // Floating point value.
+ .addUse(Variable); // Pointer to integral part.
+ // Assign the integral part stored in the ptr to the second element of the
+ // result.
+ Register IntegralPartReg = I.getOperand(1).getReg();
+ if (IntegralPartReg.isValid()) {
+ // Load the value from the pointer to integral part.
+ auto LoadMIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpLoad))
+ .addDef(IntegralPartReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(Variable);
+ return LoadMIB.constrainAllUses(TII, TRI, RBI);
+ }
+
+ return MIB.constrainAllUses(TII, TRI, RBI);
+ } else if (STI.canUseExtInstSet(SPIRV::InstructionSet::GLSL_std_450)) {
+ assert(false && "GLSL::Modf is deprecated.");
+ // FIXME: GL::Modf is deprecated, use Modfstruct instead.
+ return false;
+ }
+ return false;
+}
+
// Generate the instructions to load 3-element vector builtin input
// IDs/Indices.
// Like: GlobalInvocationId, LocalInvocationId, etc....
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index 2bffbf7..595424b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -380,7 +380,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
bool Changed = false;
const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F);
for (BasicBlock &BB : *F) {
- for (Instruction &I : BB) {
+ for (Instruction &I : make_early_inc_range(BB)) {
auto Call = dyn_cast<CallInst>(&I);
if (!Call)
continue;
@@ -408,12 +408,18 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
if (!STI.isShader()) {
Changed |= toSpvOverloadedIntrinsic(
II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1});
+ } else {
+ II->eraseFromParent();
+ Changed = true;
}
break;
case Intrinsic::lifetime_end:
if (!STI.isShader()) {
Changed |= toSpvOverloadedIntrinsic(
II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1});
+ } else {
+ II->eraseFromParent();
+ Changed = true;
}
break;
case Intrinsic::ptr_annotation:
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 768efb9..416d811 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -995,4 +995,27 @@ unsigned getArrayComponentCount(const MachineRegisterInfo *MRI,
return foldImm(ResType->getOperand(2), MRI);
}
+MachineBasicBlock::iterator
+getFirstValidInstructionInsertPoint(MachineBasicBlock &BB) {
+ // Find the position to insert the OpVariable instruction.
+ // We will insert it after the last OpFunctionParameter, if any, or
+ // after OpFunction otherwise.
+ MachineBasicBlock::iterator VarPos = BB.begin();
+ while (VarPos != BB.end() && VarPos->getOpcode() != SPIRV::OpFunction) {
+ ++VarPos;
+ }
+  // Advance VarPos to the next instruction after OpFunction; it will either
+  // be an OpFunctionParameter, so that we can start the next loop, or the
+  // position at which to insert the OpVariable instruction.
+ ++VarPos;
+ while (VarPos != BB.end() &&
+ VarPos->getOpcode() == SPIRV::OpFunctionParameter) {
+ ++VarPos;
+ }
+  // VarPos now points just after the last OpFunctionParameter, if any, or
+  // after OpFunction if there are no parameters.
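+  // If an OpLabel follows, skip past it as well. For example (illustrative):
+  //   OpFunction -> OpFunctionParameter* -> OpLabel -> <insert OpVariable here>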
+ return VarPos != BB.end() && VarPos->getOpcode() == SPIRV::OpLabel ? ++VarPos
+ : VarPos;
+}
+
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index d732188..45c520a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -506,6 +506,8 @@ MachineInstr *getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI);
int64_t foldImm(const MachineOperand &MO, const MachineRegisterInfo *MRI);
unsigned getArrayComponentCount(const MachineRegisterInfo *MRI,
const MachineInstr *ResType);
+MachineBasicBlock::iterator
+getFirstValidInstructionInsertPoint(MachineBasicBlock &BB);
} // namespace llvm
#endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 9b434d8..1aa8efe 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -2201,7 +2201,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SDValue Chain = DAG.getEntryNode();
SDValue InGlue;
- Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InGlue);
InGlue = Chain.getValue(1);
SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
@@ -2219,7 +2219,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
InGlue};
Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops);
InGlue = Chain.getValue(1);
- Chain = DAG.getCALLSEQ_END(Chain, 1, 0, InGlue, DL);
+ Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
InGlue = Chain.getValue(1);
SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InGlue);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index bf2e04c..09b8864 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -46,6 +46,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
: TargetLowering(TM), Subtarget(&STI) {
auto MVTPtr = Subtarget->hasAddr64() ? MVT::i64 : MVT::i32;
+  // Set the maximum number of loads used when expanding memcmp.
+ MaxLoadsPerMemcmp = 8;
+ MaxLoadsPerMemcmpOptSize = 4;
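+  // With these limits, for example, a 16-byte memcmp can be expanded inline
+  // into two i64 loads per buffer plus compares instead of a libcall
+  // (illustrative; the expansion itself is driven by enableMemCmpExpansion).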
+
// Booleans always contain 0 or 1.
setBooleanContents(ZeroOrOneBooleanContent);
// Except in SIMD vectors
@@ -2935,6 +2939,25 @@ performVectorExtendToFPCombine(SDNode *N,
}
static SDValue
+performVectorNonNegToFPCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ auto &DAG = DCI.DAG;
+
+ SDNodeFlags Flags = N->getFlags();
+ SDValue Op0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // Optimize uitofp to sitofp when the sign bit is known to be zero.
+ // Depending on the target (runtime) backend, this might be performance
+ // neutral (e.g. AArch64) or a significant improvement (e.g. x86_64).
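+  // For example (illustrative), a vector produced by a zero-extend from a
+  // narrower integer type has a known-zero sign bit, so its uint_to_fp can be
+  // emitted as sint_to_fp.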
+ if (VT.isVector() && (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0))) {
+ return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
+ }
+
+ return SDValue();
+}
+
+static SDValue
performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
auto &DAG = DCI.DAG;
assert(N->getOpcode() == ISD::SIGN_EXTEND ||
@@ -3515,6 +3538,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ZERO_EXTEND:
return performVectorExtendCombine(N, DCI);
case ISD::UINT_TO_FP:
+ if (auto ExtCombine = performVectorExtendToFPCombine(N, DCI))
+ return ExtCombine;
+ return performVectorNonNegToFPCombine(N, DCI);
case ISD::SINT_TO_FP:
return performVectorExtendToFPCombine(N, DCI);
case ISD::FP_TO_SINT_SAT:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 4f15999..52e7065 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -141,6 +141,21 @@ InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
+WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions
+WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+ TTI::MemCmpExpansionOptions Options;
+
+ Options.AllowOverlappingLoads = true;
+
+ // TODO: Teach WebAssembly backend about load v128.
+
+ Options.LoadSizes.append({8, 4, 2, 1});
+ Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+ Options.NumLoadsPerBlock = Options.MaxNumLoads;
+
+ return Options;
+}
+
InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index d83b8d1..c915eeb0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -73,6 +73,10 @@ public:
getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;
+
+ TTI::MemCmpExpansionOptions
+ enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override;
+
InstructionCost getMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 3d060c6..e213923 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -127,7 +127,6 @@ class X86AsmBackend : public MCAsmBackend {
unsigned PrevInstOpcode = 0;
MCBoundaryAlignFragment *PendingBA = nullptr;
std::pair<MCFragment *, size_t> PrevInstPosition;
- bool IsRightAfterData = false;
uint8_t determinePaddingPrefix(const MCInst &Inst) const;
bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const;
@@ -156,10 +155,13 @@ public:
AlignBranchType = X86AlignBranchKindLoc;
if (X86PadMaxPrefixSize.getNumOccurrences())
TargetPrefixMax = X86PadMaxPrefixSize;
+
+ AllowAutoPadding =
+ AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone;
+ AllowEnhancedRelaxation =
+ AllowAutoPadding && TargetPrefixMax != 0 && X86PadForBranchAlign;
}
- bool allowAutoPadding() const override;
- bool allowEnhancedRelaxation() const override;
void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst,
const MCSubtargetInfo &STI);
void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst);
@@ -365,14 +367,6 @@ static bool hasVariantSymbol(const MCInst &MI) {
return false;
}
-bool X86AsmBackend::allowAutoPadding() const {
- return (AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone);
-}
-
-bool X86AsmBackend::allowEnhancedRelaxation() const {
- return allowAutoPadding() && TargetPrefixMax != 0 && X86PadForBranchAlign;
-}
-
/// X86 has certain instructions which enable interrupts exactly one
/// instruction *after* the instruction which stores to SS. Return true if the
/// given instruction may have such an interrupt delay slot.
@@ -447,7 +441,7 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
// semantics.
return false;
- if (IsRightAfterData)
+ if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition))
// If this instruction follows any data, there is no clear instruction
// boundary; inserting a nop/prefix would change the semantics.
return false;
@@ -484,13 +478,26 @@ bool X86AsmBackend::needAlign(const MCInst &Inst) const {
(AlignBranchType & X86::AlignBranchIndirect));
}
+void X86_MC::emitInstruction(MCObjectStreamer &S, const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ bool AutoPadding = S.getAllowAutoPadding();
+ if (LLVM_LIKELY(!AutoPadding && !X86PadForAlign)) {
+ S.MCObjectStreamer::emitInstruction(Inst, STI);
+ return;
+ }
+
+ auto &Backend = static_cast<X86AsmBackend &>(S.getAssembler().getBackend());
+ Backend.emitInstructionBegin(S, Inst, STI);
+ S.MCObjectStreamer::emitInstruction(Inst, STI);
+ Backend.emitInstructionEnd(S, Inst);
+}
+
/// Insert BoundaryAlignFragment before instructions to align branches.
void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
const MCInst &Inst, const MCSubtargetInfo &STI) {
- // Used by canPadInst. Done here, because in emitInstructionEnd, the current
- // fragment will have changed.
- IsRightAfterData =
- isRightAfterData(OS.getCurrentFragment(), PrevInstPosition);
+ bool CanPadInst = canPadInst(Inst, OS);
+ if (CanPadInst)
+ OS.getCurrentFragment()->setAllowAutoPadding(true);
if (!canPadBranches(OS))
return;
@@ -504,7 +511,7 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
// we call canPadInst (not cheap) twice. However, in the common case, we can
// avoid unnecessary calls to that, as this is otherwise only used for
// relaxable fragments.
- if (!canPadInst(Inst, OS))
+ if (!CanPadInst)
return;
if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) {
@@ -542,11 +549,8 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
/// Set the last fragment to be aligned for the BoundaryAlignFragment.
void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
const MCInst &Inst) {
- MCFragment *CF = OS.getCurrentFragment();
- if (CF->getKind() == MCFragment::FT_Relaxable)
- CF->setAllowAutoPadding(canPadInst(Inst, OS));
-
// Update PrevInstOpcode here, canPadInst() reads that.
+ MCFragment *CF = OS.getCurrentFragment();
PrevInstOpcode = Inst.getOpcode();
PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
@@ -567,11 +571,10 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
// DataFragment, so that we can get the size of instructions later in
// MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty
// DataFragment.
- OS.insert(OS.getContext().allocFragment<MCFragment>());
+ OS.newFragment();
// Update the maximum alignment on the current section if necessary.
- MCSection *Sec = OS.getCurrentSectionOnly();
- Sec->ensureMinAlignment(AlignBoundary);
+ CF->getParent()->ensureMinAlignment(AlignBoundary);
}
std::optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const {
@@ -923,13 +926,11 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
continue;
}
- const uint64_t OrigSize = Asm.computeFragmentSize(F);
-
// To keep the effects local, prefer to relax instructions closest to
// the align directive. This is purely about human understandability
// of the resulting code. If we later find a reason to expand
// particular instructions over others, we can adjust.
- unsigned RemainingSize = OrigSize;
+ unsigned RemainingSize = Asm.computeFragmentSize(F) - F.getFixedSize();
while (!Relaxable.empty() && RemainingSize != 0) {
auto &RF = *Relaxable.pop_back_val();
// Give the backend a chance to play any tricks it wishes to increase
@@ -1542,14 +1543,6 @@ public:
};
} // end anonymous namespace
-void X86_MC::emitInstruction(MCObjectStreamer &S, const MCInst &Inst,
- const MCSubtargetInfo &STI) {
- auto &Backend = static_cast<X86AsmBackend &>(S.getAssembler().getBackend());
- Backend.emitInstructionBegin(S, Inst, STI);
- S.MCObjectStreamer::emitInstruction(Inst, STI);
- Backend.emitInstructionEnd(S, Inst);
-}
-
void X86ELFStreamer::emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) {
X86_MC::emitInstruction(*this, Inst, STI);
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index efb951b..e02b556 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -151,6 +151,7 @@ private:
MCSymbol *LazyPointer) override;
void emitCallInstruction(const llvm::MCInst &MCI);
+ void maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI);
// Emits a label to mark the next instruction as being relevant to Import Call
// Optimization.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6281124..568a8c4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45059,6 +45059,10 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
unsigned NumElts = DemandedElts.getBitWidth();
switch (Op.getOpcode()) {
+ case X86ISD::GlobalBaseReg:
+ case X86ISD::Wrapper:
+ case X86ISD::WrapperRIP:
+ return true;
case X86ISD::BLENDI:
case X86ISD::PSHUFD:
case X86ISD::UNPCKL:
@@ -45098,27 +45102,34 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
switch (Op.getOpcode()) {
+  // SSE vector inserts/extracts use modulo indices.
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW:
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW:
+ return false;
// SSE vector multiplies are either inbounds or saturate.
case X86ISD::VPMADDUBSW:
case X86ISD::VPMADDWD:
+ return false;
// SSE vector shifts handle out of bounds shift amounts.
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI:
return false;
- // SSE blends.
+ // SSE blends.
case X86ISD::BLENDI:
case X86ISD::BLENDV:
return false;
- // SSE target shuffles.
+ // SSE target shuffles.
case X86ISD::PSHUFD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VPERMILPI:
case X86ISD::VPERMV3:
return false;
- // SSE comparisons handle all icmp/fcmp cases.
- // TODO: Add CMPM/MM with test coverage.
+ // SSE comparisons handle all icmp/fcmp cases.
+ // TODO: Add CMPM/MM with test coverage.
case X86ISD::CMPP:
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT:
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 45d596b..481a9be 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
@@ -833,6 +834,7 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
CallInst.setOpcode(CallOpcode);
CallInst.addOperand(CallTargetMCOp);
OutStreamer->emitInstruction(CallInst, getSubtargetInfo());
+ maybeEmitNopAfterCallForWindowsEH(&MI);
}
// Record our statepoint node in the same section used by STACKMAP
@@ -1430,21 +1432,6 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
OutStreamer->emitLabel(FallthroughLabel);
}
-// Returns instruction preceding MBBI in MachineFunction.
-// If MBBI is the first instruction of the first basic block, returns null.
-static MachineBasicBlock::const_iterator
-PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
- const MachineBasicBlock *MBB = MBBI->getParent();
- while (MBBI == MBB->begin()) {
- if (MBB == &MBB->getParent()->front())
- return MachineBasicBlock::const_iterator();
- MBB = MBB->getPrevNode();
- MBBI = MBB->end();
- }
- --MBBI;
- return MBBI;
-}
-
static unsigned getSrcIdx(const MachineInstr* MI, unsigned SrcIdx) {
if (X86II::isKMasked(MI->getDesc().TSFlags)) {
// Skip mask operand.
@@ -2271,6 +2258,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
OutStreamer->AddComment("EVEX TO EVEX Compression ", false);
}
+  // We use this to suppress Windows EH NOP padding after tail jumps.
+ bool IsTailJump = false;
+
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
llvm_unreachable("Should be handled target independently");
@@ -2325,6 +2315,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// Lower this as normal, but add a comment.
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
case X86::TAILJMPr:
@@ -2340,6 +2331,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// Lower these as normal, but add some comments.
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
case X86::TAILJMPm64_REX:
@@ -2349,6 +2341,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
}
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
case X86::TAILJMPr64_REX: {
@@ -2361,6 +2354,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
}
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
}
@@ -2537,26 +2531,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::SEH_BeginEpilogue: {
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- // Windows unwinder will not invoke function's exception handler if IP is
- // either in prologue or in epilogue. This behavior causes a problem when a
- // call immediately precedes an epilogue, because the return address points
- // into the epilogue. To cope with that, we insert a 'nop' if it ends up
- // immediately after a CALL in the final emitted code.
- MachineBasicBlock::const_iterator MBBI(MI);
- // Check if preceded by a call and emit nop if so.
- for (MBBI = PrevCrossBBInst(MBBI);
- MBBI != MachineBasicBlock::const_iterator();
- MBBI = PrevCrossBBInst(MBBI)) {
- // Pseudo instructions that aren't a call are assumed to not emit any
- // code. If they do, we worst case generate unnecessary noops after a
- // call.
- if (MBBI->isCall() || !MBBI->isPseudo()) {
- if (MBBI->isCall())
- EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
- break;
- }
- }
-
EmitSEHInstruction(MI);
return;
}
@@ -2585,6 +2559,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
emitCallInstruction(TmpInst);
emitNop(*OutStreamer, 5, Subtarget);
+ maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
@@ -2605,6 +2580,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// For Import Call Optimization to work, we need a 3-byte nop after the
// call instruction.
emitNop(*OutStreamer, 3, Subtarget);
+ maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
break;
@@ -2638,6 +2614,10 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (MI->isCall()) {
emitCallInstruction(TmpInst);
+    // Since tail calls transfer control without leaving a stack frame, there
+    // is never a need for NOP padding after tail calls.
+ if (!IsTailJump)
+ maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
@@ -2659,6 +2639,164 @@ void X86AsmPrinter::emitCallInstruction(const llvm::MCInst &MCI) {
OutStreamer->emitInstruction(MCI, getSubtargetInfo());
}
+// Determines whether a NOP is required after a CALL, so that Windows EH
+// IP2State tables have the correct information.
+//
+// On most Windows platforms (AMD64, ARM64, ARM32, IA64, but *not* x86-32),
+// exception handling works by looking up instruction pointers in lookup
+// tables. These lookup tables are stored in .xdata sections in executables.
+// One element of the lookup tables is the "IP2State" table (Instruction
+// Pointer to State).
+//
+// If a function has any instructions that require cleanup during exception
+// unwinding, then it will have an IP2State table. Each entry in the IP2State
+// table describes a range of bytes in the function's instruction stream, and
+// associates an "EH state number" with that range of instructions. A value of
+// -1 means "the null state", which does not require any code to execute.
+// A value other than -1 is an index into the State table.
+//
+// The entries in the IP2State table contain byte offsets within the instruction
+// stream of the function. The Windows ABI requires that these offsets are
+// aligned to instruction boundaries; they are not permitted to point to a byte
+// that is not the first byte of an instruction.
+//
+// Unfortunately, CALL instructions present a problem during unwinding. CALL
+// instructions push the address of the instruction after the CALL instruction,
+// so that execution can resume after the CALL. If the CALL is the last
+// instruction within an IP2State region, then the return address (on the stack)
+// points to the *next* IP2State region. This means that the unwinder will
+// use the wrong cleanup funclet during unwinding.
+//
+// To fix this problem, the Windows AMD64 ABI requires that CALL instructions
+// are never placed at the end of an IP2State region. Stated equivalently, the
+// end of a CALL instruction cannot be aligned to an IP2State boundary. If a
+// CALL instruction would occur at the end of an IP2State region, then the
+// compiler must insert a NOP instruction after the CALL. The NOP instruction
+// is placed in the same EH region as the CALL instruction, so that the return
+// address points to the NOP and the unwinder will locate the correct region.
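+//
+// For example (illustrative, hypothetical offsets):
+//
+//   0x10:  call foo        ; pushed return address = 0x15
+//   0x15:  <IP2State region for the next EH state would begin here>
+//
+// Without padding, the return address 0x15 would be attributed to the next
+// region. A NOP is therefore emitted at 0x15 in the same EH region as the
+// CALL, so the return address resolves to the correct region.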
+//
+// NOP padding is only necessary on Windows AMD64 targets. On ARM64 and ARM32,
+// instructions have a fixed size so the unwinder knows how to "back up" by
+// one instruction.
+//
+// Interaction with Import Call Optimization (ICO):
+//
+// Import Call Optimization (ICO) is a compiler + OS feature on Windows which
+// improves the performance and security of DLL imports. ICO relies on using a
+// specific CALL idiom that can be replaced by the OS DLL loader. This removes
+// a load and indirect CALL and replaces it with a single direct CALL.
+//
+// To achieve this, ICO also inserts NOPs after the CALL instruction. If the
+// end of the CALL is aligned with an EH state transition, we *also* insert
+// a single-byte NOP. **Both forms of NOPs must be preserved.** They cannot
+// be combined into a single larger NOP; nor can the second NOP be removed.
+//
+// This is necessary because, if ICO is active and the call site is modified
+// by the loader, the loader will end up overwriting the NOPs that were inserted
+// for ICO. That means that those NOPs cannot be used for the correct
+// termination of the exception handling region (the IP2State transition),
+// so we still need an additional NOP instruction. The NOPs cannot be combined
+// into a longer NOP (which is ordinarily desirable) because then ICO would
+// split one instruction, producing a malformed instruction after the ICO call.
+void X86AsmPrinter::maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI) {
+ // We only need to insert NOPs after CALLs when targeting Windows on AMD64.
+ // (Don't let the name fool you: Itanium refers to table-based exception
+ // handling, not the Itanium architecture.)
+ if (MAI->getExceptionHandlingType() != ExceptionHandling::WinEH ||
+ MAI->getWinEHEncodingType() != WinEH::EncodingType::Itanium) {
+ return;
+ }
+
+ bool HasEHPersonality = MF->getWinEHFuncInfo() != nullptr;
+
+ // Set up MBB iterator, initially positioned on the same MBB as MI.
+ MachineFunction::const_iterator MFI(MI->getParent());
+ MachineFunction::const_iterator MFE(MF->end());
+
+ // Set up instruction iterator, positioned immediately *after* MI.
+ MachineBasicBlock::const_iterator MBBI(MI);
+ MachineBasicBlock::const_iterator MBBE = MI->getParent()->end();
+ ++MBBI; // Step over MI
+
+ // This loop iterates MBBs
+ for (;;) {
+ // This loop iterates instructions
+ for (; MBBI != MBBE; ++MBBI) {
+ // Check the instruction that follows this CALL.
+ const MachineInstr &NextMI = *MBBI;
+
+ // If there is an EH_LABEL after this CALL, then there is an EH state
+ // transition after this CALL. This is exactly the situation which
+ // requires NOP padding.
+ if (NextMI.isEHLabel()) {
+ if (HasEHPersonality) {
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ return;
+ }
+ // We actually want to continue, in case there is an SEH_BeginEpilogue
+ // instruction after the EH_LABEL. In some situations, IR is produced
+ // that contains EH_LABEL pseudo-instructions, even when we are not
+ // generating IP2State tables. We still need to insert a NOP before
+ // SEH_BeginEpilogue in that case.
+ continue;
+ }
+
+ // Somewhat similarly, if the CALL is the last instruction before the
+      // SEH epilogue, then we also need a NOP. This is necessary because the
+ // Windows stack unwinder will not invoke a function's exception handler
+ // if the instruction pointer is in the function prologue or epilogue.
+ //
+ // We always emit a NOP before SEH_BeginEpilogue, even if there is no
+ // personality function (unwind info) for this frame. This is the same
+ // behavior as MSVC.
+ if (NextMI.getOpcode() == X86::SEH_BeginEpilogue) {
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ return;
+ }
+
+ if (!NextMI.isPseudo() && !NextMI.isMetaInstruction()) {
+ // We found a real instruction. During the CALL, the return IP will
+ // point to this instruction. Since this instruction has the same EH
+ // state as the call itself (because there is no intervening EH_LABEL),
+ // the IP2State table will be accurate; there is no need to insert a
+ // NOP.
+ return;
+ }
+
+ // The next instruction is a pseudo-op. Ignore it and keep searching.
+ // Because these instructions do not generate any machine code, they
+ // cannot prevent the IP2State table from pointing at the wrong
+ // instruction during a CALL.
+ }
+
+ // We've reached the end of this MBB. Find the next MBB in program order.
+ // MBB order should be finalized by this point, so falling across MBBs is
+ // expected.
+ ++MFI;
+ if (MFI == MFE) {
+ // No more blocks; we've reached the end of the function. This should
+ // only happen with no-return functions, but double-check to be sure.
+ if (HasEHPersonality) {
+        // If the block containing the CALL has no successors, then the callee
+        // does not return. Insert an INT3 instead of a NOP; this accomplishes
+        // the same purpose but is clearer to read. Also, analysis tools will
+        // understand that they should not continue disassembling after the
+        // CALL (unless there are other branches to that label).
+ if (MI->getParent()->succ_empty())
+ EmitAndCountInstruction(MCInstBuilder(X86::INT3));
+ else
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ }
+ return;
+ }
+
+ // Set up iterator to scan the next basic block.
+ const MachineBasicBlock *NextMBB = &*MFI;
+ MBBI = NextMBB->instr_begin();
+ MBBE = NextMBB->instr_end();
+ }
+}
+
void X86AsmPrinter::emitLabelAndRecordForImportCallOptimization(
ImportCallKind Kind) {
assert(EnableImportCallOptimization);
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 8c156c9..7fa6e6c5 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -842,6 +842,156 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
return true;
}
+/// Represents the ValWidth bits starting at bit ValOffset of Val, stored at
+/// PtrBase+PtrOffset.
+struct PartStore {
+ Value *PtrBase;
+ APInt PtrOffset;
+ Value *Val;
+ uint64_t ValOffset;
+ uint64_t ValWidth;
+ StoreInst *Store;
+
+ bool isCompatibleWith(const PartStore &Other) const {
+ return PtrBase == Other.PtrBase && Val == Other.Val;
+ }
+
+ bool operator<(const PartStore &Other) const {
+ return PtrOffset.slt(Other.PtrOffset);
+ }
+};
+
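+// For example (little endian, illustrative):
+//   p[0] = (uint8_t)x;
+//   p[1] = (uint8_t)(x >> 8);
+// stores two adjacent parts of the same value x and can be merged into a
+// single 16-bit store of (uint16_t)x when such an access is legal and fast.
+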
+static std::optional<PartStore> matchPartStore(Instruction &I,
+ const DataLayout &DL) {
+ auto *Store = dyn_cast<StoreInst>(&I);
+ if (!Store || !Store->isSimple())
+ return std::nullopt;
+
+ Value *StoredVal = Store->getValueOperand();
+ Type *StoredTy = StoredVal->getType();
+ if (!StoredTy->isIntegerTy() || !DL.typeSizeEqualsStoreSize(StoredTy))
+ return std::nullopt;
+
+ uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits();
+ uint64_t ValOffset = 0;
+ Value *Val;
+ if (!match(StoredVal, m_CombineOr(m_Trunc(m_LShr(m_Value(Val),
+ m_ConstantInt(ValOffset))),
+ m_Trunc(m_Value(Val)))))
+ return std::nullopt;
+
+ Value *Ptr = Store->getPointerOperand();
+ APInt PtrOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+ Value *PtrBase = Ptr->stripAndAccumulateConstantOffsets(
+ DL, PtrOffset, /*AllowNonInbounds=*/true);
+ return {{PtrBase, PtrOffset, Val, ValOffset, ValWidth, Store}};
+}
+
+static bool mergeConsecutivePartStores(ArrayRef<PartStore> Parts,
+ unsigned Width, const DataLayout &DL,
+ TargetTransformInfo &TTI) {
+ if (Parts.size() < 2)
+ return false;
+
+ // Check whether combining the stores is profitable.
+ // FIXME: We could generate smaller stores if we can't produce a large one.
+ const PartStore &First = Parts.front();
+ LLVMContext &Ctx = First.Store->getContext();
+ Type *NewTy = Type::getIntNTy(Ctx, Width);
+ unsigned Fast = 0;
+ if (!TTI.isTypeLegal(NewTy) ||
+ !TTI.allowsMisalignedMemoryAccesses(Ctx, Width,
+ First.Store->getPointerAddressSpace(),
+ First.Store->getAlign(), &Fast) ||
+ !Fast)
+ return false;
+
+ // Generate the combined store.
+ IRBuilder<> Builder(First.Store);
+ Value *Val = First.Val;
+ if (First.ValOffset != 0)
+ Val = Builder.CreateLShr(Val, First.ValOffset);
+ Val = Builder.CreateTrunc(Val, NewTy);
+ StoreInst *Store = Builder.CreateAlignedStore(
+ Val, First.Store->getPointerOperand(), First.Store->getAlign());
+
+ AAMDNodes AATags = First.Store->getAAMetadata();
+ for (const PartStore &Part : drop_begin(Parts))
+ AATags = AATags.concat(Part.Store->getAAMetadata());
+ Store->setAAMetadata(AATags);
+
+ // Remove the old stores.
+ for (const PartStore &Part : Parts)
+ Part.Store->eraseFromParent();
+
+ return true;
+}
+
+static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
+ const DataLayout &DL, TargetTransformInfo &TTI) {
+ if (Parts.size() < 2)
+ return false;
+
+ // We now have multiple parts of the same value stored to the same pointer.
+ // Sort the parts by pointer offset, and make sure they are consistent with
+ // the value offsets. Also check that the value is fully covered without
+ // overlaps.
+ bool Changed = false;
+ llvm::sort(Parts);
+ int64_t LastEndOffsetFromFirst = 0;
+ const PartStore *First = &Parts[0];
+ for (const PartStore &Part : Parts) {
+ APInt PtrOffsetFromFirst = Part.PtrOffset - First->PtrOffset;
+ int64_t ValOffsetFromFirst = Part.ValOffset - First->ValOffset;
+ if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
+ LastEndOffsetFromFirst != ValOffsetFromFirst) {
+ Changed |= mergeConsecutivePartStores(ArrayRef(First, &Part),
+ LastEndOffsetFromFirst, DL, TTI);
+ First = &Part;
+ LastEndOffsetFromFirst = Part.ValWidth;
+ continue;
+ }
+
+ LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
+ }
+
+ Changed |= mergeConsecutivePartStores(ArrayRef(First, Parts.end()),
+ LastEndOffsetFromFirst, DL, TTI);
+ return Changed;
+}
+
+static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL,
+ TargetTransformInfo &TTI, AliasAnalysis &AA) {
+ // FIXME: Add big endian support.
+ if (DL.isBigEndian())
+ return false;
+
+ SmallVector<PartStore, 8> Parts;
+ bool MadeChange = false;
+ for (Instruction &I : make_early_inc_range(BB)) {
+ if (std::optional<PartStore> Part = matchPartStore(I, DL)) {
+ if (Parts.empty() || Part->isCompatibleWith(Parts[0])) {
+ Parts.push_back(std::move(*Part));
+ continue;
+ }
+
+ MadeChange |= mergePartStores(Parts, DL, TTI);
+ Parts.clear();
+ Parts.push_back(std::move(*Part));
+ continue;
+ }
+
+ // FIXME: Use AA to make this more precise.
+ if (I.mayReadOrWriteMemory() || I.mayThrow()) {
+ MadeChange |= mergePartStores(Parts, DL, TTI);
+ Parts.clear();
+ continue;
+ }
+ }
+
+ MadeChange |= mergePartStores(Parts, DL, TTI);
+ return MadeChange;
+}
+
/// Combine away instructions providing they are still equivalent when compared
/// against 0. i.e do they have any bits set.
static Value *optimizeShiftInOrChain(Value *V, IRBuilder<> &Builder) {
@@ -1330,6 +1480,9 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
// bugs.
MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
}
+
+    // Do this separately to avoid scanning the same stores multiple times.
+ MadeChange |= foldConsecutiveStores(BB, DL, TTI, AA);
}
// We're done with transforms, so remove dead instructions.
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index e279fec..6561b1c 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -170,6 +170,12 @@ void Lowerer::hidePromiseAlloca(CoroIdInst *CoroId, CoroBeginInst *CoroBegin) {
auto *PI = Builder.CreateIntrinsic(
Builder.getPtrTy(), Intrinsic::coro_promise, Arg, {}, "promise.addr");
PI->setCannotDuplicate();
+ // Remove lifetime markers, as these are only allowed on allocas.
+ for (User *U : make_early_inc_range(PA->users())) {
+ auto *I = cast<Instruction>(U);
+ if (I->isLifetimeStartOrEnd())
+ I->eraseFromParent();
+ }
PA->replaceUsesWithIf(PI, [CoroId](Use &U) {
bool IsBitcast = U == U.getUser()->stripPointerCasts();
bool IsCoroId = U.getUser() == CoroId;
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index a65d0fb..3320508 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -553,7 +553,6 @@ static void cacheDIVar(FrameDataInfo &FrameData,
if (I != Container.end())
DIVarCache.insert({V, (*I)->getVariable()});
};
- CacheIt(findDbgDeclares(V));
CacheIt(findDVRDeclares(V));
}
}
@@ -1219,10 +1218,8 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
auto *G = GetFramePointer(Alloca);
G->setName(Alloca->getName() + Twine(".reload.addr"));
- SmallVector<DbgVariableIntrinsic *, 4> DIs;
SmallVector<DbgVariableRecord *> DbgVariableRecords;
- findDbgUsers(DIs, Alloca, &DbgVariableRecords);
- assert(DIs.empty() && "Should never see debug-intrinsics");
+ findDbgUsers(Alloca, DbgVariableRecords);
for (auto *DVR : DbgVariableRecords)
DVR->replaceVariableLocationOp(Alloca, G);
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
index 5fd5f7d..4e71768 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
@@ -519,10 +519,8 @@ void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
// We would handle the dbg.values for allocas specially
for (auto &Iter : Spills) {
auto *V = Iter.first;
- SmallVector<DbgValueInst *, 16> DVIs;
SmallVector<DbgVariableRecord *, 16> DVRs;
- findDbgValues(DVIs, V, &DVRs);
- assert(DVIs.empty());
+ findDbgValues(V, DVRs);
// Add the instructions which carry debug info that is in the frame.
for (DbgVariableRecord *DVR : DVRs)
if (Checker.isDefinitionAcrossSuspend(*V, DVR->Marker->MarkedInstr))
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 469f435..b803c97 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -3998,6 +3998,24 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
}
+// Update the debug information attached to NewFunc to use the clone Name. Note
+// that this needs to be done both for any existing DISubprogram for the
+// definition and for any separate declaration DISubprogram.
+static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
+ assert(Name == NewFunc->getName());
+ auto *SP = NewFunc->getSubprogram();
+ if (!SP)
+ return;
+ auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
+ SP->replaceLinkageName(MDName);
+ DISubprogram *Decl = SP->getDeclaration();
+ if (!Decl)
+ return;
+ TempDISubprogram NewDecl = Decl->clone();
+ NewDecl->replaceLinkageName(MDName);
+ SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
+}
+
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
ModuleCallsiteContextGraph::cloneFunctionForCallsite(
@@ -4009,9 +4027,7 @@ ModuleCallsiteContextGraph::cloneFunctionForCallsite(
std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
assert(!Func.func()->getParent()->getFunction(Name));
NewFunc->setName(Name);
- if (auto *SP = NewFunc->getSubprogram())
- SP->replaceLinkageName(
- MDString::get(NewFunc->getParent()->getContext(), Name));
+ updateSubprogramLinkageName(NewFunc, Name);
for (auto &Inst : CallsWithMetadataInFunc) {
// This map always has the initial version in it.
assert(Inst.cloneNo() == 0);
@@ -4950,9 +4966,7 @@ static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
PrevF->eraseFromParent();
} else
NewF->setName(Name);
- if (auto *SP = NewF->getSubprogram())
- SP->replaceLinkageName(
- MDString::get(NewF->getParent()->getContext(), Name));
+ updateSubprogramLinkageName(NewF, Name);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
<< "created clone " << ore::NV("NewFunction", NewF));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 3321435..d88bc2c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -4352,6 +4352,13 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy));
}
+ // Drop unnecessary callee_type metadata from calls that were converted
+ // into direct calls.
+ if (Call.getMetadata(LLVMContext::MD_callee_type) && !Call.isIndirectCall()) {
+ Call.setMetadata(LLVMContext::MD_callee_type, nullptr);
+ Changed = true;
+ }
+
// Drop unnecessary kcfi operand bundles from calls that were converted
// into direct calls.
auto Bundle = Call.getOperandBundle(LLVMContext::OB_kcfi);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 9df0855..c90ff2a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
+#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
@@ -21,8 +22,10 @@
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
@@ -8222,6 +8225,98 @@ static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
return new FCmpInst(Pred, LHSI->getOperand(1), RHSC, "", &I);
}
+// Transform 'fptrunc(x) cmp C' to 'x cmp ext(C)' if possible.
+// Patterns include:
+// fptrunc(x) < C --> x < ext(C)
+// fptrunc(x) <= C --> x <= ext(C)
+// fptrunc(x) > C --> x > ext(C)
+// fptrunc(x) >= C --> x >= ext(C)
+// where 'ext(C)' is the extension of 'C' to the type of 'x', adjusted by a
+// small bias to account for the precision lost in the truncation.
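+//
+// For example (illustrative): 'fcmp olt (fptrunc double %x to float), C' can
+// be rewritten as 'fcmp olt double %x, M', where M lies between ext(C) and the
+// extension of the float adjacent to C, nudged so that both comparisons agree
+// for every %x.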
+static Instruction *foldFCmpFpTrunc(FCmpInst &I, const Instruction &FPTrunc,
+ const Constant &C) {
+ FCmpInst::Predicate Pred = I.getPredicate();
+ bool RoundDown = false;
+
+ if (Pred == FCmpInst::FCMP_OGE || Pred == FCmpInst::FCMP_UGE ||
+ Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_ULT)
+ RoundDown = true;
+ else if (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_UGT ||
+ Pred == FCmpInst::FCMP_OLE || Pred == FCmpInst::FCMP_ULE)
+ RoundDown = false;
+ else
+ return nullptr;
+
+ const APFloat *CValue;
+ if (!match(&C, m_APFloat(CValue)))
+ return nullptr;
+
+ if (CValue->isNaN() || CValue->isInfinity())
+ return nullptr;
+
+ auto ConvertFltSema = [](const APFloat &Src, const fltSemantics &Sema) {
+ bool LosesInfo;
+ APFloat Dest = Src;
+ Dest.convert(Sema, APFloat::rmNearestTiesToEven, &LosesInfo);
+ return Dest;
+ };
+
+ auto NextValue = [](const APFloat &Value, bool RoundDown) {
+ APFloat NextValue = Value;
+ NextValue.next(RoundDown);
+ return NextValue;
+ };
+
+ APFloat NextCValue = NextValue(*CValue, RoundDown);
+
+ Type *DestType = FPTrunc.getOperand(0)->getType();
+ const fltSemantics &DestFltSema =
+ DestType->getScalarType()->getFltSemantics();
+
+ APFloat ExtCValue = ConvertFltSema(*CValue, DestFltSema);
+ APFloat ExtNextCValue = ConvertFltSema(NextCValue, DestFltSema);
+
+  // When 'NextCValue' is infinity, use an imagined 'NextCValue' that equals
+ // 'CValue + bias' to avoid the infinity after conversion. The bias is
+ // estimated as 'CValue - PrevCValue', where 'PrevCValue' is the previous
+ // value of 'CValue'.
+ if (NextCValue.isInfinity()) {
+ APFloat PrevCValue = NextValue(*CValue, !RoundDown);
+ APFloat Bias = ConvertFltSema(*CValue - PrevCValue, DestFltSema);
+
+ ExtNextCValue = ExtCValue + Bias;
+ }
+
+ APFloat ExtMidValue =
+ scalbn(ExtCValue + ExtNextCValue, -1, APFloat::rmNearestTiesToEven);
+
+ const fltSemantics &SrcFltSema =
+ C.getType()->getScalarType()->getFltSemantics();
+
+ // 'MidValue' might be rounded to 'NextCValue'. Correct it here.
+ APFloat MidValue = ConvertFltSema(ExtMidValue, SrcFltSema);
+ if (MidValue != *CValue)
+ ExtMidValue.next(!RoundDown);
+
+  // Check whether 'ExtMidValue' is a valid result, since the assumption about
+  // the imagined 'NextCValue' might not hold for some float types.
+  // ppc_fp128 cannot pass this check when converting from the maximum float
+  // value, due to the APFloat implementation.
+ if (NextCValue.isInfinity()) {
+ // ExtMidValue --- narrowed ---> Finite
+ if (ConvertFltSema(ExtMidValue, SrcFltSema).isInfinity())
+ return nullptr;
+
+ // NextExtMidValue --- narrowed ---> Infinity
+ APFloat NextExtMidValue = NextValue(ExtMidValue, RoundDown);
+ if (ConvertFltSema(NextExtMidValue, SrcFltSema).isFinite())
+ return nullptr;
+ }
+
+ return new FCmpInst(Pred, FPTrunc.getOperand(0),
+ ConstantFP::get(DestType, ExtMidValue), "", &I);
+}
+
/// Optimize fabs(X) compared with zero.
static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombinerImpl &IC) {
Value *X;
@@ -8712,6 +8807,10 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
cast<LoadInst>(LHSI), GEP, GV, I))
return Res;
break;
+ case Instruction::FPTrunc:
+ if (Instruction *NV = foldFCmpFpTrunc(I, *LHSI, *RHSC))
+ return NV;
+ break;
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 503611a..e2a9255 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -219,18 +219,64 @@ Value *InstCombinerImpl::EmitGEPOffset(GEPOperator *GEP, bool RewriteGEP) {
Value *InstCombinerImpl::EmitGEPOffsets(ArrayRef<GEPOperator *> GEPs,
GEPNoWrapFlags NW, Type *IdxTy,
bool RewriteGEPs) {
- Value *Sum = nullptr;
- for (GEPOperator *GEP : reverse(GEPs)) {
- Value *Offset = EmitGEPOffset(GEP, RewriteGEPs);
- if (Offset->getType() != IdxTy)
- Offset = Builder.CreateVectorSplat(
- cast<VectorType>(IdxTy)->getElementCount(), Offset);
+ auto Add = [&](Value *Sum, Value *Offset) -> Value * {
if (Sum)
- Sum = Builder.CreateAdd(Sum, Offset, "", NW.hasNoUnsignedWrap(),
- NW.isInBounds());
+ return Builder.CreateAdd(Sum, Offset, "", NW.hasNoUnsignedWrap(),
+ NW.isInBounds());
else
- Sum = Offset;
+ return Offset;
+ };
+
+ Value *Sum = nullptr;
+ Value *OneUseSum = nullptr;
+ Value *OneUseBase = nullptr;
+ GEPNoWrapFlags OneUseFlags = GEPNoWrapFlags::all();
+ for (GEPOperator *GEP : reverse(GEPs)) {
+ Value *Offset;
+ {
+ // Expand the offset at the point of the previous GEP to enable rewriting.
+ // However, use the original insertion point for calculating Sum.
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ auto *Inst = dyn_cast<Instruction>(GEP);
+ if (RewriteGEPs && Inst)
+ Builder.SetInsertPoint(Inst);
+
+ Offset = llvm::emitGEPOffset(&Builder, DL, GEP);
+ if (Offset->getType() != IdxTy)
+ Offset = Builder.CreateVectorSplat(
+ cast<VectorType>(IdxTy)->getElementCount(), Offset);
+ if (GEP->hasOneUse()) {
+ // Offsets of one-use GEPs will be merged into the next multi-use GEP.
+ OneUseSum = Add(OneUseSum, Offset);
+ OneUseFlags = OneUseFlags.intersectForOffsetAdd(GEP->getNoWrapFlags());
+ if (!OneUseBase)
+ OneUseBase = GEP->getPointerOperand();
+ continue;
+ }
+
+ if (OneUseSum)
+ Offset = Add(OneUseSum, Offset);
+
+ // Rewrite the GEP to reuse the computed offset. This also includes
+ // offsets from preceding one-use GEPs.
+ if (RewriteGEPs && Inst &&
+ !(GEP->getSourceElementType()->isIntegerTy(8) &&
+ GEP->getOperand(1) == Offset)) {
+ replaceInstUsesWith(
+ *Inst,
+ Builder.CreatePtrAdd(
+ OneUseBase ? OneUseBase : GEP->getPointerOperand(), Offset, "",
+ OneUseFlags.intersectForOffsetAdd(GEP->getNoWrapFlags())));
+ eraseInstFromFunction(*Inst);
+ }
+ }
+
+ Sum = Add(Sum, Offset);
+ OneUseSum = OneUseBase = nullptr;
+ OneUseFlags = GEPNoWrapFlags::all();
}
+ if (OneUseSum)
+ Sum = Add(Sum, OneUseSum);
if (!Sum)
return Constant::getNullValue(IdxTy);
return Sum;
@@ -1417,10 +1463,8 @@ void InstCombinerImpl::freelyInvertAllUsersOf(Value *I, Value *IgnoredUser) {
}
// Update pre-existing debug value uses.
- SmallVector<DbgValueInst *, 4> DbgValues;
SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
- llvm::findDbgValues(DbgValues, I, &DbgVariableRecords);
- assert(DbgValues.empty());
+ llvm::findDbgValues(I, DbgVariableRecords);
for (DbgVariableRecord *DbgVal : DbgVariableRecords) {
SmallVector<uint64_t, 1> Ops = {dwarf::DW_OP_not};
@@ -3565,12 +3609,10 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
// If we are removing an alloca with a dbg.declare, insert dbg.value calls
// before each store.
- SmallVector<DbgVariableIntrinsic *, 8> DVIs;
SmallVector<DbgVariableRecord *, 8> DVRs;
std::unique_ptr<DIBuilder> DIB;
if (isa<AllocaInst>(MI)) {
- findDbgUsers(DVIs, &MI, &DVRs);
- assert(DVIs.empty());
+ findDbgUsers(&MI, DVRs);
DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
}
@@ -3692,9 +3734,6 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
//
// FIXME: the Assignment Tracking project has now likely made this
// redundant (and it's sometimes harmful).
- for (auto *DVI : DVIs)
- if (DVI->isAddressOfVariable() || DVI->getExpression()->startsWithDeref())
- DVI->eraseFromParent();
for (auto *DVR : DVRs)
if (DVR->isAddressOfVariable() || DVR->getExpression()->startsWithDeref())
DVR->eraseFromParent();
@@ -5246,10 +5285,8 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I,
// maximise the range variables have location for. If we cannot salvage, then
// mark the location undef: we know it was supposed to receive a new location
// here, but that computation has been sunk.
- SmallVector<DbgVariableIntrinsic *, 2> DbgUsers;
SmallVector<DbgVariableRecord *, 2> DbgVariableRecords;
- findDbgUsers(DbgUsers, I, &DbgVariableRecords);
- assert(DbgUsers.empty());
+ findDbgUsers(I, DbgVariableRecords);
if (!DbgVariableRecords.empty())
tryToSinkInstructionDbgVariableRecords(I, InsertPos, SrcBlock, DestBlock,
DbgVariableRecords);
@@ -5376,7 +5413,7 @@ void InstCombinerImpl::tryToSinkInstructionDbgVariableRecords(
if (DVRClones.empty())
return;
- salvageDebugInfoForDbgValues(*I, {}, DbgVariableRecordsToSalvage);
+ salvageDebugInfoForDbgValues(*I, DbgVariableRecordsToSalvage);
// The clones are in reverse order of original appearance. Assert that the
// head bit is set on the iterator as we _should_ have received it via
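The InstCombine hunks above are largely a mechanical migration from the old two-container debug-user lookup (intrinsic users plus DbgVariableRecord users, with an assert that the intrinsic list stays empty) to the record-only overloads of findDbgValues/findDbgUsers. A minimal sketch of the new call shape, illustrative only, assuming "using namespace llvm" and the usual SmallVector.h/DebugInfo.h includes (the helper name is made up):

  static void killDebugUsersOf(Instruction *I) {
    SmallVector<DbgVariableRecord *, 4> DVRs;
    findDbgUsers(I, DVRs); // no parallel intrinsic vector to assert empty anymore
    for (DbgVariableRecord *DVR : DVRs)
      DVR->setKillLocation(); // callers salvage, rewrite or drop as needed
  }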
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 5957940..fbaa651 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -3637,6 +3637,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
"Variable descriptions relative to ASan stack base will be dropped");
// Replace Alloca instructions with base+offset.
+ SmallVector<Value *> NewAllocaPtrs;
for (const auto &Desc : SVD) {
AllocaInst *AI = Desc.AI;
replaceDbgDeclare(AI, LocalStackBaseAllocaPtr, DIB, DIExprFlags,
@@ -3645,6 +3646,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
AI->getType());
AI->replaceAllUsesWith(NewAllocaPtr);
+ NewAllocaPtrs.push_back(NewAllocaPtr);
}
// The left-most redzone has enough space for at least 4 pointers.
@@ -3694,6 +3696,15 @@ void FunctionStackPoisoner::processStaticAllocas() {
}
}
+ // Remove lifetime markers now that these are no longer allocas.
+ for (Value *NewAllocaPtr : NewAllocaPtrs) {
+ for (User *U : make_early_inc_range(NewAllocaPtr->users())) {
+ auto *I = cast<Instruction>(U);
+ if (I->isLifetimeStartOrEnd())
+ I->eraseFromParent();
+ }
+ }
+
SmallVector<uint8_t, 64> ShadowClean(ShadowAfterScope.size(), 0);
SmallVector<uint8_t, 64> ShadowAfterReturn;
@@ -3829,6 +3840,13 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType());
+ // Remove lifetime markers now that this is no longer an alloca.
+ for (User *U : make_early_inc_range(AI->users())) {
+ auto *I = cast<Instruction>(U);
+ if (I->isLifetimeStartOrEnd())
+ I->eraseFromParent();
+ }
+
// Replace all uses of AddressReturnedByAlloca with NewAddressPtr.
AI->replaceAllUsesWith(NewAddressPtr);
diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
index 55f3239..2486e77 100644
--- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
+++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
@@ -72,7 +72,7 @@ static void emitRemark(IntrinsicInst *II, OptimizationRemarkEmitter &ORE,
}
}
-static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
+static bool lowerAllowChecks(Function &F, const BlockFrequencyInfo &BFI,
const ProfileSummaryInfo *PSI,
OptimizationRemarkEmitter &ORE,
const LowerAllowCheckPass::Options &Opts) {
@@ -160,7 +160,7 @@ PreservedAnalyses LowerAllowCheckPass::run(Function &F,
OptimizationRemarkEmitter &ORE =
AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- return removeUbsanTraps(F, BFI, PSI, ORE, Opts)
+ return lowerAllowChecks(F, BFI, PSI, ORE, Opts)
// We do not change the CFG, we only replace the intrinsics with
// true or false.
? PreservedAnalyses::none().preserveSet<CFGAnalyses>()
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index e5b357f..a9a0731 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -361,6 +361,131 @@ static void addVPMetadata(Module &M, Instruction &I,
}
}
+static void
+handleAllocSite(Instruction &I, CallBase *CI,
+ ArrayRef<uint64_t> InlinedCallStack, LLVMContext &Ctx,
+ OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize,
+ const std::set<const AllocationInfo *> &AllocInfoSet,
+ std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo>
+ &FullStackIdToAllocMatchInfo) {
+ // We may match this instruction's location list to multiple MIB
+ // contexts. Add them to a Trie specialized for trimming the contexts to
+ // the minimal needed to disambiguate contexts with unique behavior.
+ CallStackTrie AllocTrie(&ORE, MaxColdSize);
+ uint64_t TotalSize = 0;
+ uint64_t TotalColdSize = 0;
+ for (auto *AllocInfo : AllocInfoSet) {
+ // Check the full inlined call stack against this one.
+ // If we found and thus matched all frames on the call, include
+ // this MIB.
+ if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
+ InlinedCallStack)) {
+ NumOfMemProfMatchedAllocContexts++;
+ uint64_t FullStackId = 0;
+ if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis())
+ FullStackId = computeFullStackId(AllocInfo->CallStack);
+ auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
+ TotalSize += AllocInfo->Info.getTotalSize();
+ if (AllocType == AllocationType::Cold)
+ TotalColdSize += AllocInfo->Info.getTotalSize();
+ // Record information about the allocation if match info printing
+ // was requested.
+ if (ClPrintMemProfMatchInfo) {
+ assert(FullStackId != 0);
+ FullStackIdToAllocMatchInfo[std::make_pair(FullStackId,
+ InlinedCallStack.size())] = {
+ AllocInfo->Info.getTotalSize(), AllocType};
+ }
+ }
+ }
+ // If the threshold for the percent of cold bytes is less than 100%,
+ // and not all bytes are cold, see if we should still hint this
+ // allocation as cold without context sensitivity.
+ if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 &&
+ TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) {
+ AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold, "dominant");
+ return;
+ }
+
+ // We might not have matched any to the full inlined call stack.
+ // But if we did, create and attach metadata, or a function attribute if
+ // all contexts have identical profiled behavior.
+ if (!AllocTrie.empty()) {
+ NumOfMemProfMatchedAllocs++;
+ // MemprofMDAttached will be false if a function attribute was
+ // attached.
+ bool MemprofMDAttached = AllocTrie.buildAndAttachMIBMetadata(CI);
+ assert(MemprofMDAttached == I.hasMetadata(LLVMContext::MD_memprof));
+ if (MemprofMDAttached) {
+ // Add callsite metadata for the instruction's location list so that
+ // it simpler later on to identify which part of the MIB contexts
+ // are from this particular instruction (including during inlining,
+ // when the callsite metadata will be updated appropriately).
+ // FIXME: can this be changed to strip out the matching stack
+ // context ids from the MIB contexts and not add any callsite
+ // metadata here to save space?
+ addCallsiteMetadata(I, InlinedCallStack, Ctx);
+ }
+ }
+}
+
+// Helper struct for maintaining refs to callsite data. As an alternative we
+// could store a pointer to the CallSiteInfo struct but we also need the frame
+// index. Using ArrayRefs instead makes it a little easier to read.
+struct CallSiteEntry {
+ // Subset of frames for the corresponding CallSiteInfo.
+ ArrayRef<Frame> Frames;
+ // Potential targets for indirect calls.
+ ArrayRef<GlobalValue::GUID> CalleeGuids;
+
+ // Only compare Frame contents.
+ // Use pointer-based equality instead of ArrayRef's operator== which does
+ // element-wise comparison. We want to check if it's the same slice of the
+ // underlying array, not just equivalent content.
+ bool operator==(const CallSiteEntry &Other) const {
+ return Frames.data() == Other.Frames.data() &&
+ Frames.size() == Other.Frames.size();
+ }
+};
+
+struct CallSiteEntryHash {
+ size_t operator()(const CallSiteEntry &Entry) const {
+ return computeFullStackId(Entry.Frames);
+ }
+};
+
+static void handleCallSite(
+ Instruction &I, const Function *CalledFunction,
+ ArrayRef<uint64_t> InlinedCallStack,
+ const std::unordered_set<CallSiteEntry, CallSiteEntryHash> &CallSiteEntries,
+ Module &M, std::set<std::vector<uint64_t>> &MatchedCallSites) {
+ auto &Ctx = M.getContext();
+ for (const auto &CallSiteEntry : CallSiteEntries) {
+ // If we found and thus matched all frames on the call, create and
+ // attach call stack metadata.
+ if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames,
+ InlinedCallStack)) {
+ NumOfMemProfMatchedCallSites++;
+ addCallsiteMetadata(I, InlinedCallStack, Ctx);
+
+ // Try to attach indirect call metadata if possible.
+ if (!CalledFunction)
+ addVPMetadata(M, I, CallSiteEntry.CalleeGuids);
+
+ // Only need to find one with a matching call stack and add a single
+ // callsite metadata.
+
+ // Accumulate call site matching information upon request.
+ if (ClPrintMemProfMatchInfo) {
+ std::vector<uint64_t> CallStack;
+ append_range(CallStack, InlinedCallStack);
+ MatchedCallSites.insert(std::move(CallStack));
+ }
+ break;
+ }
+ }
+}
+
static void readMemprof(Module &M, Function &F,
IndexedInstrProfReader *MemProfReader,
const TargetLibraryInfo &TLI,
@@ -431,31 +556,6 @@ static void readMemprof(Module &M, Function &F,
// (allocation info and the callsites).
std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo;
- // Helper struct for maintaining refs to callsite data. As an alternative we
- // could store a pointer to the CallSiteInfo struct but we also need the frame
- // index. Using ArrayRefs instead makes it a little easier to read.
- struct CallSiteEntry {
- // Subset of frames for the corresponding CallSiteInfo.
- ArrayRef<Frame> Frames;
- // Potential targets for indirect calls.
- ArrayRef<GlobalValue::GUID> CalleeGuids;
-
- // Only compare Frame contents.
- // Use pointer-based equality instead of ArrayRef's operator== which does
- // element-wise comparison. We want to check if it's the same slice of the
- // underlying array, not just equivalent content.
- bool operator==(const CallSiteEntry &Other) const {
- return Frames.data() == Other.Frames.data() &&
- Frames.size() == Other.Frames.size();
- }
- };
-
- struct CallSiteEntryHash {
- size_t operator()(const CallSiteEntry &Entry) const {
- return computeFullStackId(Entry.Frames);
- }
- };
-
// For the callsites we need to record slices of the frame array (see comments
// below where the map entries are added) along with their CalleeGuids.
std::map<uint64_t, std::unordered_set<CallSiteEntry, CallSiteEntryHash>>
@@ -553,100 +653,15 @@ static void readMemprof(Module &M, Function &F,
// allocation context with the same leaf.
if (AllocInfoIter != LocHashToAllocInfo.end() &&
// Only consider allocations which support hinting.
- isAllocationWithHotColdVariant(CI->getCalledFunction(), TLI)) {
- // We may match this instruction's location list to multiple MIB
- // contexts. Add them to a Trie specialized for trimming the contexts to
- // the minimal needed to disambiguate contexts with unique behavior.
- CallStackTrie AllocTrie(&ORE, MaxColdSize);
- uint64_t TotalSize = 0;
- uint64_t TotalColdSize = 0;
- for (auto *AllocInfo : AllocInfoIter->second) {
- // Check the full inlined call stack against this one.
- // If we found and thus matched all frames on the call, include
- // this MIB.
- if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
- InlinedCallStack)) {
- NumOfMemProfMatchedAllocContexts++;
- uint64_t FullStackId = 0;
- if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis())
- FullStackId = computeFullStackId(AllocInfo->CallStack);
- auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
- TotalSize += AllocInfo->Info.getTotalSize();
- if (AllocType == AllocationType::Cold)
- TotalColdSize += AllocInfo->Info.getTotalSize();
- // Record information about the allocation if match info printing
- // was requested.
- if (ClPrintMemProfMatchInfo) {
- assert(FullStackId != 0);
- FullStackIdToAllocMatchInfo[std::make_pair(
- FullStackId, InlinedCallStack.size())] = {
- AllocInfo->Info.getTotalSize(), AllocType};
- }
- }
- }
- // If the threshold for the percent of cold bytes is less than 100%,
- // and not all bytes are cold, see if we should still hint this
- // allocation as cold without context sensitivity.
- if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 &&
- TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) {
- AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold,
- "dominant");
- continue;
- }
-
- // We might not have matched any to the full inlined call stack.
- // But if we did, create and attach metadata, or a function attribute if
- // all contexts have identical profiled behavior.
- if (!AllocTrie.empty()) {
- NumOfMemProfMatchedAllocs++;
- // MemprofMDAttached will be false if a function attribute was
- // attached.
- bool MemprofMDAttached = AllocTrie.buildAndAttachMIBMetadata(CI);
- assert(MemprofMDAttached == I.hasMetadata(LLVMContext::MD_memprof));
- if (MemprofMDAttached) {
- // Add callsite metadata for the instruction's location list so that
- // it simpler later on to identify which part of the MIB contexts
- // are from this particular instruction (including during inlining,
- // when the callsite metadata will be updated appropriately).
- // FIXME: can this be changed to strip out the matching stack
- // context ids from the MIB contexts and not add any callsite
- // metadata here to save space?
- addCallsiteMetadata(I, InlinedCallStack, Ctx);
- }
- }
- continue;
- }
-
- if (CallSitesIter == LocHashToCallSites.end())
- continue;
-
- // Otherwise, add callsite metadata. If we reach here then we found the
- // instruction's leaf location in the callsites map and not the allocation
- // map.
- for (const auto &CallSiteEntry : CallSitesIter->second) {
- // If we found and thus matched all frames on the call, create and
- // attach call stack metadata.
- if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames,
- InlinedCallStack)) {
- NumOfMemProfMatchedCallSites++;
- addCallsiteMetadata(I, InlinedCallStack, Ctx);
-
- // Try to attach indirect call metadata if possible.
- if (!CalledFunction)
- addVPMetadata(M, I, CallSiteEntry.CalleeGuids);
-
- // Only need to find one with a matching call stack and add a single
- // callsite metadata.
-
- // Accumulate call site matching information upon request.
- if (ClPrintMemProfMatchInfo) {
- std::vector<uint64_t> CallStack;
- append_range(CallStack, InlinedCallStack);
- MatchedCallSites.insert(std::move(CallStack));
- }
- break;
- }
- }
+ isAllocationWithHotColdVariant(CI->getCalledFunction(), TLI))
+ handleAllocSite(I, CI, InlinedCallStack, Ctx, ORE, MaxColdSize,
+ AllocInfoIter->second, FullStackIdToAllocMatchInfo);
+ else if (CallSitesIter != LocHashToCallSites.end())
+ // Otherwise, add callsite metadata. If we reach here then we found the
+ // instruction's leaf location in the callsites map and not the
+ // allocation map.
+ handleCallSite(I, CalledFunction, InlinedCallStack,
+ CallSitesIter->second, M, MatchedCallSites);
}
}
}
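The CallSiteEntry/CallSiteEntryHash pair hoisted to file scope above keys an unordered_set on a slice of the profile's frame array and deliberately compares slices by identity (same data pointer and length) rather than element-wise. A self-contained sketch of the same idea in plain C++, illustrative only (generic int elements; the real code hashes the frame contents via computeFullStackId, which is just as consistent with identity equality):

  #include "llvm/ADT/ArrayRef.h"
  #include <cstddef>
  #include <functional>
  #include <unordered_set>

  struct SliceKey {
    llvm::ArrayRef<int> Frames;
    bool operator==(const SliceKey &O) const {
      // Same underlying array slice, not element-wise equality.
      return Frames.data() == O.Frames.data() && Frames.size() == O.Frames.size();
    }
  };
  struct SliceKeyHash {
    std::size_t operator()(const SliceKey &K) const {
      // Any hash that is stable for a given slice is valid; hashing the
      // identity keeps the sketch short.
      return std::hash<const int *>()(K.Frames.data()) ^ K.Frames.size();
    }
  };
  using SliceSet = std::unordered_set<SliceKey, SliceKeyHash>;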
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index df31602..1ddb8ae 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -1486,10 +1486,8 @@ static bool checkAndReplaceCondition(
// Update the debug value records that satisfy the same condition used
// in replaceUsesWithIf.
- SmallVector<DbgVariableIntrinsic *> DbgUsers;
SmallVector<DbgVariableRecord *> DVRUsers;
- findDbgUsers(DbgUsers, Cmp, &DVRUsers);
- assert(DbgUsers.empty());
+ findDbgUsers(Cmp, DVRUsers);
for (auto *DVR : DVRUsers) {
auto *DTN = DT.getNode(DVR->getParent());
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 66836ef..85ee824 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -430,6 +430,8 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II,
}
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end: {
+ // Always force lifetime markers to work directly on the alloca.
+ NewV = NewV->stripPointerCasts();
Function *NewDecl = Intrinsic::getOrInsertDeclaration(
M, II->getIntrinsicID(), {NewV->getType()});
II->setArgOperand(1, NewV);
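For reference, a one-line sketch of what the added stripPointerCasts call buys, assuming "using namespace llvm" (illustrative only, not part of the patch): the rewritten pointer may still be wrapped in addrspacecasts or no-op GEPs, and stripping those lets the rebuilt lifetime intrinsic refer to the underlying alloca directly.

  static Value *lifetimeObjectFor(Value *RewrittenPtr) {
    // Walks through bitcasts, all-zero GEPs and addrspacecasts.
    return RewrittenPtr->stripPointerCasts();
  }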
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 4d1f4407..c2a737d 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1960,7 +1960,6 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB,
// PHI insertion, of which we are prepared to do, clean these up now.
SSAUpdater SSAUpdate;
SmallVector<Use *, 16> UsesToRename;
- SmallVector<DbgValueInst *, 4> DbgValues;
SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
for (Instruction &I : *BB) {
@@ -1978,8 +1977,7 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB,
}
// Find debug values outside of the block
- findDbgValues(DbgValues, &I, &DbgVariableRecords);
- assert(DbgValues.empty());
+ findDbgValues(&I, DbgVariableRecords);
llvm::erase_if(DbgVariableRecords, [&](const DbgVariableRecord *DbgVarRec) {
return DbgVarRec->getParent() == BB;
});
@@ -2000,7 +1998,6 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB,
SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
if (!DbgVariableRecords.empty()) {
SSAUpdate.UpdateDebugValues(&I, DbgVariableRecords);
- DbgValues.clear();
DbgVariableRecords.clear();
}
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 221094f..b9546c5 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -128,6 +128,8 @@ private:
// from any other block. So this variable set to true means that loop's latch
// has become unreachable from loop header.
bool DeleteCurrentLoop = false;
+ // Whether or not we enter the loop through an indirectbr.
+ bool HasIndirectEntry = false;
// The blocks of the original loop that will still be reachable from entry
// after the constant folding.
@@ -216,6 +218,19 @@ private:
return;
}
+ // We need a loop preheader to split in handleDeadExits(). If LoopSimplify
+ // wasn't able to form one because the loop can be entered through an
+ // indirectbr we cannot continue.
+ if (!L.getLoopPreheader()) {
+ assert(any_of(predecessors(L.getHeader()),
+ [&](BasicBlock *Pred) {
+ return isa<IndirectBrInst>(Pred->getTerminator());
+ }) &&
+ "Loop should have preheader if it is not entered indirectly");
+ HasIndirectEntry = true;
+ return;
+ }
+
// Collect live and dead loop blocks and exits.
LiveLoopBlocks.insert(L.getHeader());
for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
@@ -546,6 +561,12 @@ public:
return false;
}
+ if (HasIndirectEntry) {
+ LLVM_DEBUG(dbgs() << "Loops which can be entered indirectly are not"
+ " supported!\n");
+ return false;
+ }
+
// Nothing to constant-fold.
if (FoldCandidates.empty()) {
LLVM_DEBUG(
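A compact sketch of the new bail-out condition in LoopSimplifyCFG, written as a free function for illustration only (assuming "using namespace llvm" and the LoopInfo/CFG headers; the pass itself records the result in HasIndirectEntry):

  static bool isEnteredIndirectly(Loop &L) {
    if (L.getLoopPreheader())
      return false; // LoopSimplify formed a preheader; handleDeadExits can split it.
    // A missing preheader is only expected when some predecessor of the header
    // ends in an indirectbr, which LoopSimplify cannot split.
    return any_of(predecessors(L.getHeader()), [](BasicBlock *Pred) {
      return isa<IndirectBrInst>(Pred->getTerminator());
    });
  }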
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 7eeaaa0..6a3f656 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -82,6 +82,7 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
@@ -3044,6 +3045,7 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
if (isInstructionTriviallyDead(&I, TLI)) {
InstrDFS[&I] = 0;
LLVM_DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ salvageDebugInfo(I);
markInstructionForDeletion(&I);
continue;
}
@@ -4076,6 +4078,12 @@ bool NewGVN::eliminateInstructions(Function &F) {
if (!match(DefI, m_Intrinsic<Intrinsic::ssa_copy>()))
patchReplacementInstruction(DefI, DominatingLeader);
+ SmallVector<DbgVariableRecord *> DVRUsers;
+ findDbgUsers(DefI, DVRUsers);
+
+ for (auto *DVR : DVRUsers)
+ DVR->replaceVariableLocationOp(DefI, DominatingLeader);
+
markInstructionForDeletion(DefI);
}
}
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 820c8e1..ced61cb 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -1105,7 +1105,9 @@ bool ScalarizerVisitor::visitExtractValueInst(ExtractValueInst &EVI) {
Res.push_back(ResElem);
}
- gather(&EVI, Res, *VS);
+ Type *ActualVecType = cast<FixedVectorType>(OpTy->getContainedType(Index));
+ std::optional<VectorSplit> AVS = getVectorSplit(ActualVecType);
+ gather(&EVI, Res, *AVS);
return true;
}
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 1d1af42..7a9dd37 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1219,10 +1219,8 @@ void CodeExtractor::calculateNewCallTerminatorWeights(
/// \p F.
static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) {
for (Instruction &I : instructions(F)) {
- SmallVector<DbgVariableIntrinsic *, 4> DbgUsers;
SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
- findDbgUsers(DbgUsers, &I, &DbgVariableRecords);
- assert(DbgUsers.empty());
+ findDbgUsers(&I, DbgVariableRecords);
for (DbgVariableRecord *DVR : DbgVariableRecords)
if (DVR->getFunction() != &F)
DVR->eraseFromParent();
@@ -1284,10 +1282,8 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
NewFunc.getEntryBlock().getTerminator()->getIterator());
};
for (auto [Input, NewVal] : zip_equal(Inputs, NewValues)) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
SmallVector<DbgVariableRecord *, 1> DPUsers;
- findDbgUsers(DbgUsers, Input, &DPUsers);
- assert(DbgUsers.empty());
+ findDbgUsers(Input, DPUsers);
DIExpression *Expr = DIB.createExpression();
// Iterate the debug users of the Input values. If they are in the extracted
diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index c3c3cdf..8d18c75 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -243,26 +243,10 @@ formLCSSAForInstructionsImpl(SmallVectorImpl<Instruction *> &Worklist,
SSAUpdate.RewriteUse(*UseToRewrite);
}
- SmallVector<DbgValueInst *, 4> DbgValues;
SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
- llvm::findDbgValues(DbgValues, I, &DbgVariableRecords);
+ llvm::findDbgValues(I, DbgVariableRecords);
// Update pre-existing debug value uses that reside outside the loop.
- for (auto *DVI : DbgValues) {
- BasicBlock *UserBB = DVI->getParent();
- if (InstBB == UserBB || L->contains(UserBB))
- continue;
- // We currently only handle debug values residing in blocks that were
- // traversed while rewriting the uses. If we inserted just a single PHI,
- // we will handle all relevant debug values.
- Value *V = AddedPHIs.size() == 1 ? AddedPHIs[0]
- : SSAUpdate.FindValueForBlock(UserBB);
- if (V)
- DVI->replaceVariableLocationOp(I, V);
- }
-
- // RemoveDIs: copy-paste of block above, using non-instruction debug-info
- // records.
for (DbgVariableRecord *DVR : DbgVariableRecords) {
BasicBlock *UserBB = DVR->getMarker()->getParent();
if (InstBB == UserBB || L->contains(UserBB))
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index d481ad9..f89d36f 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -610,10 +610,8 @@ void llvm::RecursivelyDeleteTriviallyDeadInstructions(
}
bool llvm::replaceDbgUsesWithUndef(Instruction *I) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
SmallVector<DbgVariableRecord *, 1> DPUsers;
- findDbgUsers(DbgUsers, I, &DPUsers);
- assert(DbgUsers.empty());
+ findDbgUsers(I, DPUsers);
for (auto *DVR : DPUsers)
DVR->setKillLocation();
return !DPUsers.empty();
@@ -1603,10 +1601,8 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
// Since we can't guarantee that the original dbg.declare intrinsic
// is removed by LowerDbgDeclare(), we need to make sure that we are
// not inserting the same dbg.value intrinsic over and over.
- SmallVector<DbgValueInst *, 1> DbgValues;
SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;
- findDbgValues(DbgValues, APN, &DbgVariableRecords);
- assert(DbgValues.empty());
+ findDbgValues(APN, DbgVariableRecords);
for (DbgVariableRecord *DVR : DbgVariableRecords) {
assert(is_contained(DVR->location_ops(), APN));
if ((DVR->getVariable() == DIVar) && (DVR->getExpression() == DIExpr))
@@ -1987,10 +1983,8 @@ static void updateOneDbgValueForAlloca(const DebugLoc &Loc,
void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
DIBuilder &Builder, int Offset) {
- SmallVector<DbgValueInst *, 1> DbgUsers;
SmallVector<DbgVariableRecord *, 1> DPUsers;
- findDbgValues(DbgUsers, AI, &DPUsers);
- assert(DbgUsers.empty());
+ findDbgValues(AI, DPUsers);
// Replace any DbgVariableRecords that use this alloca.
for (DbgVariableRecord *DVR : DPUsers)
@@ -2002,11 +1996,9 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
/// Where possible to salvage debug information for \p I do so.
/// If not possible mark undef.
void llvm::salvageDebugInfo(Instruction &I) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
SmallVector<DbgVariableRecord *, 1> DPUsers;
- findDbgUsers(DbgUsers, &I, &DPUsers);
- assert(DbgUsers.empty());
- salvageDebugInfoForDbgValues(I, DbgUsers, DPUsers);
+ findDbgUsers(&I, DPUsers);
+ salvageDebugInfoForDbgValues(I, DPUsers);
}
template <typename T> static void salvageDbgAssignAddress(T *Assign) {
@@ -2044,9 +2036,8 @@ template <typename T> static void salvageDbgAssignAddress(T *Assign) {
}
}
-void llvm::salvageDebugInfoForDbgValues(
- Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers,
- ArrayRef<DbgVariableRecord *> DPUsers) {
+void llvm::salvageDebugInfoForDbgValues(Instruction &I,
+ ArrayRef<DbgVariableRecord *> DPUsers) {
// These are arbitrary chosen limits on the maximum number of values and the
// maximum size of a debug expression we can salvage up to, used for
// performance reasons.
@@ -2054,9 +2045,6 @@ void llvm::salvageDebugInfoForDbgValues(
const unsigned MaxExpressionSize = 128;
bool Salvaged = false;
- // We should never see debug intrinsics nowadays.
- assert(DbgUsers.empty());
-
for (auto *DVR : DPUsers) {
if (DVR->isDbgAssign()) {
if (DVR->getAddress() == &I) {
@@ -2343,16 +2331,11 @@ static bool rewriteDebugUsers(
Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT,
function_ref<DbgValReplacement(DbgVariableRecord &DVR)> RewriteDVRExpr) {
// Find debug users of From.
- SmallVector<DbgVariableIntrinsic *, 1> Users;
SmallVector<DbgVariableRecord *, 1> DPUsers;
- findDbgUsers(Users, &From, &DPUsers);
- if (Users.empty() && DPUsers.empty())
+ findDbgUsers(&From, DPUsers);
+ if (DPUsers.empty())
return false;
- // Ignore intrinsic-users: they are no longer supported and should never
- // appear.
- assert(Users.empty());
-
// Prevent use-before-def of To.
bool Changed = false;
@@ -3005,6 +2988,12 @@ static void combineMetadata(Instruction *K, const Instruction *J,
case LLVMContext::MD_memprof:
case LLVMContext::MD_callsite:
break;
+ case LLVMContext::MD_callee_type:
+ if (!AAOnly) {
+ K->setMetadata(LLVMContext::MD_callee_type,
+ MDNode::getMergedCalleeTypeMetadata(KMD, JMD));
+ }
+ break;
case LLVMContext::MD_align:
if (!AAOnly && (DoesKMove || !K->hasMetadata(LLVMContext::MD_noundef)))
K->setMetadata(
@@ -3350,10 +3339,8 @@ void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,
}
void llvm::dropDebugUsers(Instruction &I) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
SmallVector<DbgVariableRecord *, 1> DPUsers;
- findDbgUsers(DbgUsers, &I, &DPUsers);
- assert(DbgUsers.empty());
+ findDbgUsers(&I, DPUsers);
for (auto *DVR : DPUsers)
DVR->eraseFromParent();
}
@@ -3870,6 +3857,10 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
if (Op->isSwiftError())
return false;
+ // Cannot replace alloca argument with phi/select.
+ if (I->isLifetimeStartOrEnd())
+ return false;
+
// Early exit.
if (!isa<Constant, InlineAsm>(Op))
return true;
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 06115e0..7cc9ff8 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -158,10 +158,8 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
// intrinsics.
- SmallVector<DbgValueInst *, 1> DbgValues;
SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;
- llvm::findDbgValues(DbgValues, OrigHeaderVal, &DbgVariableRecords);
- assert(DbgValues.empty());
+ llvm::findDbgValues(OrigHeaderVal, DbgVariableRecords);
for (DbgVariableRecord *DVR : DbgVariableRecords) {
// The original users in the OrigHeader are already using the original
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 200d1fb..e7623aa 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -938,8 +938,10 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
case RecurKind::UMin:
return Intrinsic::vector_reduce_umin;
case RecurKind::FMax:
+ case RecurKind::FMaxNum:
return Intrinsic::vector_reduce_fmax;
case RecurKind::FMin:
+ case RecurKind::FMinNum:
return Intrinsic::vector_reduce_fmin;
case RecurKind::FMaximum:
return Intrinsic::vector_reduce_fmaximum;
@@ -1037,8 +1039,10 @@ Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(RecurKind RK) {
case RecurKind::SMax:
return Intrinsic::smax;
case RecurKind::FMin:
+ case RecurKind::FMinNum:
return Intrinsic::minnum;
case RecurKind::FMax:
+ case RecurKind::FMaxNum:
return Intrinsic::maxnum;
case RecurKind::FMinimum:
return Intrinsic::minimum;
@@ -1096,9 +1100,9 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
Value *Right) {
Type *Ty = Left->getType();
if (Ty->isIntOrIntVectorTy() ||
- (RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
+ (RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum ||
+ RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
RK == RecurKind::FMinimumNum || RK == RecurKind::FMaximumNum)) {
- // TODO: Add float minnum/maxnum support when FMF nnan is set.
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK);
return Builder.CreateIntrinsic(Ty, Id, {Left, Right}, nullptr,
"rdx.minmax");
@@ -1308,6 +1312,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
case RecurKind::UMin:
case RecurKind::FMax:
case RecurKind::FMin:
+ case RecurKind::FMinNum:
+ case RecurKind::FMaxNum:
case RecurKind::FMinimum:
case RecurKind::FMaximum:
case RecurKind::FMinimumNum:
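The LoopUtils changes above let the new FMaxNum/FMinNum recurrence kinds reuse the existing minnum/maxnum lowering. The NaN rule behind those intrinsics is what makes the extra vectorizer work below necessary: like libm fmax/fmin, a quiet NaN operand is dropped rather than propagated, so NaN inputs need dedicated handling (the VPlan transform added later in this patch exits the vector loop early when a NaN shows up). A standalone C++ illustration of the rule, not taken from the patch:

  #include <cassert>
  #include <cmath>

  int main() {
    double NaN = std::nan("");
    assert(std::fmax(NaN, 2.0) == 2.0);      // the NaN operand is ignored
    assert(std::fmax(2.0, NaN) == 2.0);
    assert(std::isnan(std::fmax(NaN, NaN))); // only all-NaN stays NaN
    return 0;
  }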
diff --git a/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp b/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp
index 8f55d7b..2743931 100644
--- a/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp
@@ -319,9 +319,9 @@ void MemoryOpRemark::visitVariable(const Value *V,
// If we find some information in the debug info, take that.
bool FoundDI = false;
- // Try to get an llvm.dbg.declare, which has a DILocalVariable giving us the
+ // Try to get a dbg.declare, which has a DILocalVariable giving us the
// real debug info name and size of the variable.
- auto FindDI = [&](const auto *DVI) {
+ auto FindDI = [&](const DbgVariableRecord *DVI) {
if (DILocalVariable *DILV = DVI->getVariable()) {
std::optional<uint64_t> DISize = getSizeInBytes(DILV->getSizeInBits());
VariableInfo Var{DILV->getName(), DISize};
@@ -331,7 +331,6 @@ void MemoryOpRemark::visitVariable(const Value *V,
}
}
};
- for_each(findDbgDeclares(const_cast<Value *>(V)), FindDI);
for_each(findDVRDeclares(const_cast<Value *>(V)), FindDI);
if (FoundDI) {
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 73b5f48..d96f1d6 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -243,10 +243,8 @@ struct AllocaInfo {
OnlyUsedInOneBlock = false;
}
}
- SmallVector<DbgVariableIntrinsic *> AllDbgUsers;
SmallVector<DbgVariableRecord *> AllDPUsers;
- findDbgUsers(AllDbgUsers, AI, &AllDPUsers);
- assert(AllDbgUsers.empty());
+ findDbgUsers(AI, AllDPUsers);
std::copy_if(AllDPUsers.begin(), AllDPUsers.end(),
std::back_inserter(DPUsers),
[](DbgVariableRecord *DVR) { return !DVR->isDbgAssign(); });
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 586874f..b9292af 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -19,7 +19,9 @@
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/NoFolder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
@@ -245,11 +247,43 @@ static Value *simplifyInstruction(SCCPSolver &Solver,
const APInt *RHSC;
// Remove masking operations.
if (match(&Inst, m_And(m_Value(X), m_LowBitMask(RHSC)))) {
- ConstantRange LRange = GetRange(Inst.getOperand(0));
+ ConstantRange LRange = GetRange(X);
if (LRange.getUnsignedMax().ule(*RHSC))
return X;
}
+ // Check if we can simplify [us]cmp(X, Y) to X - Y.
+ if (auto *Cmp = dyn_cast<CmpIntrinsic>(&Inst)) {
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
+ // Bail out on 1-bit comparisons.
+ if (BitWidth == 1)
+ return nullptr;
+ ConstantRange LRange = GetRange(LHS);
+ if (LRange.isSizeLargerThan(3))
+ return nullptr;
+ ConstantRange RRange = GetRange(RHS);
+ if (RRange.isSizeLargerThan(3))
+ return nullptr;
+ ConstantRange RHSLower = RRange.sub(APInt(BitWidth, 1));
+ ConstantRange RHSUpper = RRange.add(APInt(BitWidth, 1));
+ ICmpInst::Predicate Pred =
+ Cmp->isSigned() ? CmpInst::ICMP_SLE : CmpInst::ICMP_ULE;
+ if (!RHSLower.icmp(Pred, LRange) || !LRange.icmp(Pred, RHSUpper))
+ return nullptr;
+
+ IRBuilder<NoFolder> Builder(&Inst);
+ Value *Sub = Builder.CreateSub(LHS, RHS, Inst.getName(), /*HasNUW=*/false,
+ /*HasNSW=*/Cmp->isSigned());
+ InsertedValues.insert(Sub);
+ if (Sub->getType() != Inst.getType()) {
+ Sub = Builder.CreateSExtOrTrunc(Sub, Inst.getType());
+ InsertedValues.insert(Sub);
+ }
+ return Sub;
+ }
+
return nullptr;
}
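The new SCCP rule replaces ucmp/scmp with a plain subtraction when range analysis proves the operands are never more than one apart. A tiny scalar model in plain C++ showing why that is sound (illustrative only; names are made up):

  #include <cassert>

  static int scmp(int X, int Y) { return (X > Y) - (X < Y); } // -1, 0 or +1

  int main() {
    // Suppose the solver knows X is in [6, 7] and Y is in [6, 6]. Every pair
    // differs by at most 1, so the three-valued compare equals the difference
    // and scmp(X, Y) can be rewritten as X - Y (then sext/trunc to the result
    // type, as the code above does).
    for (int X = 6; X <= 7; ++X)
      for (int Y = 6; Y <= 6; ++Y)
        assert(scmp(X, Y) == X - Y);
    return 0;
  }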
diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index 561c898..49d0d95 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -197,10 +197,8 @@ void SSAUpdater::RewriteUse(Use &U) {
}
void SSAUpdater::UpdateDebugValues(Instruction *I) {
- SmallVector<DbgValueInst *, 4> DbgValues;
SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
- llvm::findDbgValues(DbgValues, I, &DbgVariableRecords);
- assert(DbgValues.empty());
+ llvm::findDbgValues(I, DbgVariableRecords);
for (auto &DVR : DbgVariableRecords) {
if (DVR->getParent() == I->getParent())
continue;
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 75c9650..94b0ab8 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2227,16 +2227,6 @@ static bool canSinkInstructions(
return I->getOperand(OI) == I0->getOperand(OI);
};
if (!all_of(Insts, SameAsI0)) {
- // SROA can't speculate lifetime markers of selects/phis, and the
- // backend may handle such lifetimes incorrectly as well (#104776).
- // Don't sink lifetimes if it would introduce a phi on the pointer
- // argument.
- if (isa<LifetimeIntrinsic>(I0) && OI == 1 &&
- any_of(Insts, [](const Instruction *I) {
- return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts());
- }))
- return false;
-
if ((isa<Constant>(Op) && !replacingOperandWithVariableIsCheap(I0, OI)) ||
!canReplaceOperandWithVariable(I0, OI))
// We can't create a PHI from this GEP.
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index 7ba95e2..8d8a60b 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -987,6 +987,13 @@ void Mapper::remapInstruction(Instruction *I) {
"Referenced value not in value map!");
}
+ // Drop callee_type metadata from calls that were remapped
+ // into a direct call from an indirect one.
+ if (auto *CB = dyn_cast<CallBase>(I)) {
+ if (CB->getMetadata(LLVMContext::MD_callee_type) && !CB->isIndirectCall())
+ CB->setMetadata(LLVMContext::MD_callee_type, nullptr);
+ }
+
// Remap phi nodes' incoming blocks.
if (PHINode *PN = dyn_cast<PHINode>(I)) {
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 1185385..f57ce0c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -230,7 +230,6 @@ public:
/// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A
/// and \p B.
- /// TODO: add createFCmp when needed.
VPInstruction *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "") {
@@ -240,6 +239,17 @@ public:
new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name));
}
+ /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A
+ /// and \p B.
+ VPInstruction *createFCmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
+ DebugLoc DL = DebugLoc::getUnknown(),
+ const Twine &Name = "") {
+ assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE &&
+ Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate");
+ return tryInsertInstruction(
+ new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name));
+ }
+
VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset,
DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "") {
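The new createFCmp mirrors createICmp; its first in-tree use (in the VPlan transform later in this patch) builds a NaN test by comparing a value with itself under the unordered predicate, which is true exactly when the operand is NaN. A minimal usage sketch, assuming "using namespace llvm" and the VPlan headers (helper name invented):

  static VPValue *createIsNaN(VPBuilder &Builder, VPValue *X) {
    // FCMP_UNO X, X <=> isnan(X)
    return Builder.createFCmp(CmpInst::FCMP_UNO, X, X);
  }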
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f142e07..46bc26c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1354,9 +1354,10 @@ public:
ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
ForceTailFoldingStyle.getValue()};
- if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
+ if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
+ ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
return;
- // Override forced styles if needed.
+ // Override EVL styles if needed.
// FIXME: Investigate opportunity for fixed vector factor.
bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
@@ -4361,10 +4362,14 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
ElementCount VF) const {
- // Cross iteration phis such as reductions need special handling and are
- // currently unsupported.
- if (any_of(OrigLoop->getHeader()->phis(),
- [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
+ // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
+ // reductions need special handling and are currently unsupported.
+ if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
+ if (!Legal->isReductionVariable(&Phi))
+ return Legal->isFixedOrderRecurrence(&Phi);
+ RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
+ return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
+ }))
return false;
// Phis with uses outside of the loop require special handling and are
@@ -4475,6 +4480,28 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
Type *TCType = Legal->getWidestInductionType();
const SCEV *RemainingIterations = nullptr;
unsigned MaxTripCount = 0;
+ if (MainLoopVF.isFixed()) {
+ // TODO: extend to support scalable VFs.
+ const SCEV *TC = vputils::getSCEVExprForVPValue(
+ getPlanFor(MainLoopVF).getTripCount(), SE);
+ assert(!isa<SCEVCouldNotCompute>(TC) &&
+ "Trip count SCEV must be computable");
+ RemainingIterations = SE.getURemExpr(
+ TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
+
+ // No iterations left to process in the epilogue.
+ if (RemainingIterations->isZero())
+ return Result;
+
+ MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
+ if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
+ SE.getConstant(TCType, MaxTripCount))) {
+ MaxTripCount = SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
+ }
+ LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
+ << MaxTripCount << "\n");
+ }
+
for (auto &NextVF : ProfitableVFs) {
// Skip candidate VFs without a corresponding VPlan.
if (!hasPlanWithVF(NextVF.Width))
@@ -4492,24 +4519,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
// If NextVF is greater than the number of remaining iterations, the
// epilogue loop would be dead. Skip such factors.
- if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
- // TODO: extend to support scalable VFs.
- if (!RemainingIterations) {
- const SCEV *TC = vputils::getSCEVExprForVPValue(
- getPlanFor(NextVF.Width).getTripCount(), SE);
- assert(!isa<SCEVCouldNotCompute>(TC) &&
- "Trip count SCEV must be computable");
- RemainingIterations = SE.getURemExpr(
- TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
- MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
- if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
- SE.getConstant(TCType, MaxTripCount))) {
- MaxTripCount =
- SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
- }
- LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
- << MaxTripCount << "\n");
- }
+ if (RemainingIterations && !NextVF.Width.isScalable()) {
if (SE.isKnownPredicate(
CmpInst::ICMP_UGT,
SE.getConstant(TCType, NextVF.Width.getFixedValue()),
@@ -8787,6 +8797,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
+ // Apply mandatory transformation to handle FP maxnum/minnum reduction with
+ // NaNs if possible, bail out otherwise.
+ if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
+ *Plan))
+ return nullptr;
+
// Transform recipes to abstract recipes if it is legal and beneficial and
// clamp the range for better cost estimation.
// TODO: Enable following transform when the EVL-version of extended-reduction
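The epilogue-VF hunk above hoists the remaining-iteration computation out of the per-candidate loop: the count depends only on the trip count, the main VF and the interleave count, so it is identical for every candidate. A plain-C++ sketch of the arithmetic, illustrative only (fixed main VF assumed, as in the TODO):

  #include <cassert>
  #include <cstdint>

  static uint64_t epilogueIterations(uint64_t TripCount, uint64_t MainVF,
                                     uint64_t IC) {
    // Iterations left after the main loop ran with MainVF lanes, IC-way
    // interleaved; at most MainVF * IC - 1 can remain.
    return TripCount % (MainVF * IC);
  }

  int main() {
    assert(epilogueIterations(/*TripCount=*/100, /*MainVF=*/8, /*IC=*/2) == 4);
    assert(epilogueIterations(1000, 8, 2) <= 8 * 2 - 1);
    return 0;
  }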
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6ad5c601..0d0b342 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23202,6 +23202,8 @@ private:
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FMaxNum:
+ case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
case RecurKind::FMinimumNum:
case RecurKind::None:
@@ -23339,6 +23341,8 @@ private:
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FMaxNum:
+ case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
case RecurKind::FMinimumNum:
case RecurKind::None:
@@ -23441,6 +23445,8 @@ private:
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FMaxNum:
+ case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
case RecurKind::FMinimumNum:
case RecurKind::None:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 204268e..db40ce2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4188,13 +4188,11 @@ public:
return VPB;
}
- /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set
- /// to nullptr. If \p IsReplicator is true, the region is a replicate region.
- /// The returned block is owned by the VPlan and deleted once the VPlan is
- /// destroyed.
- VPRegionBlock *createVPRegionBlock(const std::string &Name = "",
- bool IsReplicator = false) {
- auto *VPB = new VPRegionBlock(Name, IsReplicator);
+ /// Create a new loop VPRegionBlock with \p Name and entry and exiting blocks
+ /// set to nullptr. The returned block is owned by the VPlan and deleted once
+ /// the VPlan is destroyed.
+ VPRegionBlock *createVPRegionBlock(const std::string &Name = "") {
+ auto *VPB = new VPRegionBlock(Name);
CreatedBlocks.push_back(VPB);
return VPB;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index ca8729a..3499e65 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -84,6 +84,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return ResTy;
}
case Instruction::ICmp:
+ case Instruction::FCmp:
case VPInstruction::ActiveLaneMask:
assert(inferScalarType(R->getOperand(0)) ==
inferScalarType(R->getOperand(1)) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 7fb5e82..194874a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -411,7 +411,7 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) {
// LatchExitVPB, taking care to preserve the original predecessor & successor
// order of blocks. Set region entry and exiting after both HeaderVPB and
// LatchVPBB have been disconnected from their predecessors/successors.
- auto *R = Plan.createVPRegionBlock("", false /*isReplicator*/);
+ auto *R = Plan.createVPRegionBlock();
VPBlockUtils::insertOnEdge(LatchVPBB, LatchExitVPB, R);
VPBlockUtils::disconnectBlocks(LatchVPBB, R);
VPBlockUtils::connectBlocks(PreheaderVPBB, R);
@@ -652,3 +652,164 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
}
}
+
+bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
+ auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
+ auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
+ RedPhiR->getBackedgeValue()->getDefiningRecipe());
+ if (!MinMaxR)
+ return nullptr;
+
+ auto *RepR = dyn_cast<VPReplicateRecipe>(MinMaxR);
+ if (!isa<VPWidenIntrinsicRecipe>(MinMaxR) &&
+ !(RepR && isa<IntrinsicInst>(RepR->getUnderlyingInstr())))
+ return nullptr;
+
+#ifndef NDEBUG
+ Intrinsic::ID RdxIntrinsicId =
+ RedPhiR->getRecurrenceKind() == RecurKind::FMaxNum ? Intrinsic::maxnum
+ : Intrinsic::minnum;
+ assert((isa<VPWidenIntrinsicRecipe>(MinMaxR) &&
+ cast<VPWidenIntrinsicRecipe>(MinMaxR)->getVectorIntrinsicID() ==
+ RdxIntrinsicId) ||
+ (RepR &&
+ cast<IntrinsicInst>(RepR->getUnderlyingInstr())->getIntrinsicID() ==
+ RdxIntrinsicId) &&
+ "Intrinsic did not match recurrence kind");
+#endif
+
+ if (MinMaxR->getOperand(0) == RedPhiR)
+ return MinMaxR->getOperand(1);
+
+ assert(MinMaxR->getOperand(1) == RedPhiR &&
+ "Reduction phi operand expected");
+ return MinMaxR->getOperand(0);
+ };
+
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ VPReductionPHIRecipe *RedPhiR = nullptr;
+ bool HasUnsupportedPhi = false;
+ for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
+ if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(&R))
+ continue;
+ auto *Cur = dyn_cast<VPReductionPHIRecipe>(&R);
+ if (!Cur) {
+ // TODO: Also support fixed-order recurrence phis.
+ HasUnsupportedPhi = true;
+ continue;
+ }
+ // For now, only a single reduction is supported.
+ // TODO: Support multiple MaxNum/MinNum reductions and other reductions.
+ if (RedPhiR)
+ return false;
+ if (Cur->getRecurrenceKind() != RecurKind::FMaxNum &&
+ Cur->getRecurrenceKind() != RecurKind::FMinNum) {
+ HasUnsupportedPhi = true;
+ continue;
+ }
+ RedPhiR = Cur;
+ }
+
+ if (!RedPhiR)
+ return true;
+
+ // We won't be able to resume execution in the scalar tail, if there are
+ // unsupported header phis or there is no scalar tail at all, due to
+ // tail-folding.
+ if (HasUnsupportedPhi || !Plan.hasScalarTail())
+ return false;
+
+ VPValue *MinMaxOp = GetMinMaxCompareValue(RedPhiR);
+ if (!MinMaxOp)
+ return false;
+
+ RecurKind RedPhiRK = RedPhiR->getRecurrenceKind();
+ assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) &&
+ "unsupported reduction");
+ (void)RedPhiRK;
+
+ /// Check if the vector loop of \p Plan can early exit and restart
+ /// execution of the last vector iteration in the scalar loop. This requires
+ /// all recipes up to the early exit point to be side-effect free as they are
+ /// re-executed. Currently we check that the loop is free of any recipe that
+ /// may write to memory. Expected to operate on an early VPlan w/o nested
+ /// regions.
+ for (VPBlockBase *VPB : vp_depth_first_shallow(
+ Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
+ auto *VPBB = cast<VPBasicBlock>(VPB);
+ for (auto &R : *VPBB) {
+ if (R.mayWriteToMemory() &&
+ !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+ return false;
+ }
+ }
+
+ VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
+ VPBuilder Builder(LatchVPBB->getTerminator());
+ auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
+ assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
+ "Unexpected terminator");
+ auto *IsLatchExitTaken =
+ Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
+ LatchExitingBranch->getOperand(1));
+
+ VPValue *IsNaN = Builder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp);
+ VPValue *AnyNaN = Builder.createNaryOp(VPInstruction::AnyOf, {IsNaN});
+ auto *AnyExitTaken =
+ Builder.createNaryOp(Instruction::Or, {AnyNaN, IsLatchExitTaken});
+ Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
+ LatchExitingBranch->eraseFromParent();
+
+ // If we exit early due to NaNs, compute the final reduction result based on
+ // the reduction phi at the beginning of the last vector iteration.
+ auto *RdxResult = find_singleton<VPSingleDefRecipe>(
+ RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * {
+ auto *VPI = dyn_cast<VPInstruction>(U);
+ if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult)
+ return VPI;
+ return nullptr;
+ });
+
+ auto *MiddleVPBB = Plan.getMiddleBlock();
+ Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->begin());
+ auto *NewSel =
+ Builder.createSelect(AnyNaN, RedPhiR, RdxResult->getOperand(1));
+ RdxResult->setOperand(1, NewSel);
+
+ auto *ScalarPH = Plan.getScalarPreheader();
+ // Update resume phis for inductions in the scalar preheader. If AnyNaN is
+ // true, resume from the start of the last vector iteration via the
+ // canonical IV, otherwise from the original value.
+ for (auto &R : ScalarPH->phis()) {
+ auto *ResumeR = cast<VPPhi>(&R);
+ VPValue *VecV = ResumeR->getOperand(0);
+ if (VecV == RdxResult)
+ continue;
+ if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) {
+ if (DerivedIV->getNumUsers() == 1 &&
+ DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) {
+ auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(),
+ &Plan.getVectorTripCount());
+ DerivedIV->moveAfter(&*Builder.getInsertPoint());
+ DerivedIV->setOperand(1, NewSel);
+ continue;
+ }
+ }
+ // Bail out and abandon the current, partially modified, VPlan if we
+ // encounter a resume phi that cannot be updated yet.
+ if (VecV != &Plan.getVectorTripCount()) {
+ LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with "
+ "FMaxNum/FMinNum reduction.\n");
+ return false;
+ }
+ auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), VecV);
+ ResumeR->setOperand(0, NewSel);
+ }
+
+ auto *MiddleTerm = MiddleVPBB->getTerminator();
+ Builder.setInsertPoint(MiddleTerm);
+ VPValue *MiddleCond = MiddleTerm->getOperand(0);
+ VPValue *NewCond = Builder.createAnd(MiddleCond, Builder.createNot(AnyNaN));
+ MiddleTerm->setOperand(0, NewCond);
+ return true;
+}
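handleMaxMinNumReductions above rewrites the VPlan so the vector loop also exits when any lane feeding the maxnum/minnum is NaN; the middle block then discards that iteration's partial result and the scalar loop resumes from the start of the interrupted vector iteration, where the original fmax/fmin NaN semantics apply. A greatly simplified scalar model of that control flow in plain C++, illustrative only (the real transform works on recipes: an unordered self-compare, AnyOf, and a rewritten BranchOnCond):

  #include <cmath>
  #include <cstddef>

  static double maxnumReduce(const double *A, std::size_t N, std::size_t VF) {
    double Red = -INFINITY; // simplified start value
    std::size_t I = 0;
    for (; I + VF <= N; I += VF) {        // "vector" loop
      bool AnyNaN = false;
      double ChunkRed = Red;
      for (std::size_t L = 0; L < VF; ++L) {
        AnyNaN |= std::isnan(A[I + L]);
        ChunkRed = std::fmax(ChunkRed, A[I + L]);
      }
      if (AnyNaN)
        break;          // abandon this chunk; re-run it in the scalar loop
      Red = ChunkRed;   // commit the chunk's reduction result
    }
    for (; I < N; ++I)  // scalar loop: tail and NaN fallback
      Red = std::fmax(Red, A[I]);
    return Red;
  }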
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1664bcc..1fbc3f3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -587,6 +587,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
return Builder.CreateFreeze(Op, Name);
}
+ case Instruction::FCmp:
case Instruction::ICmp: {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
@@ -860,7 +861,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *Res = State.get(getOperand(0));
for (VPValue *Op : drop_begin(operands()))
Res = Builder.CreateOr(Res, State.get(Op));
- return Builder.CreateOrReduce(Res);
+ return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
}
case VPInstruction::FirstActiveLane: {
if (getNumOperands() == 1) {
@@ -1033,6 +1034,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
switch (getOpcode()) {
case Instruction::ExtractElement:
case Instruction::Freeze:
+ case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select:
case VPInstruction::AnyOf:
@@ -1068,6 +1070,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
return Op == getOperand(1);
case Instruction::PHI:
return true;
+ case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select:
case Instruction::Or:
@@ -1100,6 +1103,7 @@ bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
switch (getOpcode()) {
default:
return false;
+ case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select:
return vputils::onlyFirstPartUsed(this);
@@ -1786,7 +1790,7 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
return Opcode == Instruction::ZExt;
break;
case OperationType::Cmp:
- return Opcode == Instruction::ICmp;
+ return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
case OperationType::Other:
return true;
}
@@ -3441,7 +3445,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
VPValue *BlockInMask = getMask();
VPValue *Addr = getAddr();
Value *ResAddr = State.get(Addr, VPLane(0));
- Value *PoisonVec = PoisonValue::get(VecTy);
auto CreateGroupMask = [&BlockInMask, &State,
&InterleaveFactor](Value *MaskForGaps) -> Value * {
@@ -3480,6 +3483,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
Instruction *NewLoad;
if (BlockInMask || MaskForGaps) {
Value *GroupMask = CreateGroupMask(MaskForGaps);
+ Value *PoisonVec = PoisonValue::get(VecTy);
NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
Group->getAlign(), GroupMask,
PoisonVec, "wide.masked.vec");
@@ -3489,57 +3493,39 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
Group->addMetadata(NewLoad);
ArrayRef<VPValue *> VPDefs = definedValues();
- const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
if (VecTy->isScalableTy()) {
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
assert(InterleaveFactor <= 8 &&
"Unsupported deinterleave factor for scalable vectors");
- Value *Deinterleave = State.Builder.CreateIntrinsic(
+ NewLoad = State.Builder.CreateIntrinsic(
getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
NewLoad,
/*FMFSource=*/nullptr, "strided.vec");
+ }
- for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
- Instruction *Member = Group->getMember(I);
- Value *StridedVec = State.Builder.CreateExtractValue(Deinterleave, I);
- if (!Member) {
- // This value is not needed as it's not used
- cast<Instruction>(StridedVec)->eraseFromParent();
- continue;
- }
- // If this member has different type, cast the result type.
- if (Member->getType() != ScalarTy) {
- VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
- StridedVec =
- createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
- }
-
- if (Group->isReverse())
- StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
-
- State.set(VPDefs[J], StridedVec);
- ++J;
- }
+ auto CreateStridedVector = [&InterleaveFactor, &State,
+ &NewLoad](unsigned Index) -> Value * {
+ assert(Index < InterleaveFactor && "Illegal group index");
+ if (State.VF.isScalable())
+ return State.Builder.CreateExtractValue(NewLoad, Index);
- return;
- }
- assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
+ // For fixed-length VF, use shuffle to extract the sub-vectors from the
+ // wide load.
+ auto StrideMask =
+ createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
+ return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
+ "strided.vec");
+ };
- // For each member in the group, shuffle out the appropriate data from the
- // wide loads.
- unsigned J = 0;
- for (unsigned I = 0; I < InterleaveFactor; ++I) {
+ for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
Instruction *Member = Group->getMember(I);
// Skip the gaps in the group.
if (!Member)
continue;
- auto StrideMask =
- createStrideMask(I, InterleaveFactor, State.VF.getFixedValue());
- Value *StridedVec =
- State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
+ Value *StridedVec = CreateStridedVector(I);
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
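The rewrite above merges the scalable-VF and fixed-VF per-member loops into one loop plus a small extraction lambda; only the way each strided sub-vector is produced differs. A standalone sketch of that extraction step, assuming NewLoad is the deinterleave intrinsic's aggregate result for scalable VF and the wide (masked) load itself for fixed VF:

#include "llvm/Analysis/VectorUtils.h" // createStrideMask
#include "llvm/IR/IRBuilder.h"
#include <cassert>

// Sketch of the CreateStridedVector lambda as a free function.
static llvm::Value *extractStridedMember(llvm::IRBuilderBase &B,
                                         llvm::Value *NewLoad, unsigned Index,
                                         unsigned InterleaveFactor,
                                         llvm::ElementCount VF) {
  assert(Index < InterleaveFactor && "Illegal group index");
  if (VF.isScalable())
    // The deinterleave intrinsic already split the load into sub-vectors.
    return B.CreateExtractValue(NewLoad, Index);
  // For a fixed VF, select every InterleaveFactor'th element starting at
  // Index from the wide load.
  return B.CreateShuffleVector(
      NewLoad,
      llvm::createStrideMask(Index, InterleaveFactor, VF.getFixedValue()),
      "strided.vec");
}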
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6a3b3e6..cb370fe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1481,9 +1481,9 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
// (BranchOnCond true).
auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
- if (all_of(
- Header->phis(),
- IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) {
+ if (all_of(Header->phis(),
+ IsaPred<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
+ VPFirstOrderRecurrencePHIRecipe>)) {
for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(&HeaderR);
HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue());
@@ -3275,10 +3275,13 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
}
auto *WideLoad = cast<VPWidenLoadRecipe>(R);
+ VPValue *PtrOp = WideLoad->getAddr();
+ if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
+ PtrOp = VecPtr->getOperand(0);
// Narrow wide load to uniform scalar load, as transformed VPlan will only
// process one original iteration.
- auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
- WideLoad->operands(), /*IsUniform*/ true,
+ auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
+ /*IsUniform*/ true,
/*Mask*/ nullptr, *WideLoad);
N->insertBefore(WideLoad);
return N;
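The new look-through matters because a VPWidenLoadRecipe's address may be a VPVectorPointerRecipe, while the narrowed, uniform scalar load only needs the base pointer that recipe was built from. A minimal sketch of the peeling step as a helper, assuming the VPlan.h recipe definitions used elsewhere in this file (the function name is illustrative):

// Return the address to use for the narrowed, uniform scalar load, looking
// through an optional VPVectorPointerRecipe to its base pointer operand.
static VPValue *getNarrowedLoadAddress(VPWidenLoadRecipe *WideLoad) {
  VPValue *PtrOp = WideLoad->getAddr();
  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
    PtrOp = VecPtr->getOperand(0);
  return PtrOp;
}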
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 84a1247..ab189f6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -103,6 +103,12 @@ struct VPlanTransforms {
/// not valid.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder);
+ /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If so,
+ /// try to update the vector loop to exit early if any input is NaN and resume
+ /// executing in the scalar loop to handle the NaNs there. Return false if
+ /// this attempt was unsuccessful.
+ static bool handleMaxMinNumReductions(VPlan &Plan);
+
/// Clear NSW/NUW flags from reduction instructions if necessary.
static void clearReductionWrapFlags(VPlan &Plan);
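handleMaxMinNumReductions reports failure through its return value, so a caller will typically need to fall back, for example by not using the plan for FMaxNum/FMinNum reductions. An illustrative call-site sketch, not part of this patch:

// Illustrative only: give up on this candidate VPlan if FMaxNum/FMinNum
// reductions cannot be given the early-exit-on-NaN treatment.
if (!VPlanTransforms::handleMaxMinNumReductions(*Plan))
  return nullptr; // hypothetical caller that hands back a plan pointer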
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fe8d74c..82adc34 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -115,7 +115,7 @@ private:
bool foldInsExtFNeg(Instruction &I);
bool foldInsExtBinop(Instruction &I);
bool foldInsExtVectorToShuffle(Instruction &I);
- bool foldBitOpOfBitcasts(Instruction &I);
+ bool foldBitOpOfCastops(Instruction &I);
bool foldBitcastShuffle(Instruction &I);
bool scalarizeOpOrCmp(Instruction &I);
bool scalarizeVPIntrinsic(Instruction &I);
@@ -808,48 +808,87 @@ bool VectorCombine::foldInsExtBinop(Instruction &I) {
return true;
}
-bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) {
- // Match: bitop(bitcast(x), bitcast(y)) -> bitcast(bitop(x, y))
- Value *LHSSrc, *RHSSrc;
- if (!match(&I, m_BitwiseLogic(m_BitCast(m_Value(LHSSrc)),
- m_BitCast(m_Value(RHSSrc)))))
+/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
+/// Supports: bitcast, trunc, sext, zext
+bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
+ // Check if this is a bitwise logic operation
+ auto *BinOp = dyn_cast<BinaryOperator>(&I);
+ if (!BinOp || !BinOp->isBitwiseLogicOp())
return false;
+ // Get the cast instructions
+ auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
+ auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
+ if (!LHSCast || !RHSCast) {
+ LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
+ return false;
+ }
+
+ // Both casts must be the same type
+ Instruction::CastOps CastOpcode = LHSCast->getOpcode();
+ if (CastOpcode != RHSCast->getOpcode())
+ return false;
+
+ // Only handle supported cast operations
+ switch (CastOpcode) {
+ case Instruction::BitCast:
+ case Instruction::Trunc:
+ case Instruction::SExt:
+ case Instruction::ZExt:
+ break;
+ default:
+ return false;
+ }
+
+ Value *LHSSrc = LHSCast->getOperand(0);
+ Value *RHSSrc = RHSCast->getOperand(0);
+
// Source types must match
if (LHSSrc->getType() != RHSSrc->getType())
return false;
- if (!LHSSrc->getType()->getScalarType()->isIntegerTy())
- return false;
- // Only handle vector types
+ // Only handle vector types with integer elements
auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType());
auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
if (!SrcVecTy || !DstVecTy)
return false;
- // Same total bit width
- assert(SrcVecTy->getPrimitiveSizeInBits() ==
- DstVecTy->getPrimitiveSizeInBits() &&
- "Bitcast should preserve total bit width");
+ if (!SrcVecTy->getScalarType()->isIntegerTy() ||
+ !DstVecTy->getScalarType()->isIntegerTy())
+ return false;
// Cost Check :
- // OldCost = bitlogic + 2*bitcasts
- // NewCost = bitlogic + bitcast
- auto *BinOp = cast<BinaryOperator>(&I);
+ // OldCost = bitlogic + 2*casts
+ // NewCost = bitlogic + cast
+
+ // Calculate specific costs for each cast with instruction context
+ InstructionCost LHSCastCost =
+ TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
+ TTI::CastContextHint::None, CostKind, LHSCast);
+ InstructionCost RHSCastCost =
+ TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
+ TTI::CastContextHint::None, CostKind, RHSCast);
+
InstructionCost OldCost =
- TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy) +
- TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, LHSSrc->getType(),
- TTI::CastContextHint::None) +
- TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, RHSSrc->getType(),
- TTI::CastContextHint::None);
+ TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy, CostKind) +
+ LHSCastCost + RHSCastCost;
+
+ // For new cost, we can't provide an instruction (it doesn't exist yet)
+ InstructionCost GenericCastCost = TTI.getCastInstrCost(
+ CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind);
+
InstructionCost NewCost =
- TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy) +
- TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, SrcVecTy,
- TTI::CastContextHint::None);
+ TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy, CostKind) +
+ GenericCastCost;
- LLVM_DEBUG(dbgs() << "Found a bitwise logic op of bitcasted values: " << I
- << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
- << "\n");
+ // Account for multi-use casts using specific costs
+ if (!LHSCast->hasOneUse())
+ NewCost += LHSCastCost;
+ if (!RHSCast->hasOneUse())
+ NewCost += RHSCastCost;
+
+ LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
+ << " NewCost=" << NewCost << "\n");
if (NewCost > OldCost)
return false;
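To make the comparison concrete with purely illustrative TTI numbers: if the bitwise op costs 1 at both the source and destination vector types and each cast also costs 1, then OldCost = 1 + 1 + 1 = 3 and NewCost = 1 + 1 = 2, so the fold proceeds. If one of the casts has another user, its specific cost is added back to NewCost (2 + 1 = 3); the fold still proceeds because the bail-out only fires when NewCost is strictly greater than OldCost.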
@@ -862,8 +901,16 @@ bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) {
Worklist.pushValue(NewOp);
- // Bitcast the result back
- Value *Result = Builder.CreateBitCast(NewOp, I.getType());
+ // Create the cast operation directly to ensure we get a new instruction
+ Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
+
+ // Preserve cast instruction flags
+ NewCast->copyIRFlags(LHSCast);
+ NewCast->andIRFlags(RHSCast);
+
+ // Insert the new instruction
+ Value *Result = Builder.Insert(NewCast);
+
replaceValue(I, *Result);
return true;
}
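As a concrete instance of the pattern named at the top of the function: an and of two zexts, e.g. and (zext <8 x i8> %x to <8 x i16>), (zext <8 x i8> %y to <8 x i16>), becomes zext (and <8 x i8> %x, %y) to <8 x i16>, so the bitwise op runs at the narrower source width and only a single cast remains. copyIRFlags followed by andIRFlags keeps only the cast flags common to both original casts, which is the conservative choice for the merged cast.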
@@ -3773,7 +3820,7 @@ bool VectorCombine::run() {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- MadeChange |= foldBitOpOfBitcasts(I);
+ MadeChange |= foldBitOpOfCastops(I);
break;
default:
MadeChange |= shrinkType(I);