Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Analysis/CallPrinter.cpp4
-rw-r--r--llvm/lib/Analysis/CaptureTracking.cpp8
-rw-r--r--llvm/lib/Analysis/ConstantFolding.cpp28
-rw-r--r--llvm/lib/Analysis/DXILResource.cpp22
-rw-r--r--llvm/lib/Analysis/Delinearization.cpp200
-rw-r--r--llvm/lib/Analysis/DemandedBits.cpp69
-rw-r--r--llvm/lib/Analysis/DependenceAnalysis.cpp271
-rw-r--r--llvm/lib/Analysis/IVDescriptors.cpp28
-rw-r--r--llvm/lib/Analysis/InlineCost.cpp8
-rw-r--r--llvm/lib/Analysis/LazyValueInfo.cpp9
-rw-r--r--llvm/lib/Analysis/Loads.cpp3
-rw-r--r--llvm/lib/Analysis/LoopAccessAnalysis.cpp22
-rw-r--r--llvm/lib/Analysis/LoopInfo.cpp22
-rw-r--r--llvm/lib/Analysis/MemoryDependenceAnalysis.cpp10
-rw-r--r--llvm/lib/Analysis/MemoryLocation.cpp2
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp35
-rw-r--r--llvm/lib/Analysis/StackLifetime.cpp2
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp18
-rw-r--r--llvm/lib/Analysis/ValueTracking.cpp32
-rw-r--r--llvm/lib/AsmParser/LLLexer.cpp1
-rw-r--r--llvm/lib/AsmParser/LLParser.cpp2
-rw-r--r--llvm/lib/BinaryFormat/DXContainer.cpp11
-rw-r--r--llvm/lib/BinaryFormat/MsgPackDocument.cpp7
-rw-r--r--llvm/lib/BinaryFormat/SFrame.cpp8
-rw-r--r--llvm/lib/Bitcode/Reader/BitcodeReader.cpp1
-rw-r--r--llvm/lib/Bitcode/Writer/BitcodeWriter.cpp1
-rw-r--r--llvm/lib/CAS/BuiltinCAS.cpp94
-rw-r--r--llvm/lib/CAS/BuiltinCAS.h74
-rw-r--r--llvm/lib/CAS/CMakeLists.txt11
-rw-r--r--llvm/lib/CAS/InMemoryCAS.cpp326
-rw-r--r--llvm/lib/CAS/ObjectStore.cpp162
-rw-r--r--llvm/lib/CMakeLists.txt1
-rw-r--r--llvm/lib/CodeGen/Analysis.cpp52
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp1
-rw-r--r--llvm/lib/CodeGen/AtomicExpandPass.cpp13
-rw-r--r--llvm/lib/CodeGen/BranchFolding.cpp12
-rw-r--r--llvm/lib/CodeGen/CallingConvLower.cpp21
-rw-r--r--llvm/lib/CodeGen/CodeGenPrepare.cpp15
-rw-r--r--llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp468
-rw-r--r--llvm/lib/CodeGen/ExpandVectorPredication.cpp3
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CallLowering.cpp2
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp82
-rw-r--r--llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp19
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp17
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Utils.cpp2
-rw-r--r--llvm/lib/CodeGen/InterleavedAccessPass.cpp136
-rw-r--r--llvm/lib/CodeGen/LiveVariables.cpp37
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIRParser.cpp42
-rw-r--r--llvm/lib/CodeGen/MIRPrinter.cpp31
-rw-r--r--llvm/lib/CodeGen/MachineCopyPropagation.cpp2
-rw-r--r--llvm/lib/CodeGen/MachineDebugify.cpp2
-rw-r--r--llvm/lib/CodeGen/MachineFrameInfo.cpp16
-rw-r--r--llvm/lib/CodeGen/MachineFunction.cpp14
-rw-r--r--llvm/lib/CodeGen/MachineInstrBundle.cpp25
-rw-r--r--llvm/lib/CodeGen/MachinePipeliner.cpp6
-rw-r--r--llvm/lib/CodeGen/MacroFusion.cpp2
-rw-r--r--llvm/lib/CodeGen/PHIElimination.cpp22
-rw-r--r--llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp4
-rw-r--r--llvm/lib/CodeGen/PrologEpilogInserter.cpp29
-rw-r--r--llvm/lib/CodeGen/RegAllocFast.cpp7
-rw-r--r--llvm/lib/CodeGen/RegisterPressure.cpp4
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp111
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/FastISel.cpp35
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp33
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h4
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp16
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp36
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp8
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h1
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp12
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp262
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp340
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h1
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp18
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp38
-rw-r--r--llvm/lib/CodeGen/ShrinkWrap.cpp10
-rw-r--r--llvm/lib/CodeGen/SwiftErrorValueTracking.cpp2
-rw-r--r--llvm/lib/CodeGen/TargetLoweringBase.cpp15
-rw-r--r--llvm/lib/CodeGen/VirtRegMap.cpp2
-rw-r--r--llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp6
-rw-r--r--llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp10
-rw-r--r--llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp12
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp93
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp7
-rw-r--r--llvm/lib/Frontend/HLSL/HLSLBinding.cpp2
-rw-r--r--llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp11
-rw-r--r--llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp9
-rw-r--r--llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp2
-rw-r--r--llvm/lib/Frontend/OpenMP/OMP.cpp2
-rw-r--r--llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp22
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp28
-rw-r--r--llvm/lib/IR/ConstantFold.cpp1
-rw-r--r--llvm/lib/IR/ConstantRange.cpp42
-rw-r--r--llvm/lib/IR/Constants.cpp20
-rw-r--r--llvm/lib/IR/Core.cpp10
-rw-r--r--llvm/lib/IR/DebugInfo.cpp69
-rw-r--r--llvm/lib/IR/DebugInfoMetadata.cpp10
-rw-r--r--llvm/lib/IR/Globals.cpp1
-rw-r--r--llvm/lib/IR/IRBuilder.cpp20
-rw-r--r--llvm/lib/IR/Instruction.cpp23
-rw-r--r--llvm/lib/IR/Instructions.cpp52
-rw-r--r--llvm/lib/IR/LLVMContextImpl.h27
-rw-r--r--llvm/lib/IR/ProfDataUtils.cpp12
-rw-r--r--llvm/lib/IR/RuntimeLibcalls.cpp130
-rw-r--r--llvm/lib/IR/Value.cpp46
-rw-r--r--llvm/lib/IR/Verifier.cpp67
-rw-r--r--llvm/lib/LTO/LTO.cpp2
-rw-r--r--llvm/lib/Linker/IRMover.cpp7
-rw-r--r--llvm/lib/MC/CMakeLists.txt1
-rw-r--r--llvm/lib/MC/ELFObjectWriter.cpp8
-rw-r--r--llvm/lib/MC/GOFFObjectWriter.cpp1
-rw-r--r--llvm/lib/MC/MCAsmInfoGOFF.cpp4
-rw-r--r--llvm/lib/MC/MCAsmStreamer.cpp1
-rw-r--r--llvm/lib/MC/MCContext.cpp1
-rw-r--r--llvm/lib/MC/MCELFStreamer.cpp1
-rw-r--r--llvm/lib/MC/MCFragment.cpp2
-rw-r--r--llvm/lib/MC/MCObjectFileInfo.cpp19
-rw-r--r--llvm/lib/MC/MCObjectStreamer.cpp25
-rw-r--r--llvm/lib/MC/MCParser/COFFMasmParser.cpp1
-rw-r--r--llvm/lib/MC/MCParser/ELFAsmParser.cpp1
-rw-r--r--llvm/lib/MC/MCParser/WasmAsmParser.cpp1
-rw-r--r--llvm/lib/MC/MCSFrame.cpp98
-rw-r--r--llvm/lib/MC/MCSection.cpp2
-rw-r--r--llvm/lib/MC/MCSymbol.cpp15
-rw-r--r--llvm/lib/MC/MCWasmStreamer.cpp1
-rw-r--r--llvm/lib/MC/MCWinCOFFStreamer.cpp1
-rw-r--r--llvm/lib/MC/MCXCOFFStreamer.cpp1
-rw-r--r--llvm/lib/MC/XCOFFObjectWriter.cpp1
-rw-r--r--llvm/lib/MCA/Instruction.cpp7
-rw-r--r--llvm/lib/Object/IRSymtab.cpp47
-rw-r--r--llvm/lib/Object/SFrameParser.cpp150
-rw-r--r--llvm/lib/Passes/PassBuilder.cpp1
-rw-r--r--llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp2
-rw-r--r--llvm/lib/ProfileData/InstrProf.cpp4
-rw-r--r--llvm/lib/ProfileData/InstrProfWriter.cpp64
-rw-r--r--llvm/lib/SandboxIR/Context.cpp1
-rw-r--r--llvm/lib/SandboxIR/Instruction.cpp3
-rw-r--r--llvm/lib/Support/APFloat.cpp255
-rw-r--r--llvm/lib/Support/APInt.cpp18
-rw-r--r--llvm/lib/Support/CMakeLists.txt1
-rw-r--r--llvm/lib/Support/DXILABI.cpp33
-rw-r--r--llvm/lib/Support/KnownBits.cpp18
-rw-r--r--llvm/lib/Support/MemoryBuffer.cpp10
-rw-r--r--llvm/lib/Support/SmallPtrSet.cpp100
-rw-r--r--llvm/lib/Support/Unix/Path.inc6
-rw-r--r--llvm/lib/Support/Windows/Path.inc8
-rw-r--r--llvm/lib/TableGen/Record.cpp86
-rw-r--r--llvm/lib/Target/AArch64/AArch64.h2
-rw-r--r--llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp27
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.h45
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp79
-rw-r--r--llvm/lib/Target/AArch64/AArch64FastISel.cpp26
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp42
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp29
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp352
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h18
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrFormats.td97
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp345
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.h7
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td59
-rw-r--r--llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h40
-rw-r--r--llvm/lib/Target/AArch64/AArch64Processors.td10
-rw-r--r--llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td37
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td28
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA320.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA510.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp27
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackTagging.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.cpp25
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.h4
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp212
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h18
-rw-r--r--llvm/lib/Target/AArch64/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp4
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp30
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp10
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp12
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp1
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp16
-rw-r--r--llvm/lib/Target/AArch64/MachineSMEABIPass.cpp696
-rw-r--r--llvm/lib/Target/AArch64/SMEABIPass.cpp31
-rw-r--r--llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp12
-rw-r--r--llvm/lib/Target/AArch64/SVEInstrFormats.td12
-rw-r--r--llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp48
-rw-r--r--llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h32
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.h4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp162
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp31
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUFeatures.td1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp59
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp40
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp66
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp32
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp26
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp15
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp80
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp119
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp15
-rw-r--r--llvm/lib/Target/AMDGPU/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td2
-rw-r--r--llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp35
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp151
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h4
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.cpp56
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.h43
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp145
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.h9
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h42
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp15
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h22
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h6
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp122
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrFormats.td1
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp22
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h11
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td2
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp62
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp60
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h15
-rw-r--r--llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/SIProgramInfo.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/SIProgramInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td3
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td44
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp85
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h18
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td8
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td13
-rw-r--r--llvm/lib/Target/ARM/ARMCallingConv.h21
-rw-r--r--llvm/lib/Target/ARM/ARMConstantIslandPass.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMFastISel.cpp14
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp340
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.h7
-rw-r--r--llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp23
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp44
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.h5
-rw-r--r--llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp4
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp1
-rw-r--r--llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp2
-rw-r--r--llvm/lib/Target/AVR/AVRISelLowering.cpp5
-rw-r--r--llvm/lib/Target/AVR/AVRISelLowering.h2
-rw-r--r--llvm/lib/Target/AVR/AVRTargetMachine.cpp12
-rw-r--r--llvm/lib/Target/AVR/AVRTargetMachine.h2
-rw-r--r--llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp24
-rw-r--r--llvm/lib/Target/AVR/AVRTargetTransformInfo.h51
-rw-r--r--llvm/lib/Target/AVR/CMakeLists.txt4
-rw-r--r--llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp2
-rw-r--r--llvm/lib/Target/CSKY/CSKYISelLowering.cpp5
-rw-r--r--llvm/lib/Target/DirectX/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/DirectX/DXContainerGlobals.cpp3
-rw-r--r--llvm/lib/Target/DirectX/DXIL.td10
-rw-r--r--llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp8
-rw-r--r--llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp24
-rw-r--r--llvm/lib/Target/DirectX/DXILOpLowering.cpp5
-rw-r--r--llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp182
-rw-r--r--llvm/lib/Target/DirectX/DXILRootSignature.h8
-rw-r--r--llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp98
-rw-r--r--llvm/lib/Target/DirectX/DirectXPassRegistry.def1
-rw-r--r--llvm/lib/Target/DirectX/DirectXTargetMachine.cpp7
-rw-r--r--llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCallingConv.td9
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenInsert.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLowering.cpp25
-rw-r--r--llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp12
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSubtarget.cpp10
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSubtarget.h3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp7
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h5
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp12
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp1
-rw-r--r--llvm/lib/Target/Lanai/LanaiISelLowering.cpp5
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp78
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td18
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp49
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h2
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp15
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h2
-rw-r--r--llvm/lib/Target/M68k/M68kISelLowering.cpp9
-rw-r--r--llvm/lib/Target/M68k/M68kInstrArithmetic.td2
-rw-r--r--llvm/lib/Target/M68k/M68kInstrData.td4
-rw-r--r--llvm/lib/Target/MSP430/MSP430ISelLowering.cpp7
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp1
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp1
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp1
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.cpp1
-rw-r--r--llvm/lib/Target/Mips/MipsCCState.cpp173
-rw-r--r--llvm/lib/Target/Mips/MipsCCState.h171
-rw-r--r--llvm/lib/Target/Mips/MipsCallLowering.cpp75
-rw-r--r--llvm/lib/Target/Mips/MipsCallingConv.td12
-rw-r--r--llvm/lib/Target/Mips/MipsConstantIslandPass.cpp2
-rw-r--r--llvm/lib/Target/Mips/MipsFastISel.cpp18
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.cpp46
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp14
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp175
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h3
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp36
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.h4
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td174
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td746
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp30
-rw-r--r--llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCCallingConv.h17
-rw-r--r--llvm/lib/Target/PowerPC/PPCFastISel.cpp9
-rw-r--r--llvm/lib/Target/PowerPC/PPCFrameLowering.cpp6
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp38
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.h14
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrAltivec.td6
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.td29
-rw-r--r--llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp17
-rw-r--r--llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp4
-rw-r--r--llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp17
-rw-r--r--llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp12
-rw-r--r--llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp3
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp15
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp42
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h10
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp2
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp29
-rw-r--r--llvm/lib/Target/RISCV/RISCVCallingConv.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVCallingConv.h2
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td20
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp136
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h2
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp304
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h4
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrFormats.td6
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td11
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoP.td548
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoV.td9
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td10
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td5
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td139
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZb.td15
-rw-r--r--llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp43
-rw-r--r--llvm/lib/Target/RISCV/RISCVMoveMerger.cpp100
-rw-r--r--llvm/lib/Target/RISCV/RISCVProcessors.td1
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedAndes45.td2
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td37
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetMachine.cpp1
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp21
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h8
-rw-r--r--llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp102
-rw-r--r--llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp26
-rw-r--r--llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h8
-rw-r--r--llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp14
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp23
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVCommandLine.h6
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp71
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp11
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp29
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp8
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td456
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.cpp29
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.h7
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrUAOSA.td12
-rw-r--r--llvm/lib/Target/SystemZ/SystemZCallingConv.h44
-rw-r--r--llvm/lib/Target/SystemZ/SystemZCallingConv.td2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp1
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.cpp9
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.h2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp14
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/TargetLoweringObjectFile.cpp8
-rw-r--r--llvm/lib/Target/VE/VEISelLowering.cpp11
-rw-r--r--llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp1
-rw-r--r--llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp1
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp66
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp3
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp1
-rw-r--r--llvm/lib/Target/X86/GISel/X86CallLowering.cpp2
-rw-r--r--llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp24
-rw-r--r--llvm/lib/Target/X86/MCA/X86CustomBehaviour.h5
-rw-r--r--llvm/lib/Target/X86/X86.td4
-rw-r--r--llvm/lib/Target/X86/X86CallingConv.h4
-rw-r--r--llvm/lib/Target/X86/X86FastISel.cpp4
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.cpp11
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp44
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h4
-rw-r--r--llvm/lib/Target/X86/X86InterleavedAccess.cpp4
-rw-r--r--llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp2
-rw-r--r--llvm/lib/Target/X86/X86PreTileConfig.cpp2
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.cpp44
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.h5
-rw-r--r--llvm/lib/Target/X86/X86SchedSkylakeClient.td6
-rw-r--r--llvm/lib/Target/X86/X86SchedSkylakeServer.td6
-rw-r--r--llvm/lib/Target/X86/X86ScheduleAtom.td9
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp56
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.h6
-rw-r--r--llvm/lib/Target/X86/X86WinEHUnwindV2.cpp76
-rw-r--r--llvm/lib/Target/XCore/XCoreISelLowering.cpp16
-rw-r--r--llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp9
-rw-r--r--llvm/lib/Target/Xtensa/Xtensa.td4
-rw-r--r--llvm/lib/Target/Xtensa/XtensaISelLowering.cpp3
-rw-r--r--llvm/lib/Target/Xtensa/XtensaProcessors.td27
-rw-r--r--llvm/lib/TargetParser/CMakeLists.txt1
-rw-r--r--llvm/lib/TargetParser/Host.cpp12
-rw-r--r--llvm/lib/TargetParser/XtensaTargetParser.cpp93
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroFrame.cpp2
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroSplit.cpp4
-rw-r--r--llvm/lib/Transforms/Coroutines/SpillUtils.cpp5
-rw-r--r--llvm/lib/Transforms/IPO/ExpandVariadics.cpp22
-rw-r--r--llvm/lib/Transforms/IPO/FunctionAttrs.cpp32
-rw-r--r--llvm/lib/Transforms/IPO/FunctionSpecialization.cpp19
-rw-r--r--llvm/lib/Transforms/IPO/GlobalDCE.cpp31
-rw-r--r--llvm/lib/Transforms/IPO/GlobalOpt.cpp3
-rw-r--r--llvm/lib/Transforms/IPO/Inliner.cpp7
-rw-r--r--llvm/lib/Transforms/IPO/ModuleInliner.cpp8
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp80
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp54
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp11
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp38
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineInternal.h1
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp4
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp2
-rw-r--r--llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp20
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp16
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp445
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp4
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp16
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp2
-rw-r--r--llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/CMakeLists.txt1
-rw-r--r--llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp50
-rw-r--r--llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp99
-rw-r--r--llvm/lib/Transforms/Scalar/GVN.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/GuardWidening.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/IndVarSimplify.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/InferAlignment.cpp49
-rw-r--r--llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp74
-rw-r--r--llvm/lib/Transforms/Scalar/LICM.cpp50
-rw-r--r--llvm/lib/Transforms/Scalar/LoopDistribute.cpp6
-rw-r--r--llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp5
-rw-r--r--llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp10
-rw-r--r--llvm/lib/Transforms/Scalar/NewGVN.cpp52
-rw-r--r--llvm/lib/Transforms/Scalar/Reassociate.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp5
-rw-r--r--llvm/lib/Transforms/Scalar/SCCP.cpp16
-rw-r--r--llvm/lib/Transforms/Scalar/SROA.cpp56
-rw-r--r--llvm/lib/Transforms/Scalar/Scalar.cpp1
-rw-r--r--llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp85
-rw-r--r--llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp49
-rw-r--r--llvm/lib/Transforms/Scalar/StructurizeCFG.cpp8
-rw-r--r--llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp11
-rw-r--r--llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/CodeExtractor.cpp6
-rw-r--r--llvm/lib/Transforms/Utils/ControlFlowUtils.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/Debugify.cpp9
-rw-r--r--llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/InlineFunction.cpp31
-rw-r--r--llvm/lib/Transforms/Utils/Local.cpp8
-rw-r--r--llvm/lib/Transforms/Utils/LoopUtils.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/PredicateInfo.cpp70
-rw-r--r--llvm/lib/Transforms/Utils/ProfileVerify.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp22
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp34
-rw-r--r--llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp81
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp23
-rw-r--r--llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp504
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h28
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp382
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp1375
-rw-r--r--llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp1
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp46
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h63
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp6
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanAnalysis.h3
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp92
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanHelpers.h6
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h207
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp11
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp89
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp387
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h65
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp68
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.cpp5
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.h3
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp9
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp496
527 files changed, 14248 insertions, 6936 deletions
diff --git a/llvm/lib/Analysis/CallPrinter.cpp b/llvm/lib/Analysis/CallPrinter.cpp
index 672dae1..99d8b11 100644
--- a/llvm/lib/Analysis/CallPrinter.cpp
+++ b/llvm/lib/Analysis/CallPrinter.cpp
@@ -70,7 +70,7 @@ public:
for (Function &F : M->getFunctionList()) {
uint64_t localSumFreq = 0;
- SmallSet<Function *, 16> Callers;
+ SmallPtrSet<Function *, 16> Callers;
for (User *U : F.users())
if (isa<CallInst>(U))
Callers.insert(cast<Instruction>(U)->getFunction());
@@ -99,7 +99,7 @@ private:
bool FoundParallelEdge = true;
while (FoundParallelEdge) {
- SmallSet<Function *, 16> Visited;
+ SmallPtrSet<Function *, 16> Visited;
FoundParallelEdge = false;
for (auto CI = Node->begin(), CE = Node->end(); CI != CE; CI++) {
if (!(Visited.insert(CI->second->getFunction())).second) {
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
index 076f417..b6acda3 100644
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -359,6 +359,12 @@ UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) {
case Instruction::AddrSpaceCast:
// The original value is not captured via this if the new value isn't.
return UseCaptureInfo::passthrough();
+ case Instruction::PtrToAddr:
+ // We treat ptrtoaddr as a location-independent capture of the address even
+ // if it is ultimately not used. Continuing recursive analysis after
+ // ptrtoaddr would be possible, but we'd need logic to do that correctly,
+ // which is not the same as the current pointer following logic.
+ return CaptureComponents::Address;
case Instruction::ICmp: {
unsigned Idx = U.getOperandNo();
unsigned OtherIdx = 1 - Idx;
@@ -399,7 +405,7 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker,
SmallVector<const Use *, 20> Worklist;
Worklist.reserve(getDefaultMaxUsesToExploreForCaptureTracking());
- SmallSet<const Use *, 20> Visited;
+ SmallPtrSet<const Use *, 20> Visited;
auto AddUses = [&](const Value *V) {
for (const Use &U : V->uses()) {
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index dd98b62..f44937a 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1485,6 +1485,9 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
switch (Opcode) {
default:
llvm_unreachable("Missing case");
+ case Instruction::PtrToAddr:
+ // TODO: Add some of the ptrtoint folds here as well.
+ break;
case Instruction::PtrToInt:
if (auto *CE = dyn_cast<ConstantExpr>(C)) {
Constant *FoldedValue = nullptr;
@@ -1659,6 +1662,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::aarch64_sve_convert_from_svbool:
case Intrinsic::wasm_alltrue:
case Intrinsic::wasm_anytrue:
+ case Intrinsic::wasm_dot:
// WebAssembly float semantics are always known
case Intrinsic::wasm_trunc_signed:
case Intrinsic::wasm_trunc_unsigned:
@@ -3989,6 +3993,30 @@ static Constant *ConstantFoldFixedVectorCall(
}
return ConstantVector::get(Result);
}
+ case Intrinsic::wasm_dot: {
+ unsigned NumElements =
+ cast<FixedVectorType>(Operands[0]->getType())->getNumElements();
+
+ assert(NumElements == 8 && Result.size() == 4 &&
+ "wasm dot takes i16x8 and produces i32x4");
+ assert(Ty->isIntegerTy());
+ int32_t MulVector[8];
+
+ for (unsigned I = 0; I < NumElements; ++I) {
+ ConstantInt *Elt0 =
+ cast<ConstantInt>(Operands[0]->getAggregateElement(I));
+ ConstantInt *Elt1 =
+ cast<ConstantInt>(Operands[1]->getAggregateElement(I));
+
+ MulVector[I] = Elt0->getSExtValue() * Elt1->getSExtValue();
+ }
+ for (unsigned I = 0; I < Result.size(); I++) {
+ int64_t IAdd = (int64_t)MulVector[I * 2] + (int64_t)MulVector[I * 2 + 1];
+ Result[I] = ConstantInt::get(Ty, IAdd);
+ }
+
+ return ConstantVector::get(Result);
+ }
default:
break;
}
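
The wasm_dot folding hunk above mirrors the i16x8 dot-product semantics: sign-extend and multiply the eight i16 lanes pairwise, then add adjacent products into four i32 lanes. Below is a minimal standalone sketch of the same arithmetic on plain integers; the function and variable names are illustrative only and are not part of the patch.

#include <array>
#include <cstdint>
#include <cstdio>

// Scalar model of the folded llvm.wasm.dot: sign-extend and multiply the
// lanes pairwise, then add adjacent products into four i32 results.
static std::array<int32_t, 4> wasmDotModel(const std::array<int16_t, 8> &A,
                                           const std::array<int16_t, 8> &B) {
  std::array<int32_t, 8> Mul;
  for (unsigned I = 0; I < 8; ++I)
    Mul[I] = int32_t(A[I]) * int32_t(B[I]);
  std::array<int32_t, 4> Res;
  for (unsigned I = 0; I < 4; ++I)
    Res[I] = Mul[2 * I] + Mul[2 * I + 1];
  return Res;
}

int main() {
  auto R = wasmDotModel({1, 2, 3, 4, 5, 6, 7, 8}, {1, 1, 1, 1, 1, 1, 1, 1});
  std::printf("%d %d %d %d\n", R[0], R[1], R[2], R[3]); // prints: 3 7 11 15
}

The patch itself performs the final adjacent add in 64 bits before the result is materialized in the i32 element type; the scalar model keeps everything in 32 bits for brevity.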
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index 629fa7cd..3a70666 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Support/DXILABI.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <optional>
@@ -29,20 +30,6 @@
using namespace llvm;
using namespace dxil;
-static StringRef getResourceClassName(ResourceClass RC) {
- switch (RC) {
- case ResourceClass::SRV:
- return "SRV";
- case ResourceClass::UAV:
- return "UAV";
- case ResourceClass::CBuffer:
- return "CBuffer";
- case ResourceClass::Sampler:
- return "Sampler";
- }
- llvm_unreachable("Unhandled ResourceClass");
-}
-
static StringRef getResourceKindName(ResourceKind RK) {
switch (RK) {
case ResourceKind::Texture1D:
@@ -612,7 +599,12 @@ void ResourceTypeInfo::print(raw_ostream &OS, const DataLayout &DL) const {
GlobalVariable *ResourceInfo::createSymbol(Module &M, StructType *Ty) {
assert(!Symbol && "Symbol has already been created");
- Symbol = new GlobalVariable(M, Ty, /*isConstant=*/true,
+ Type *ResTy = Ty;
+ int64_t Size = Binding.Size;
+ if (Size != 1)
+ // unbounded arrays are represented as zero-sized arrays in LLVM IR
+ ResTy = ArrayType::get(Ty, Size == ~0u ? 0 : Size);
+ Symbol = new GlobalVariable(M, ResTy, /*isConstant=*/true,
GlobalValue::ExternalLinkage,
/*Initializer=*/nullptr, Name);
return Symbol;
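
The createSymbol change above picks the global's type from the binding size: a size of 1 keeps the bare resource struct, a bounded size N wraps it in an N-element array, and the unbounded sentinel (~0u) becomes a zero-length array. A small sketch of just that mapping, assuming the struct type has already been built elsewhere; the helper name is made up for illustration and is not the patch's API.

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Map a resource binding size onto the type used for its symbol:
//   1 -> Ty, bounded N -> [N x Ty], unbounded (~0u) -> [0 x Ty].
static Type *resourceSymbolType(StructType *Ty, int64_t Size) {
  if (Size == 1)
    return Ty;
  return ArrayType::get(Ty, Size == ~0u ? 0 : Size);
}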
diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp
index 329bd35..761c566 100644
--- a/llvm/lib/Analysis/Delinearization.cpp
+++ b/llvm/lib/Analysis/Delinearization.cpp
@@ -24,6 +24,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -32,6 +33,11 @@ using namespace llvm;
#define DL_NAME "delinearize"
#define DEBUG_TYPE DL_NAME
+static cl::opt<bool> UseFixedSizeArrayHeuristic(
+ "delinearize-use-fixed-size-array-heuristic", cl::init(false), cl::Hidden,
+ cl::desc("When printing analysis, use the heuristic for fixed-size arrays "
+             "if the default delinearization fails."));
+
// Return true when S contains at least an undef value.
static inline bool containsUndefs(const SCEV *S) {
return SCEVExprContains(S, [](const SCEV *S) {
@@ -480,6 +486,184 @@ void llvm::delinearize(ScalarEvolution &SE, const SCEV *Expr,
});
}
+static std::optional<APInt> tryIntoAPInt(const SCEV *S) {
+ if (const auto *Const = dyn_cast<SCEVConstant>(S))
+ return Const->getAPInt();
+ return std::nullopt;
+}
+
+/// Collects the absolute values of constant steps for all induction variables.
+/// Returns true if we can prove that all step recurrences are constants and \p
+/// Expr is divisible by \p ElementSize. Each step recurrence is stored in \p
+/// Steps after being divided by \p ElementSize.
+static bool collectConstantAbsSteps(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<uint64_t> &Steps,
+ uint64_t ElementSize) {
+ // End of recursion. The constant value also must be a multiple of
+ // ElementSize.
+ if (const auto *Const = dyn_cast<SCEVConstant>(Expr)) {
+ const uint64_t Mod = Const->getAPInt().urem(ElementSize);
+ return Mod == 0;
+ }
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Expr);
+ if (!AR || !AR->isAffine())
+ return false;
+
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ std::optional<APInt> StepAPInt = tryIntoAPInt(Step);
+ if (!StepAPInt)
+ return false;
+
+ APInt Q;
+ uint64_t R;
+ APInt::udivrem(StepAPInt->abs(), ElementSize, Q, R);
+ if (R != 0)
+ return false;
+
+ // Bail out when the step is too large.
+ std::optional<uint64_t> StepVal = Q.tryZExtValue();
+ if (!StepVal)
+ return false;
+
+ Steps.push_back(*StepVal);
+ return collectConstantAbsSteps(SE, AR->getStart(), Steps, ElementSize);
+}
+
+bool llvm::findFixedSizeArrayDimensions(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<uint64_t> &Sizes,
+ const SCEV *ElementSize) {
+ if (!ElementSize)
+ return false;
+
+ std::optional<APInt> ElementSizeAPInt = tryIntoAPInt(ElementSize);
+ if (!ElementSizeAPInt || *ElementSizeAPInt == 0)
+ return false;
+
+ std::optional<uint64_t> ElementSizeConst = ElementSizeAPInt->tryZExtValue();
+
+ // Early exit when ElementSize is not a positive constant.
+ if (!ElementSizeConst)
+ return false;
+
+ if (!collectConstantAbsSteps(SE, Expr, Sizes, *ElementSizeConst) ||
+ Sizes.empty()) {
+ Sizes.clear();
+ return false;
+ }
+
+ // At this point, Sizes contains the absolute step recurrences for all
+ // induction variables. Each step recurrence must be a multiple of the size of
+  // the array element. Assuming that each value represents the size of one
+  // array dimension, we attempt to restore the length of each dimension by
+  // dividing the step recurrence by the next smaller value. For example, if
+ // we have the following AddRec SCEV:
+ //
+ // AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8)
+ //
+  // Then Sizes will become [256, 32, 1] after sorting. We don't know the size
+  // of the outermost dimension, but the next dimension will be computed as
+  // 256 / 32 = 8, and the last dimension will be computed as 32 / 1 = 32. Thus
+  // it results in something like Arr[UnknownSize][8][32] with elements of size
+  // 8 bytes, where Arr is a base pointer.
+ //
+ // TODO: Catch more cases, e.g., when a step recurrence is not divisible by
+ // the next smaller one, like A[i][3*j].
+ llvm::sort(Sizes.rbegin(), Sizes.rend());
+ Sizes.erase(llvm::unique(Sizes), Sizes.end());
+
+  // The last (smallest) element of Sizes corresponds to ElementSize. Since all
+  // values in Sizes have already been divided by ElementSize, replace it with 1.
+ assert(Sizes.back() != 0 && "Unexpected zero size in Sizes.");
+ Sizes.back() = 1;
+
+ for (unsigned I = 0; I + 1 < Sizes.size(); I++) {
+ uint64_t PrevSize = Sizes[I + 1];
+ if (Sizes[I] % PrevSize) {
+ Sizes.clear();
+ return false;
+ }
+ Sizes[I] /= PrevSize;
+ }
+
+ // Finally, the last element in Sizes should be ElementSize.
+ Sizes.back() = *ElementSizeConst;
+ return true;
+}
+
+/// Splits the SCEV into two vectors of SCEVs representing the subscripts and
+/// sizes of an array access, assuming that the array is a fixed size array.
+///
+/// E.g., if we have the code like as follows:
+///
+/// double A[42][8][32];
+/// for i
+/// for j
+/// for k
+/// use A[i][j][k]
+///
+/// The access function will be represented as an AddRec SCEV like:
+///
+/// AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8)
+///
+/// Then findFixedSizeArrayDimensions infers the size of each dimension of the
+/// array based on the fact that the value of the step recurrence is a multiple
+/// of the size of the corresponding array element. In the above example, it
+/// results in the following:
+///
+/// CHECK: ArrayDecl[UnknownSize][8][32] with elements of 8 bytes.
+///
+/// Finally each subscript will be computed as follows:
+///
+/// CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>]
+///
+/// Note that this function doesn't check the range of possible values for each
+/// subscript, so the caller should perform additional boundary checks if
+/// necessary.
+///
+/// Also note that this function doesn't guarantee that the original array size
+/// is restored "correctly". For example, in the following case:
+///
+/// double A[42][4][64];
+/// double B[42][8][32];
+/// for i
+/// for j
+/// for k
+/// use A[i][j][k]
+/// use B[i][2*j][k]
+///
+/// The access function for both accesses will be the same:
+///
+/// AddRec: {{{0,+,2048}<%for.i>,+,512}<%for.j>,+,8}<%for.k> (ElementSize=8)
+///
+/// The array sizes for both A and B will be computed as
+/// ArrayDecl[UnknownSize][4][64], which matches for A, but not for B.
+///
+/// TODO: At the moment, this function can handle only simple cases. For
+/// example, we cannot handle a case where a step recurrence is not divisible
+/// by the next smaller step recurrence, e.g., A[i][3*j].
+bool llvm::delinearizeFixedSizeArray(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<const SCEV *> &Subscripts,
+ SmallVectorImpl<const SCEV *> &Sizes,
+ const SCEV *ElementSize) {
+
+ // First step: find the fixed array size.
+ SmallVector<uint64_t, 4> ConstSizes;
+ if (!findFixedSizeArrayDimensions(SE, Expr, ConstSizes, ElementSize)) {
+ Sizes.clear();
+ return false;
+ }
+
+ // Convert the constant size to SCEV.
+ for (uint64_t Size : ConstSizes)
+ Sizes.push_back(SE.getConstant(Expr->getType(), Size));
+
+ // Second step: compute the access functions for each subscript.
+ computeAccessFunctions(SE, Expr, Subscripts, Sizes);
+
+ return !Subscripts.empty();
+}
+
bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE,
const GetElementPtrInst *GEP,
SmallVectorImpl<const SCEV *> &Subscripts,
@@ -586,9 +770,21 @@ void printDelinearization(raw_ostream &O, Function *F, LoopInfo *LI,
O << "AccessFunction: " << *AccessFn << "\n";
SmallVector<const SCEV *, 3> Subscripts, Sizes;
+
+ auto IsDelinearizationFailed = [&]() {
+ return Subscripts.size() == 0 || Sizes.size() == 0 ||
+ Subscripts.size() != Sizes.size();
+ };
+
delinearize(*SE, AccessFn, Subscripts, Sizes, SE->getElementSize(&Inst));
- if (Subscripts.size() == 0 || Sizes.size() == 0 ||
- Subscripts.size() != Sizes.size()) {
+ if (UseFixedSizeArrayHeuristic && IsDelinearizationFailed()) {
+ Subscripts.clear();
+ Sizes.clear();
+ delinearizeFixedSizeArray(*SE, AccessFn, Subscripts, Sizes,
+ SE->getElementSize(&Inst));
+ }
+
+ if (IsDelinearizationFailed()) {
O << "failed to delinearize\n";
continue;
}
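
For the worked example in the comments above (byte steps 2048/256/8 with 8-byte elements), the dimension recovery boils down to dividing each sorted step by the next smaller one. Here is a minimal standalone sketch of that arithmetic on plain integers rather than the patch's SCEV-based implementation; the helper name is made up for illustration.

#include <algorithm>
#include <cstdint>
#include <vector>

// Recover inner dimension lengths from the absolute step recurrences (in
// bytes) of an access, mirroring the division scheme used by
// findFixedSizeArrayDimensions. The outermost dimension stays unknown.
static std::vector<uint64_t> recoverDims(std::vector<uint64_t> Steps,
                                         uint64_t ElementSize) {
  for (uint64_t &S : Steps)
    S /= ElementSize;                        // {2048, 256, 8} -> {256, 32, 1}
  std::sort(Steps.rbegin(), Steps.rend());   // descending
  Steps.erase(std::unique(Steps.begin(), Steps.end()), Steps.end());
  std::vector<uint64_t> Dims;
  for (size_t I = 0; I + 1 < Steps.size(); ++I)
    Dims.push_back(Steps[I] / Steps[I + 1]); // 256 / 32 = 8, 32 / 1 = 32
  return Dims;                               // {8, 32}: Arr[Unknown][8][32]
}

The patch additionally checks that each step is divisible by the next smaller one and bails out (clearing Sizes) if it is not, as noted in its TODO about cases like A[i][3*j].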
diff --git a/llvm/lib/Analysis/DemandedBits.cpp b/llvm/lib/Analysis/DemandedBits.cpp
index 6694d5c..e088175 100644
--- a/llvm/lib/Analysis/DemandedBits.cpp
+++ b/llvm/lib/Analysis/DemandedBits.cpp
@@ -76,6 +76,26 @@ void DemandedBits::determineLiveOperandBits(
computeKnownBits(V2, Known2, DL, &AC, UserI, &DT);
}
};
+ auto GetShiftedRange = [&](uint64_t Min, uint64_t Max, bool ShiftLeft) {
+ auto ShiftF = [ShiftLeft](const APInt &Mask, unsigned ShiftAmnt) {
+ return ShiftLeft ? Mask.shl(ShiftAmnt) : Mask.lshr(ShiftAmnt);
+ };
+ AB = APInt::getZero(BitWidth);
+ uint64_t LoopRange = Max - Min;
+ APInt Mask = AOut;
+    APInt Shifted = AOut; // AOut | (AOut << 1) | ... | (AOut << (ShiftAmnt - 1))
+ for (unsigned ShiftAmnt = 1; ShiftAmnt <= LoopRange; ShiftAmnt <<= 1) {
+ if (LoopRange & ShiftAmnt) {
+ // Account for (LoopRange - ShiftAmnt, LoopRange]
+ Mask |= ShiftF(Shifted, LoopRange - ShiftAmnt + 1);
+ // Clears the low bit.
+ LoopRange -= ShiftAmnt;
+ }
+ // [0, ShiftAmnt) -> [0, ShiftAmnt * 2)
+ Shifted |= ShiftF(Shifted, ShiftAmnt);
+ }
+ AB = ShiftF(Mask, Min);
+ };
switch (UserI->getOpcode()) {
default: break;
@@ -183,6 +203,17 @@ void DemandedBits::determineLiveOperandBits(
AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1);
else if (S->hasNoUnsignedWrap())
AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
+ } else {
+ ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr);
+ uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1);
+ uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1);
+ // similar to Lshr case
+ GetShiftedRange(Min, Max, /*ShiftLeft=*/false);
+ const auto *S = cast<ShlOperator>(UserI);
+ if (S->hasNoSignedWrap())
+ AB |= APInt::getHighBitsSet(BitWidth, Max + 1);
+ else if (S->hasNoUnsignedWrap())
+ AB |= APInt::getHighBitsSet(BitWidth, Max);
}
}
break;
@@ -197,6 +228,24 @@ void DemandedBits::determineLiveOperandBits(
// (they must be zero).
if (cast<LShrOperator>(UserI)->isExact())
AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+ } else {
+ ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr);
+ uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1);
+ uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1);
+      // Suppose AOut == 0b0000 0001 and [min, max] = [1, 3], so the loop runs
+      // over a range of max - min = 2:
+      //   iteration 1 (shift by 1): Shifted becomes 0b0000 0011
+      //   iteration 2 (shift by 2): Mask becomes 0b0000 0111, covering the
+      //                             shift amounts [0, 2]
+      //   iteration 3: shiftAmnt = 4 exceeds the remaining range, we stop.
+      //
+      // After the iterations we need one more shift by min, to move from
+      // 0b0000 0111 to 0b0000 1110. The loop populates the mask relative to
+      // [0, max - min], but we need coverage for [min, max]; this is why the
+      // final shift by min is needed.
+ GetShiftedRange(Min, Max, /*ShiftLeft=*/true);
+ if (cast<LShrOperator>(UserI)->isExact())
+ AB |= APInt::getLowBitsSet(BitWidth, Max);
}
}
break;
@@ -217,6 +266,26 @@ void DemandedBits::determineLiveOperandBits(
// (they must be zero).
if (cast<AShrOperator>(UserI)->isExact())
AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+ } else {
+ ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr);
+ uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1);
+ uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1);
+ GetShiftedRange(Min, Max, /*ShiftLeft=*/true);
+ if (Max &&
+ (AOut & APInt::getHighBitsSet(BitWidth, Max)).getBoolValue()) {
+ // Suppose AOut = 0011 1100
+ // [min, max] = [1, 3]
+ // ShiftAmount = 1 : Mask is 1000 0000
+ // ShiftAmount = 2 : Mask is 1100 0000
+ // ShiftAmount = 3 : Mask is 1110 0000
+ // The Mask with Max covers every case in [min, max],
+ // so we are done
+ AB.setSignBit();
+ }
+ // If the shift is exact, then the low bits are not dead
+ // (they must be zero).
+ if (cast<AShrOperator>(UserI)->isExact())
+ AB |= APInt::getLowBitsSet(BitWidth, Max);
}
}
break;
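
The GetShiftedRange lambda added above builds, in a logarithmic number of steps, the union of AOut shifted by every amount in [Min, Max]: the binary decomposition of the range grows a doubling mask, and the result is shifted by Min at the end. A plain-uint64_t sketch of the same computation (APInt replaced by uint64_t, names illustrative), with the example from the comments as a check:

#include <cassert>
#include <cstdint>

// Union of (AOut shifted by s) for every s in [Min, Max]; ShiftLeft picks the
// direction, mirroring the lambda added to determineLiveOperandBits.
static uint64_t shiftedRange(uint64_t AOut, unsigned Min, unsigned Max,
                             bool ShiftLeft) {
  auto Sh = [ShiftLeft](uint64_t M, unsigned Amt) {
    return ShiftLeft ? M << Amt : M >> Amt;
  };
  unsigned Range = Max - Min;
  uint64_t Mask = AOut;
  uint64_t Shifted = AOut; // union of shift amounts [0, Amt) built so far
  for (unsigned Amt = 1; Amt <= Range; Amt <<= 1) {
    if (Range & Amt) {
      Mask |= Sh(Shifted, Range - Amt + 1); // covers (Range - Amt, Range]
      Range -= Amt;
    }
    Shifted |= Sh(Shifted, Amt); // [0, Amt) -> [0, 2 * Amt)
  }
  return Sh(Mask, Min);
}

int main() {
  // AOut = 0b0000'0001 with a shift amount known to lie in [1, 3]:
  // the demanded bits of the shifted operand are 0b0000'1110.
  assert(shiftedRange(0x01, 1, 3, /*ShiftLeft=*/true) == 0x0E);
  return 0;
}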
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 835e270..f33e04e 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -1531,6 +1531,62 @@ static APInt ceilingOfQuotient(const APInt &A, const APInt &B) {
return Q;
}
+/// Given an affine expression of the form A*k + B, where k is an arbitrary
+/// integer, infer the possible range of k based on the known range of the
+/// affine expression. If we know A*k + B is non-negative, i.e.,
+///
+/// A*k + B >= 0
+///
+/// we can derive the following inequalities for k when A is positive:
+///
+/// k >= -B / A
+///
+/// Since k is an integer, it means k is greater than or equal to the
+/// ceil(-B / A).
+///
+/// If the upper bound of the affine expression \p UB is passed, the following
+/// inequality can be derived as well:
+///
+/// A*k + B <= UB
+///
+/// which leads to:
+///
+/// k <= (UB - B) / A
+///
+/// Again, as k is an integer, it means k is less than or equal to the
+/// floor((UB - B) / A).
+///
+/// Similar logic applies when A is negative, but the signs of the inequalities
+/// flip when working with them.
+///
+/// Preconditions: \p A is non-zero, and we know A*k + B is non-negative.
+static std::pair<std::optional<APInt>, std::optional<APInt>>
+inferDomainOfAffine(const APInt &A, const APInt &B,
+ const std::optional<APInt> &UB) {
+ assert(A != 0 && "A must be non-zero");
+ std::optional<APInt> TL, TU;
+ if (A.sgt(0)) {
+ TL = ceilingOfQuotient(-B, A);
+ LLVM_DEBUG(dbgs() << "\t Possible TL = " << *TL << "\n");
+ // New bound check - modification to Banerjee's e3 check
+ if (UB) {
+ // TODO?: Overflow check for UB - B
+ TU = floorOfQuotient(*UB - B, A);
+ LLVM_DEBUG(dbgs() << "\t Possible TU = " << *TU << "\n");
+ }
+ } else {
+ TU = floorOfQuotient(-B, A);
+ LLVM_DEBUG(dbgs() << "\t Possible TU = " << *TU << "\n");
+ // New bound check - modification to Banerjee's e3 check
+ if (UB) {
+ // TODO?: Overflow check for UB - B
+ TL = ceilingOfQuotient(*UB - B, A);
+ LLVM_DEBUG(dbgs() << "\t Possible TL = " << *TL << "\n");
+ }
+ }
+ return std::make_pair(TL, TU);
+}
+
// exactSIVtest -
// When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*i],
// where i is an induction variable, c1 and c2 are loop invariant, and a1
@@ -1590,14 +1646,12 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
LLVM_DEBUG(dbgs() << "\t X = " << X << ", Y = " << Y << "\n");
// since SCEV construction normalizes, LM = 0
- APInt UM(Bits, 1, true);
- bool UMValid = false;
+ std::optional<APInt> UM;
// UM is perhaps unavailable, let's check
if (const SCEVConstant *CUB =
collectConstantUpperBound(CurLoop, Delta->getType())) {
UM = CUB->getAPInt();
- LLVM_DEBUG(dbgs() << "\t UM = " << UM << "\n");
- UMValid = true;
+ LLVM_DEBUG(dbgs() << "\t UM = " << *UM << "\n");
}
APInt TU(APInt::getSignedMaxValue(Bits));
@@ -1609,44 +1663,33 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
LLVM_DEBUG(dbgs() << "\t TX = " << TX << "\n");
LLVM_DEBUG(dbgs() << "\t TY = " << TY << "\n");
- SmallVector<APInt, 2> TLVec, TUVec;
APInt TB = BM.sdiv(G);
- if (TB.sgt(0)) {
- TLVec.push_back(ceilingOfQuotient(-TX, TB));
- LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n");
- // New bound check - modification to Banerjee's e3 check
- if (UMValid) {
- TUVec.push_back(floorOfQuotient(UM - TX, TB));
- LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n");
- }
- } else {
- TUVec.push_back(floorOfQuotient(-TX, TB));
- LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n");
- // New bound check - modification to Banerjee's e3 check
- if (UMValid) {
- TLVec.push_back(ceilingOfQuotient(UM - TX, TB));
- LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n");
- }
- }
-
APInt TA = AM.sdiv(G);
- if (TA.sgt(0)) {
- if (UMValid) {
- TUVec.push_back(floorOfQuotient(UM - TY, TA));
- LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n");
- }
- // New bound check - modification to Banerjee's e3 check
- TLVec.push_back(ceilingOfQuotient(-TY, TA));
- LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n");
- } else {
- if (UMValid) {
- TLVec.push_back(ceilingOfQuotient(UM - TY, TA));
- LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n");
- }
- // New bound check - modification to Banerjee's e3 check
- TUVec.push_back(floorOfQuotient(-TY, TA));
- LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n");
- }
+
+ // At this point, we have the following equations:
+ //
+ // TA*i0 - TB*i1 = TC
+ //
+  // Also, we know that all pairs of (i0, i1) can be expressed as:
+ //
+ // (TX + k*TB, TY + k*TA)
+ //
+ // where k is an arbitrary integer.
+ auto [TL0, TU0] = inferDomainOfAffine(TB, TX, UM);
+ auto [TL1, TU1] = inferDomainOfAffine(TA, TY, UM);
+
+ auto CreateVec = [](const std::optional<APInt> &V0,
+ const std::optional<APInt> &V1) {
+ SmallVector<APInt, 2> Vec;
+ if (V0)
+ Vec.push_back(*V0);
+ if (V1)
+ Vec.push_back(*V1);
+ return Vec;
+ };
+
+ SmallVector<APInt, 2> TLVec = CreateVec(TL0, TL1);
+ SmallVector<APInt, 2> TUVec = CreateVec(TU0, TU1);
LLVM_DEBUG(dbgs() << "\t TA = " << TA << "\n");
LLVM_DEBUG(dbgs() << "\t TB = " << TB << "\n");
@@ -1967,24 +2010,20 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
LLVM_DEBUG(dbgs() << "\t X = " << X << ", Y = " << Y << "\n");
// since SCEV construction seems to normalize, LM = 0
- APInt SrcUM(Bits, 1, true);
- bool SrcUMvalid = false;
+ std::optional<APInt> SrcUM;
// SrcUM is perhaps unavailable, let's check
if (const SCEVConstant *UpperBound =
collectConstantUpperBound(SrcLoop, Delta->getType())) {
SrcUM = UpperBound->getAPInt();
- LLVM_DEBUG(dbgs() << "\t SrcUM = " << SrcUM << "\n");
- SrcUMvalid = true;
+ LLVM_DEBUG(dbgs() << "\t SrcUM = " << *SrcUM << "\n");
}
- APInt DstUM(Bits, 1, true);
- bool DstUMvalid = false;
+ std::optional<APInt> DstUM;
// UM is perhaps unavailable, let's check
if (const SCEVConstant *UpperBound =
collectConstantUpperBound(DstLoop, Delta->getType())) {
DstUM = UpperBound->getAPInt();
- LLVM_DEBUG(dbgs() << "\t DstUM = " << DstUM << "\n");
- DstUMvalid = true;
+ LLVM_DEBUG(dbgs() << "\t DstUM = " << *DstUM << "\n");
}
APInt TU(APInt::getSignedMaxValue(Bits));
@@ -1996,47 +2035,39 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
LLVM_DEBUG(dbgs() << "\t TX = " << TX << "\n");
LLVM_DEBUG(dbgs() << "\t TY = " << TY << "\n");
- SmallVector<APInt, 2> TLVec, TUVec;
APInt TB = BM.sdiv(G);
- if (TB.sgt(0)) {
- TLVec.push_back(ceilingOfQuotient(-TX, TB));
- LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n");
- if (SrcUMvalid) {
- TUVec.push_back(floorOfQuotient(SrcUM - TX, TB));
- LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n");
- }
- } else {
- TUVec.push_back(floorOfQuotient(-TX, TB));
- LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n");
- if (SrcUMvalid) {
- TLVec.push_back(ceilingOfQuotient(SrcUM - TX, TB));
- LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n");
- }
- }
-
APInt TA = AM.sdiv(G);
- if (TA.sgt(0)) {
- TLVec.push_back(ceilingOfQuotient(-TY, TA));
- LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n");
- if (DstUMvalid) {
- TUVec.push_back(floorOfQuotient(DstUM - TY, TA));
- LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n");
- }
- } else {
- TUVec.push_back(floorOfQuotient(-TY, TA));
- LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n");
- if (DstUMvalid) {
- TLVec.push_back(ceilingOfQuotient(DstUM - TY, TA));
- LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n");
- }
- }
- if (TLVec.empty() || TUVec.empty())
- return false;
+ // At this point, we have the following equations:
+ //
+ // TA*i - TB*j = TC
+ //
+  // Also, we know that all pairs of (i, j) can be expressed as:
+ //
+ // (TX + k*TB, TY + k*TA)
+ //
+ // where k is an arbitrary integer.
+ auto [TL0, TU0] = inferDomainOfAffine(TB, TX, SrcUM);
+ auto [TL1, TU1] = inferDomainOfAffine(TA, TY, DstUM);
LLVM_DEBUG(dbgs() << "\t TA = " << TA << "\n");
LLVM_DEBUG(dbgs() << "\t TB = " << TB << "\n");
+ auto CreateVec = [](const std::optional<APInt> &V0,
+ const std::optional<APInt> &V1) {
+ SmallVector<APInt, 2> Vec;
+ if (V0)
+ Vec.push_back(*V0);
+ if (V1)
+ Vec.push_back(*V1);
+ return Vec;
+ };
+
+ SmallVector<APInt, 2> TLVec = CreateVec(TL0, TL1);
+ SmallVector<APInt, 2> TUVec = CreateVec(TU0, TU1);
+ if (TLVec.empty() || TUVec.empty())
+ return false;
+
TL = APIntOps::smax(TLVec.front(), TLVec.back());
TU = APIntOps::smin(TUVec.front(), TUVec.back());
LLVM_DEBUG(dbgs() << "\t TL = " << TL << "\n");
@@ -2345,6 +2376,43 @@ static std::optional<APInt> getConstantPart(const SCEV *Expr) {
return std::nullopt;
}
+bool DependenceInfo::accumulateCoefficientsGCD(const SCEV *Expr,
+ const Loop *CurLoop,
+ const SCEV *&CurLoopCoeff,
+ APInt &RunningGCD) const {
+ // If RunningGCD is already 1, exit early.
+ // TODO: It might be better to continue the recursion to find CurLoopCoeff.
+ if (RunningGCD == 1)
+ return true;
+
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
+ if (!AddRec) {
+ assert(isLoopInvariant(Expr, CurLoop) &&
+ "Expected loop invariant expression");
+ return true;
+ }
+
+ assert(AddRec->isAffine() && "Unexpected Expr");
+ const SCEV *Start = AddRec->getStart();
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ if (AddRec->getLoop() == CurLoop) {
+ CurLoopCoeff = Step;
+ } else {
+ std::optional<APInt> ConstCoeff = getConstantPart(Step);
+
+ // If the coefficient is the product of a constant and other stuff, we can
+ // use the constant in the GCD computation.
+ if (!ConstCoeff)
+ return false;
+
+ // TODO: What happens if ConstCoeff is the "most negative" signed number
+ // (e.g. -128 for 8 bit wide APInt)?
+ RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs());
+ }
+
+ return accumulateCoefficientsGCD(Start, CurLoop, CurLoopCoeff, RunningGCD);
+}
+
//===----------------------------------------------------------------------===//
// gcdMIVtest -
// Tests an MIV subscript pair for dependence.
@@ -2464,40 +2532,11 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
RunningGCD = ExtraGCD;
const SCEV *SrcCoeff = AddRec->getStepRecurrence(*SE);
const SCEV *DstCoeff = SE->getMinusSCEV(SrcCoeff, SrcCoeff);
- const SCEV *Inner = Src;
- while (RunningGCD != 1 && isa<SCEVAddRecExpr>(Inner)) {
- AddRec = cast<SCEVAddRecExpr>(Inner);
- const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
- if (CurLoop == AddRec->getLoop())
- ; // SrcCoeff == Coeff
- else {
- // If the coefficient is the product of a constant and other stuff,
- // we can use the constant in the GCD computation.
- std::optional<APInt> ConstCoeff = getConstantPart(Coeff);
- if (!ConstCoeff)
- return false;
- RunningGCD =
- APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs());
- }
- Inner = AddRec->getStart();
- }
- Inner = Dst;
- while (RunningGCD != 1 && isa<SCEVAddRecExpr>(Inner)) {
- AddRec = cast<SCEVAddRecExpr>(Inner);
- const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
- if (CurLoop == AddRec->getLoop())
- DstCoeff = Coeff;
- else {
- // If the coefficient is the product of a constant and other stuff,
- // we can use the constant in the GCD computation.
- std::optional<APInt> ConstCoeff = getConstantPart(Coeff);
- if (!ConstCoeff)
- return false;
- RunningGCD =
- APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs());
- }
- Inner = AddRec->getStart();
- }
+
+ if (!accumulateCoefficientsGCD(Src, CurLoop, SrcCoeff, RunningGCD) ||
+ !accumulateCoefficientsGCD(Dst, CurLoop, DstCoeff, RunningGCD))
+ return false;
+
Delta = SE->getMinusSCEV(SrcCoeff, DstCoeff);
// If the coefficient is the product of a constant and other stuff,
// we can use the constant in the GCD computation.
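A short trace of the new accumulateCoefficientsGCD helper may help here (coefficient values are illustrative):

    For Src = {{X,+,C1}<L1>,+,C2}<L2> walked with CurLoop = L1:
      - the L2 level does not match CurLoop, so |constant part of C2| is
        folded into RunningGCD;
      - the L1 level matches, so its step C1 becomes CurLoopCoeff;
      - the recursion ends at the loop-invariant X and returns true.
    A non-constant step at a non-matching level makes the helper return
    false, which is the same bail-out the two deleted while-loops performed.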
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 8be5de3..b8c540c 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -40,6 +40,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
switch (Kind) {
default:
break;
+ case RecurKind::AddChainWithSubs:
+ case RecurKind::Sub:
case RecurKind::Add:
case RecurKind::Mul:
case RecurKind::Or:
@@ -897,8 +899,11 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
case Instruction::PHI:
return InstDesc(I, Prev.getRecKind(), Prev.getExactFPMathInst());
case Instruction::Sub:
+ return InstDesc(
+ Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs, I);
case Instruction::Add:
- return InstDesc(Kind == RecurKind::Add, I);
+ return InstDesc(
+ Kind == RecurKind::Add || Kind == RecurKind::AddChainWithSubs, I);
case Instruction::Mul:
return InstDesc(Kind == RecurKind::Mul, I);
case Instruction::And:
@@ -917,7 +922,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
I->hasAllowReassoc() ? nullptr : I);
case Instruction::Select:
if (Kind == RecurKind::FAdd || Kind == RecurKind::FMul ||
- Kind == RecurKind::Add || Kind == RecurKind::Mul)
+ Kind == RecurKind::Add || Kind == RecurKind::Mul ||
+ Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs)
return isConditionalRdxPattern(I);
if (isFindIVRecurrenceKind(Kind) && SE)
return isFindIVPattern(Kind, L, OrigPhi, I, *SE);
@@ -1003,6 +1009,17 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");
return true;
}
+ if (AddReductionVar(Phi, RecurKind::Sub, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
+ LLVM_DEBUG(dbgs() << "Found a SUB reduction PHI." << *Phi << "\n");
+ return true;
+ }
+ if (AddReductionVar(Phi, RecurKind::AddChainWithSubs, TheLoop, FMF, RedDes,
+ DB, AC, DT, SE)) {
+ LLVM_DEBUG(dbgs() << "Found a chained ADD-SUB reduction PHI." << *Phi
+ << "\n");
+ return true;
+ }
if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT,
SE)) {
LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n");
@@ -1201,6 +1218,9 @@ bool RecurrenceDescriptor::isFixedOrderRecurrence(PHINode *Phi, Loop *TheLoop,
unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
switch (Kind) {
+ case RecurKind::Sub:
+ return Instruction::Sub;
+ case RecurKind::AddChainWithSubs:
case RecurKind::Add:
return Instruction::Add;
case RecurKind::Mul:
@@ -1288,6 +1308,10 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const {
if (isFMulAddIntrinsic(Cur))
return true;
+ if (Cur->getOpcode() == Instruction::Sub &&
+ Kind == RecurKind::AddChainWithSubs)
+ return true;
+
return Cur->getOpcode() == getOpcode();
};
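The new RecurKind::Sub and RecurKind::AddChainWithSubs kinds target reductions whose update chain mixes adds and subtracts. A minimal sketch of the kind of loop this is assumed to cover (illustrative C++, not taken from the patch):

    // A reduction PHI fed by both an add and a sub in the same chain.
    int chained_add_sub(const int *A, const int *B, int N) {
      int Sum = 0;
      for (int I = 0; I < N; ++I)
        Sum = (Sum + A[I]) - B[I];
      return Sum;
    }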
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 22f4d08..757f689 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -180,6 +180,10 @@ static cl::opt<bool> DisableGEPConstOperand(
"disable-gep-const-evaluation", cl::Hidden, cl::init(false),
cl::desc("Disables evaluation of GetElementPtr with constant operands"));
+static cl::opt<bool> InlineAllViableCalls(
+ "inline-all-viable-calls", cl::Hidden, cl::init(false),
+ cl::desc("Inline all viable calls, even if they exceed the inlining "
+ "threshold"));
namespace llvm {
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
if (Attr.isValid()) {
@@ -3272,6 +3276,10 @@ InlineCost llvm::getInlineCost(
return llvm::InlineCost::getNever(UserDecision->getFailureReason());
}
+ if (InlineAllViableCalls && isInlineViable(*Callee).isSuccess())
+ return llvm::InlineCost::getAlways(
+ "Inlining forced by -inline-all-viable-calls");
+
LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
<< "... (caller:" << Call.getCaller()->getName()
<< ")\n");
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 922f25d..c7b0ca9 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -927,8 +927,13 @@ LazyValueInfoImpl::solveBlockValueCast(CastInst *CI, BasicBlock *BB) {
// NOTE: We're currently limited by the set of operations that ConstantRange
// can evaluate symbolically. Enhancing that set will allows us to analyze
// more definitions.
- return ValueLatticeElement::getRange(LHSRange.castOp(CI->getOpcode(),
- ResultBitWidth));
+ ConstantRange Res = ConstantRange::getEmpty(ResultBitWidth);
+ if (auto *Trunc = dyn_cast<TruncInst>(CI))
+ Res = LHSRange.truncate(ResultBitWidth, Trunc->getNoWrapKind());
+ else
+ Res = LHSRange.castOp(CI->getOpcode(), ResultBitWidth);
+
+ return ValueLatticeElement::getRange(Res);
}
std::optional<ValueLatticeElement>
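A small worked example of why forwarding the trunc no-wrap flags helps, assuming ConstantRange::truncate may drop inputs whose truncation would wrap under the given flag:

    Given %x known to be in [10, 300) and %t = trunc nuw i32 %x to i8:
      - a plain truncate maps 256..299 onto 0..43, so the result is the full
        8-bit range;
      - with nuw, those wrapping inputs would be poison, so they can be
        ignored and the result range tightens to [10, 256).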
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 78d0887..9a2c9ba 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -276,8 +276,7 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
// this function is only used when one address use dominates the
// other, which means that they'll always either have the same
// value or one of them will have an undefined value.
- if (isa<BinaryOperator>(A) || isa<CastInst>(A) || isa<PHINode>(A) ||
- isa<GetElementPtrInst>(A))
+ if (isa<CastInst>(A) || isa<PHINode>(A) || isa<GetElementPtrInst>(A))
if (const Instruction *BI = dyn_cast<Instruction>(B))
if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
return true;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index a553533..bceddd0 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -936,6 +936,12 @@ private:
static std::optional<int64_t>
getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
Value *Ptr, PredicatedScalarEvolution &PSE) {
+ if (isa<ScalableVectorType>(AccessTy)) {
+ LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
+ << "\n");
+ return std::nullopt;
+ }
+
// The access function must stride over the innermost loop.
if (Lp != AR->getLoop()) {
LLVM_DEBUG({
@@ -1590,11 +1596,6 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
return 0;
assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
- if (isa<ScalableVectorType>(AccessTy)) {
- LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
- << "\n");
- return std::nullopt;
- }
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
if (Assume && !AR)
@@ -2404,12 +2405,13 @@ bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands,
SmallVector<Instruction *, 4>
MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const {
MemAccessInfo Access(Ptr, IsWrite);
- auto &IndexVector = Accesses.find(Access)->second;
-
+ auto I = Accesses.find(Access);
SmallVector<Instruction *, 4> Insts;
- transform(IndexVector,
- std::back_inserter(Insts),
- [&](unsigned Idx) { return this->InstMap[Idx]; });
+ if (I != Accesses.end()) {
+ transform(I->second, std::back_inserter(Insts),
+ [&](unsigned Idx) { return this->InstMap[Idx]; });
+ }
+
return Insts;
}
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index 518a634..6ba6073 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -58,14 +58,26 @@ static cl::opt<bool, true>
// Loop implementation
//
-bool Loop::isLoopInvariant(const Value *V) const {
- if (const Instruction *I = dyn_cast<Instruction>(V))
- return !contains(I);
+bool Loop::isLoopInvariant(const Value *V, bool HasCoroSuspendInst) const {
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ // FIXME: this is semantically inconsistent. We're tracking a proper fix in
+ // issue #149604.
+ // If V is a pointer to a stack object and L contains a coro.suspend
+ // call, then V may not be loop invariant, because the ramp and resume
+ // functions have different stack frames.
+ if (HasCoroSuspendInst && isa<AllocaInst>(I))
+ return false;
+ else
+ return !contains(I);
+ }
return true; // All non-instructions are loop invariant
}
-bool Loop::hasLoopInvariantOperands(const Instruction *I) const {
- return all_of(I->operands(), [this](Value *V) { return isLoopInvariant(V); });
+bool Loop::hasLoopInvariantOperands(const Instruction *I,
+ bool HasCoroSuspendInst) const {
+ return all_of(I->operands(), [&](Value *V) {
+ return isLoopInvariant(V, HasCoroSuspendInst);
+ });
}
bool Loop::makeLoopInvariant(Value *V, bool &Changed, Instruction *InsertPt,
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index 2b0f212..67c2cfa 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -150,6 +150,10 @@ static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc,
switch (II->getIntrinsicID()) {
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
+ Loc = MemoryLocation::getForArgument(II, 0, TLI);
+ // These intrinsics don't really modify the memory, but returning Mod
+ // will allow them to be handled conservatively.
+ return ModRefInfo::Mod;
case Intrinsic::invariant_start:
Loc = MemoryLocation::getForArgument(II, 1, TLI);
// These intrinsics don't really modify the memory, but returning Mod
@@ -441,11 +445,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
Intrinsic::ID ID = II->getIntrinsicID();
switch (ID) {
case Intrinsic::lifetime_start: {
- // FIXME: This only considers queries directly on the invariant-tagged
- // pointer, not on query pointers that are indexed off of them. It'd
- // be nice to handle that at some point (the right approach is to use
- // GetPointerBaseWithConstantOffset).
- MemoryLocation ArgLoc = MemoryLocation::getAfter(II->getArgOperand(1));
+ MemoryLocation ArgLoc = MemoryLocation::getAfter(II->getArgOperand(0));
if (BatchAA.isMustAlias(ArgLoc, MemLoc))
return MemDepResult::getDef(II);
continue;
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index 28a2640..72b643c 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -191,7 +191,7 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end: {
- assert(ArgIdx == 1 && "Invalid argument index");
+ assert(ArgIdx == 0 && "Invalid argument index");
auto *AI = dyn_cast<AllocaInst>(Arg);
if (!AI)
// lifetime of poison value.
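Together with the StackLifetime.cpp change below, this reflects lifetime markers now carrying only the alloca operand. A minimal IRBuilder sketch under that assumption (the same one-argument calls appear in the AtomicExpandPass hunk later in this patch):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Minimal sketch: scoped lifetime markers without an explicit size operand.
    static void emitScopedTemp(IRBuilder<> &B, Type *Ty) {
      AllocaInst *Tmp = B.CreateAlloca(Ty);
      B.CreateLifetimeStart(Tmp); // size operand no longer passed
      // ... code that uses Tmp ...
      B.CreateLifetimeEnd(Tmp);
    }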
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 477e477..d2c445f 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -7284,7 +7284,7 @@ ScalarEvolution::getDefiningScopeBound(ArrayRef<const SCEV *> Ops,
bool &Precise) {
Precise = true;
// Do a bounded search of the def relation of the requested SCEVs.
- SmallSet<const SCEV *, 16> Visited;
+ SmallPtrSet<const SCEV *, 16> Visited;
SmallVector<const SCEV *> Worklist;
auto pushOp = [&](const SCEV *S) {
if (!Visited.insert(S).second)
@@ -7435,7 +7435,15 @@ ScalarEvolution::getLoopProperties(const Loop *L) {
if (auto *SI = dyn_cast<StoreInst>(I))
return !SI->isSimple();
- return I->mayThrow() || I->mayWriteToMemory();
+ if (I->mayThrow())
+ return true;
+
+ // Non-volatile memset / memcpy do not count as side effects for forward
+ // progress.
+ if (isa<MemIntrinsic>(I) && !I->isVolatile())
+ return false;
+
+ return I->mayWriteToMemory();
};
LoopProperties LP = {/* HasNoAbnormalExits */ true,
@@ -14944,6 +14952,29 @@ const SCEVAddRecExpr *ScalarEvolution::convertSCEVToAddRecWithPredicates(
if (!AddRec)
return nullptr;
+ // Check if any of the transformed predicates is known to be false. In that
+ // case, it doesn't make sense to convert to a predicated AddRec, as the
+ // versioned loop will never execute.
+ for (const SCEVPredicate *Pred : TransformPreds) {
+ auto *WrapPred = dyn_cast<SCEVWrapPredicate>(Pred);
+ if (!WrapPred || WrapPred->getFlags() != SCEVWrapPredicate::IncrementNSSW)
+ continue;
+
+ const SCEVAddRecExpr *AddRecToCheck = WrapPred->getExpr();
+ const SCEV *ExitCount = getBackedgeTakenCount(AddRecToCheck->getLoop());
+ if (isa<SCEVCouldNotCompute>(ExitCount))
+ continue;
+
+ const SCEV *Step = AddRecToCheck->getStepRecurrence(*this);
+ if (!Step->isOne())
+ continue;
+
+ ExitCount = getTruncateOrSignExtend(ExitCount, Step->getType());
+ const SCEV *Add = getAddExpr(AddRecToCheck->getStart(), ExitCount);
+ if (isKnownPredicate(CmpInst::ICMP_SLT, Add, AddRecToCheck->getStart()))
+ return nullptr;
+ }
+
// Since the transformation was successful, we can now transfer the SCEV
// predicates.
Preds.append(TransformPreds.begin(), TransformPreds.end());
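The intent of the new bail-out, restated as a concrete check (a sketch of the reasoning, not quoted from the patch): for a candidate {Start,+,1} guarded by an NSSW wrap predicate, if the backedge-taken count BTC is computable and

    Start + BTC <s Start

is already known, the increment must signed-wrap on some iteration, so the predicate can never hold and versioning the loop on it would only create a never-executed clone.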
diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp
index abe4985..1e20fca 100644
--- a/llvm/lib/Analysis/StackLifetime.cpp
+++ b/llvm/lib/Analysis/StackLifetime.cpp
@@ -70,7 +70,7 @@ void StackLifetime::collectMarkers() {
const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
if (!II || !II->isLifetimeStartOrEnd())
continue;
- const AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(1));
+ const AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(0));
if (!AI)
continue;
auto It = AllocaNumbering.find(AI);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c7eb2ec..323ab8b 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1130,6 +1130,15 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
return Cost;
}
+InstructionCost TargetTransformInfo::getIndexedVectorInstrCostFromEnd(
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
+ unsigned Index) const {
+ InstructionCost Cost =
+ TTIImpl->getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
InstructionCost TargetTransformInfo::getInsertExtractValueCost(
unsigned Opcode, TTI::TargetCostKind CostKind) const {
assert((Opcode == Instruction::InsertValue ||
@@ -1230,10 +1239,11 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
return TTIImpl->getNumberOfParts(Tp);
}
-InstructionCost
-TargetTransformInfo::getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
- const SCEV *Ptr) const {
- InstructionCost Cost = TTIImpl->getAddressComputationCost(Tp, SE, Ptr);
+InstructionCost TargetTransformInfo::getAddressComputationCost(
+ Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const {
+ InstructionCost Cost =
+ TTIImpl->getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 1e70228..21bdb2f 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -6356,27 +6356,6 @@ llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
return nullptr;
}
-bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP,
- unsigned CharSize) {
- // Make sure the GEP has exactly three arguments.
- if (GEP->getNumOperands() != 3)
- return false;
-
- // Make sure the index-ee is a pointer to array of \p CharSize integers.
- // CharSize.
- ArrayType *AT = dyn_cast<ArrayType>(GEP->getSourceElementType());
- if (!AT || !AT->getElementType()->isIntegerTy(CharSize))
- return false;
-
- // Check to make sure that the first operand of the GEP is an integer and
- // has value 0 so that we are sure we're indexing into the initializer.
- const ConstantInt *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
- if (!FirstIdx || !FirstIdx->isZero())
- return false;
-
- return true;
-}
-
// If V refers to an initialized global constant, set Slice either to
// its initializer if the size of its elements equals ElementSize, or,
// for ElementSize == 8, to its representation as an array of unsiged
@@ -7415,8 +7394,10 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind,
case Intrinsic::fshr:
case Intrinsic::smax:
case Intrinsic::smin:
+ case Intrinsic::scmp:
case Intrinsic::umax:
case Intrinsic::umin:
+ case Intrinsic::ucmp:
case Intrinsic::ptrmask:
case Intrinsic::fptoui_sat:
case Intrinsic::fptosi_sat:
@@ -7785,7 +7766,7 @@ bool llvm::mustExecuteUBIfPoisonOnPathTo(Instruction *Root,
// The set of all recursive users we've visited (which are assumed to all be
// poison because of said visit)
- SmallSet<const Value *, 16> KnownPoison;
+ SmallPtrSet<const Value *, 16> KnownPoison;
SmallVector<const Instruction*, 16> Worklist;
Worklist.push_back(Root);
while (!Worklist.empty()) {
@@ -8140,8 +8121,8 @@ static bool programUndefinedIfUndefOrPoison(const Value *V,
// Set of instructions that we have proved will yield poison if Inst
// does.
- SmallSet<const Value *, 16> YieldsPoison;
- SmallSet<const BasicBlock *, 4> Visited;
+ SmallPtrSet<const Value *, 16> YieldsPoison;
+ SmallPtrSet<const BasicBlock *, 4> Visited;
YieldsPoison.insert(V);
Visited.insert(BB);
@@ -9147,7 +9128,8 @@ static bool matchTwoInputRecurrence(const PHINode *PN, InstTy *&Inst,
return false;
for (unsigned I = 0; I != 2; ++I) {
- if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I))) {
+ if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I));
+ Operation && Operation->getNumOperands() >= 2) {
Value *LHS = Operation->getOperand(0);
Value *RHS = Operation->getOperand(1);
if (LHS != PN && RHS != PN)
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 520c6a0..3d5bd61 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -928,6 +928,7 @@ lltok::Kind LLLexer::LexIdentifier() {
INSTKEYWORD(fptoui, FPToUI);
INSTKEYWORD(fptosi, FPToSI);
INSTKEYWORD(inttoptr, IntToPtr);
+ INSTKEYWORD(ptrtoaddr, PtrToAddr);
INSTKEYWORD(ptrtoint, PtrToInt);
INSTKEYWORD(bitcast, BitCast);
INSTKEYWORD(addrspacecast, AddrSpaceCast);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 13bef1f..1bc2906 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -4273,6 +4273,7 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
case lltok::kw_bitcast:
case lltok::kw_addrspacecast:
case lltok::kw_inttoptr:
+ case lltok::kw_ptrtoaddr:
case lltok::kw_ptrtoint: {
unsigned Opc = Lex.getUIntVal();
Type *DestTy = nullptr;
@@ -7310,6 +7311,7 @@ int LLParser::parseInstruction(Instruction *&Inst, BasicBlock *BB,
case lltok::kw_fptoui:
case lltok::kw_fptosi:
case lltok::kw_inttoptr:
+ case lltok::kw_ptrtoaddr:
case lltok::kw_ptrtoint:
return parseCast(Inst, PFS, KeywordVal);
case lltok::kw_fptrunc:
diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp
index eb83945..36d10d0 100644
--- a/llvm/lib/BinaryFormat/DXContainer.cpp
+++ b/llvm/lib/BinaryFormat/DXContainer.cpp
@@ -60,17 +60,6 @@ ArrayRef<EnumEntry<SigComponentType>> dxbc::getSigComponentTypes() {
return ArrayRef(SigComponentTypes);
}
-static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
- {"SRV", llvm::dxil::ResourceClass::SRV},
- {"UAV", llvm::dxil::ResourceClass::UAV},
- {"CBV", llvm::dxil::ResourceClass::CBuffer},
- {"Sampler", llvm::dxil::ResourceClass::Sampler},
-};
-
-ArrayRef<EnumEntry<llvm::dxil::ResourceClass>> dxbc::getResourceClasses() {
- return ArrayRef(ResourceClassNames);
-}
-
static const EnumEntry<RootFlags> RootFlagNames[] = {
#define ROOT_SIGNATURE_FLAG(Val, Enum) {#Enum, RootFlags::Enum},
#include "llvm/BinaryFormat/DXContainerConstants.def"
diff --git a/llvm/lib/BinaryFormat/MsgPackDocument.cpp b/llvm/lib/BinaryFormat/MsgPackDocument.cpp
index 11598ee..b52f029 100644
--- a/llvm/lib/BinaryFormat/MsgPackDocument.cpp
+++ b/llvm/lib/BinaryFormat/MsgPackDocument.cpp
@@ -104,6 +104,10 @@ DocNode &DocNode::operator=(uint64_t Val) {
*this = getDocument()->getNode(Val);
return *this;
}
+DocNode &DocNode::operator=(double Val) {
+ *this = getDocument()->getNode(Val);
+ return *this;
+}
// A level in the document reading stack.
struct StackLevel {
@@ -293,6 +297,9 @@ void Document::writeToBlob(std::string &Blob) {
case Type::Binary:
MPWriter.write(Node.getBinary());
break;
+ case Type::Float:
+ MPWriter.write(Node.getFloat());
+ break;
case Type::Empty:
llvm_unreachable("unhandled empty msgpack node");
default:
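A small usage sketch of the new double support (getRoot() is assumed from the existing MsgPackDocument interface; the operator= and the Float write path are the pieces added here):

    #include "llvm/BinaryFormat/MsgPackDocument.h"
    using namespace llvm;

    void writeFloat(std::string &Blob) {
      msgpack::Document Doc;
      Doc.getRoot() = 3.25;  // new DocNode::operator=(double)
      Doc.writeToBlob(Blob); // now handles Type::Float
    }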
diff --git a/llvm/lib/BinaryFormat/SFrame.cpp b/llvm/lib/BinaryFormat/SFrame.cpp
index f1765d7..8076a26 100644
--- a/llvm/lib/BinaryFormat/SFrame.cpp
+++ b/llvm/lib/BinaryFormat/SFrame.cpp
@@ -68,3 +68,11 @@ ArrayRef<EnumEntry<sframe::FREOffset>> sframe::getFREOffsets() {
};
return ArrayRef(FREOffsets);
}
+
+ArrayRef<EnumEntry<sframe::BaseReg>> sframe::getBaseRegisters() {
+ static constexpr EnumEntry<sframe::BaseReg> BaseRegs[] = {
+ {"FP", sframe::BaseReg::FP},
+ {"SP", sframe::BaseReg::SP},
+ };
+ return ArrayRef(BaseRegs);
+}
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 290d873..22a0d0f 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -1283,6 +1283,7 @@ static int getDecodedCastOpcode(unsigned Val) {
case bitc::CAST_SITOFP : return Instruction::SIToFP;
case bitc::CAST_FPTRUNC : return Instruction::FPTrunc;
case bitc::CAST_FPEXT : return Instruction::FPExt;
+ case bitc::CAST_PTRTOADDR: return Instruction::PtrToAddr;
case bitc::CAST_PTRTOINT: return Instruction::PtrToInt;
case bitc::CAST_INTTOPTR: return Instruction::IntToPtr;
case bitc::CAST_BITCAST : return Instruction::BitCast;
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 05680fa..a3f8254 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -647,6 +647,7 @@ static unsigned getEncodedCastOpcode(unsigned Opcode) {
case Instruction::SIToFP : return bitc::CAST_SITOFP;
case Instruction::FPTrunc : return bitc::CAST_FPTRUNC;
case Instruction::FPExt : return bitc::CAST_FPEXT;
+ case Instruction::PtrToAddr: return bitc::CAST_PTRTOADDR;
case Instruction::PtrToInt: return bitc::CAST_PTRTOINT;
case Instruction::IntToPtr: return bitc::CAST_INTTOPTR;
case Instruction::BitCast : return bitc::CAST_BITCAST;
diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp
new file mode 100644
index 0000000..73646ad
--- /dev/null
+++ b/llvm/lib/CAS/BuiltinCAS.cpp
@@ -0,0 +1,94 @@
+//===- BuiltinCAS.cpp -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BuiltinCAS.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/Support/Process.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::builtin;
+
+static StringRef getCASIDPrefix() { return "llvmcas://"; }
+void BuiltinCASContext::anchor() {}
+
+Expected<HashType> BuiltinCASContext::parseID(StringRef Reference) {
+ if (!Reference.consume_front(getCASIDPrefix()))
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "invalid cas-id '" + Reference + "'");
+
+ // FIXME: Allow shortened references?
+ if (Reference.size() != 2 * sizeof(HashType))
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "wrong size for cas-id hash '" + Reference + "'");
+
+ std::string Binary;
+ if (!tryGetFromHex(Reference, Binary))
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "invalid hash in cas-id '" + Reference + "'");
+
+ assert(Binary.size() == sizeof(HashType));
+ HashType Digest;
+ llvm::copy(Binary, Digest.data());
+ return Digest;
+}
+
+Expected<CASID> BuiltinCAS::parseID(StringRef Reference) {
+ Expected<HashType> Digest = BuiltinCASContext::parseID(Reference);
+ if (!Digest)
+ return Digest.takeError();
+
+ return CASID::create(&getContext(), toStringRef(*Digest));
+}
+
+void BuiltinCASContext::printID(ArrayRef<uint8_t> Digest, raw_ostream &OS) {
+ SmallString<64> Hash;
+ toHex(Digest, /*LowerCase=*/true, Hash);
+ OS << getCASIDPrefix() << Hash;
+}
+
+void BuiltinCASContext::printIDImpl(raw_ostream &OS, const CASID &ID) const {
+ BuiltinCASContext::printID(ID.getHash(), OS);
+}
+
+const BuiltinCASContext &BuiltinCASContext::getDefaultContext() {
+ static BuiltinCASContext DefaultContext;
+ return DefaultContext;
+}
+
+Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) {
+ return storeImpl(BuiltinObjectHasher<HasherT>::hashObject(*this, Refs, Data),
+ Refs, Data);
+}
+
+Error BuiltinCAS::validate(const CASID &ID) {
+ auto Ref = getReference(ID);
+ if (!Ref)
+ return createUnknownObjectError(ID);
+
+ auto Handle = load(*Ref);
+ if (!Handle)
+ return Handle.takeError();
+
+ auto Proxy = ObjectProxy::load(*this, *Ref, *Handle);
+ SmallVector<ObjectRef> Refs;
+ if (auto E = Proxy.forEachReference([&](ObjectRef Ref) -> Error {
+ Refs.push_back(Ref);
+ return Error::success();
+ }))
+ return E;
+
+ ArrayRef<char> Data(Proxy.getData().data(), Proxy.getData().size());
+ auto Hash = BuiltinObjectHasher<HasherT>::hashObject(*this, Refs, Data);
+ if (!ID.getHash().equals(Hash))
+ return createCorruptObjectError(ID);
+
+ return Error::success();
+}
diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h
new file mode 100644
index 0000000..3b5374d
--- /dev/null
+++ b/llvm/lib/CAS/BuiltinCAS.h
@@ -0,0 +1,74 @@
+//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CAS_BUILTINCAS_H
+#define LLVM_LIB_CAS_BUILTINCAS_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/BuiltinCASContext.h"
+#include "llvm/CAS/ObjectStore.h"
+
+namespace llvm::cas {
+class ActionCache;
+namespace builtin {
+
+/// Common base class for builtin CAS implementations using the same CASContext.
+class BuiltinCAS : public ObjectStore {
+public:
+ BuiltinCAS() : ObjectStore(BuiltinCASContext::getDefaultContext()) {}
+
+ Expected<CASID> parseID(StringRef Reference) final;
+
+ Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) final;
+ virtual Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash,
+ ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) = 0;
+
+ virtual Expected<ObjectRef>
+ storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash,
+ sys::fs::mapped_file_region Map) {
+ return storeImpl(ComputedHash, {}, ArrayRef(Map.data(), Map.size()));
+ }
+
+ /// Both builtin CAS implementations provide lifetime for free, so this can
+ /// be const, and readData() and getDataSize() can be implemented on top of
+ /// it.
+ virtual ArrayRef<char> getDataConst(ObjectHandle Node) const = 0;
+
+ ArrayRef<char> getData(ObjectHandle Node,
+ bool RequiresNullTerminator) const final {
+ // BuiltinCAS Objects are always null terminated.
+ return getDataConst(Node);
+ }
+ uint64_t getDataSize(ObjectHandle Node) const final {
+ return getDataConst(Node).size();
+ }
+
+ Error createUnknownObjectError(const CASID &ID) const {
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "unknown object '" + ID.toString() + "'");
+ }
+
+ Error createCorruptObjectError(const CASID &ID) const {
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "corrupt object '" + ID.toString() + "'");
+ }
+
+ Error createCorruptStorageError() const {
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "corrupt storage");
+ }
+
+ Error validate(const CASID &ID) final;
+};
+
+} // end namespace builtin
+} // end namespace llvm::cas
+
+#endif // LLVM_LIB_CAS_BUILTINCAS_H
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
new file mode 100644
index 0000000..b2825a1
--- /dev/null
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_llvm_component_library(LLVMCAS
+ BuiltinCAS.cpp
+ InMemoryCAS.cpp
+ ObjectStore.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS
+
+ LINK_COMPONENTS
+ Support
+)
diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp
new file mode 100644
index 0000000..255b89c
--- /dev/null
+++ b/llvm/lib/CAS/InMemoryCAS.cpp
@@ -0,0 +1,326 @@
+//===- InMemoryCAS.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BuiltinCAS.h"
+#include "llvm/ADT/LazyAtomicPointer.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/TrieRawHashMap.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ThreadSafeAllocator.h"
+#include "llvm/Support/TrailingObjects.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::builtin;
+
+namespace {
+
+class InMemoryObject;
+
+/// Index of referenced IDs (map: Hash -> InMemoryObject*). Uses
+/// LazyAtomicPointer to coordinate creation of objects.
+using InMemoryIndexT =
+ ThreadSafeTrieRawHashMap<LazyAtomicPointer<const InMemoryObject>,
+ sizeof(HashType)>;
+
+/// Values in \a InMemoryIndexT. \a InMemoryObject instances point at this
+/// to access their hash.
+using InMemoryIndexValueT = InMemoryIndexT::value_type;
+
+/// Builtin InMemory CAS that stores CAS objects in memory.
+class InMemoryObject {
+public:
+ enum class Kind {
+ /// Node with refs and data.
+ RefNode,
+
+ /// Node with refs and data co-allocated.
+ InlineNode,
+
+ Max = InlineNode,
+ };
+
+ Kind getKind() const { return IndexAndKind.getInt(); }
+ const InMemoryIndexValueT &getIndex() const {
+ assert(IndexAndKind.getPointer());
+ return *IndexAndKind.getPointer();
+ }
+
+ ArrayRef<uint8_t> getHash() const { return getIndex().Hash; }
+
+ InMemoryObject() = delete;
+ InMemoryObject(InMemoryObject &&) = delete;
+ InMemoryObject(const InMemoryObject &) = delete;
+
+protected:
+ InMemoryObject(Kind K, const InMemoryIndexValueT &I) : IndexAndKind(&I, K) {}
+
+private:
+ enum Counts : int {
+ NumKindBits = 2,
+ };
+ PointerIntPair<const InMemoryIndexValueT *, NumKindBits, Kind> IndexAndKind;
+ static_assert((1U << NumKindBits) <= alignof(InMemoryIndexValueT),
+ "Kind will clobber pointer");
+ static_assert(((int)Kind::Max >> NumKindBits) == 0, "Kind will be truncated");
+
+public:
+ ArrayRef<char> getData() const;
+
+ ArrayRef<const InMemoryObject *> getRefs() const;
+};
+
+class InMemoryRefObject final : public InMemoryObject {
+public:
+ static constexpr Kind KindValue = Kind::RefNode;
+ static bool classof(const InMemoryObject *O) {
+ return O->getKind() == KindValue;
+ }
+
+ ArrayRef<const InMemoryObject *> getRefsImpl() const { return Refs; }
+ ArrayRef<const InMemoryObject *> getRefs() const { return Refs; }
+ ArrayRef<char> getDataImpl() const { return Data; }
+ ArrayRef<char> getData() const { return Data; }
+
+ static InMemoryRefObject &create(function_ref<void *(size_t Size)> Allocate,
+ const InMemoryIndexValueT &I,
+ ArrayRef<const InMemoryObject *> Refs,
+ ArrayRef<char> Data) {
+ void *Mem = Allocate(sizeof(InMemoryRefObject));
+ return *new (Mem) InMemoryRefObject(I, Refs, Data);
+ }
+
+private:
+ InMemoryRefObject(const InMemoryIndexValueT &I,
+ ArrayRef<const InMemoryObject *> Refs, ArrayRef<char> Data)
+ : InMemoryObject(KindValue, I), Refs(Refs), Data(Data) {
+ assert(isAddrAligned(Align(8), this) && "Expected 8-byte alignment");
+ assert(isAddrAligned(Align(8), Data.data()) && "Expected 8-byte alignment");
+ assert(*Data.end() == 0 && "Expected null-termination");
+ }
+
+ ArrayRef<const InMemoryObject *> Refs;
+ ArrayRef<char> Data;
+};
+
+class InMemoryInlineObject final
+ : public InMemoryObject,
+ public TrailingObjects<InMemoryInlineObject, const InMemoryObject *,
+ char> {
+public:
+ static constexpr Kind KindValue = Kind::InlineNode;
+ static bool classof(const InMemoryObject *O) {
+ return O->getKind() == KindValue;
+ }
+
+ ArrayRef<const InMemoryObject *> getRefs() const { return getRefsImpl(); }
+ ArrayRef<const InMemoryObject *> getRefsImpl() const {
+ return ArrayRef(getTrailingObjects<const InMemoryObject *>(), NumRefs);
+ }
+
+ ArrayRef<char> getData() const { return getDataImpl(); }
+ ArrayRef<char> getDataImpl() const {
+ return ArrayRef(getTrailingObjects<char>(), DataSize);
+ }
+
+ static InMemoryInlineObject &
+ create(function_ref<void *(size_t Size)> Allocate,
+ const InMemoryIndexValueT &I, ArrayRef<const InMemoryObject *> Refs,
+ ArrayRef<char> Data) {
+ void *Mem = Allocate(sizeof(InMemoryInlineObject) +
+ sizeof(uintptr_t) * Refs.size() + Data.size() + 1);
+ return *new (Mem) InMemoryInlineObject(I, Refs, Data);
+ }
+
+ size_t numTrailingObjects(OverloadToken<const InMemoryObject *>) const {
+ return NumRefs;
+ }
+
+private:
+ InMemoryInlineObject(const InMemoryIndexValueT &I,
+ ArrayRef<const InMemoryObject *> Refs,
+ ArrayRef<char> Data)
+ : InMemoryObject(KindValue, I), NumRefs(Refs.size()),
+ DataSize(Data.size()) {
+ auto *BeginRefs = reinterpret_cast<const InMemoryObject **>(this + 1);
+ llvm::copy(Refs, BeginRefs);
+ auto *BeginData = reinterpret_cast<char *>(BeginRefs + NumRefs);
+ llvm::copy(Data, BeginData);
+ BeginData[Data.size()] = 0;
+ }
+ uint32_t NumRefs;
+ uint32_t DataSize;
+};
+
+/// In-memory CAS database and action cache (the latter should be separated).
+class InMemoryCAS : public BuiltinCAS {
+public:
+ Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash,
+ ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) final;
+
+ Expected<ObjectRef>
+ storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash,
+ sys::fs::mapped_file_region Map) override;
+
+ CASID getID(const InMemoryIndexValueT &I) const {
+ StringRef Hash = toStringRef(I.Hash);
+ return CASID::create(&getContext(), Hash);
+ }
+ CASID getID(const InMemoryObject &O) const { return getID(O.getIndex()); }
+
+ ObjectHandle getObjectHandle(const InMemoryObject &Node) const {
+ assert(!(reinterpret_cast<uintptr_t>(&Node) & 0x1ULL));
+ return makeObjectHandle(reinterpret_cast<uintptr_t>(&Node));
+ }
+
+ Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) override {
+ return getObjectHandle(asInMemoryObject(Ref));
+ }
+
+ InMemoryIndexValueT &indexHash(ArrayRef<uint8_t> Hash) {
+ return *Index.insertLazy(
+ Hash, [](auto ValueConstructor) { ValueConstructor.emplace(nullptr); });
+ }
+
+ /// TODO: Consider having callers actually do an insert and return a handle
+ /// to the slot in the trie.
+ const InMemoryObject *getInMemoryObject(CASID ID) const {
+ assert(ID.getContext().getHashSchemaIdentifier() ==
+ getContext().getHashSchemaIdentifier() &&
+ "Expected ID from same hash schema");
+ if (InMemoryIndexT::const_pointer P = Index.find(ID.getHash()))
+ return P->Data;
+ return nullptr;
+ }
+
+ const InMemoryObject &getInMemoryObject(ObjectHandle OH) const {
+ return *reinterpret_cast<const InMemoryObject *>(
+ (uintptr_t)OH.getInternalRef(*this));
+ }
+
+ const InMemoryObject &asInMemoryObject(ReferenceBase Ref) const {
+ uintptr_t P = Ref.getInternalRef(*this);
+ return *reinterpret_cast<const InMemoryObject *>(P);
+ }
+ ObjectRef toReference(const InMemoryObject &O) const {
+ return makeObjectRef(reinterpret_cast<uintptr_t>(&O));
+ }
+
+ CASID getID(ObjectRef Ref) const final { return getIDImpl(Ref); }
+ CASID getIDImpl(ReferenceBase Ref) const {
+ return getID(asInMemoryObject(Ref));
+ }
+
+ std::optional<ObjectRef> getReference(const CASID &ID) const final {
+ if (const InMemoryObject *Object = getInMemoryObject(ID))
+ return toReference(*Object);
+ return std::nullopt;
+ }
+
+ Expected<bool> isMaterialized(ObjectRef Ref) const final { return true; }
+
+ ArrayRef<char> getDataConst(ObjectHandle Node) const final {
+ return cast<InMemoryObject>(asInMemoryObject(Node)).getData();
+ }
+
+ InMemoryCAS() = default;
+
+private:
+ size_t getNumRefs(ObjectHandle Node) const final {
+ return getInMemoryObject(Node).getRefs().size();
+ }
+ ObjectRef readRef(ObjectHandle Node, size_t I) const final {
+ return toReference(*getInMemoryObject(Node).getRefs()[I]);
+ }
+ Error forEachRef(ObjectHandle Node,
+ function_ref<Error(ObjectRef)> Callback) const final;
+
+ /// Index of referenced IDs (map: Hash -> InMemoryObject*). Mapped to nullptr
+ /// as a convenient way to store hashes.
+ ///
+ /// - Insert nullptr on lookups.
+ /// - InMemoryObject points back to here.
+ InMemoryIndexT Index;
+
+ ThreadSafeAllocator<BumpPtrAllocator> Objects;
+ ThreadSafeAllocator<SpecificBumpPtrAllocator<sys::fs::mapped_file_region>>
+ MemoryMaps;
+};
+
+} // end anonymous namespace
+
+ArrayRef<char> InMemoryObject::getData() const {
+ if (auto *Derived = dyn_cast<InMemoryRefObject>(this))
+ return Derived->getDataImpl();
+ return cast<InMemoryInlineObject>(this)->getDataImpl();
+}
+
+ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const {
+ if (auto *Derived = dyn_cast<InMemoryRefObject>(this))
+ return Derived->getRefsImpl();
+ return cast<InMemoryInlineObject>(this)->getRefsImpl();
+}
+
+Expected<ObjectRef>
+InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash,
+ sys::fs::mapped_file_region Map) {
+ // Look up the hash in the index, initializing to nullptr if it's new.
+ ArrayRef<char> Data(Map.data(), Map.size());
+ auto &I = indexHash(ComputedHash);
+
+ // Load or generate.
+ auto Allocator = [&](size_t Size) -> void * {
+ return Objects.Allocate(Size, alignof(InMemoryObject));
+ };
+ auto Generator = [&]() -> const InMemoryObject * {
+ return &InMemoryRefObject::create(Allocator, I, {}, Data);
+ };
+ const InMemoryObject &Node =
+ cast<InMemoryObject>(I.Data.loadOrGenerate(Generator));
+
+ // Save Map if the winning node uses it.
+ if (auto *RefNode = dyn_cast<InMemoryRefObject>(&Node))
+ if (RefNode->getData().data() == Map.data())
+ new (MemoryMaps.Allocate(1)) sys::fs::mapped_file_region(std::move(Map));
+
+ return toReference(Node);
+}
+
+Expected<ObjectRef> InMemoryCAS::storeImpl(ArrayRef<uint8_t> ComputedHash,
+ ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) {
+ // Look up the hash in the index, initializing to nullptr if it's new.
+ auto &I = indexHash(ComputedHash);
+
+ // Create the node.
+ SmallVector<const InMemoryObject *> InternalRefs;
+ for (ObjectRef Ref : Refs)
+ InternalRefs.push_back(&asInMemoryObject(Ref));
+ auto Allocator = [&](size_t Size) -> void * {
+ return Objects.Allocate(Size, alignof(InMemoryObject));
+ };
+ auto Generator = [&]() -> const InMemoryObject * {
+ return &InMemoryInlineObject::create(Allocator, I, InternalRefs, Data);
+ };
+ return toReference(cast<InMemoryObject>(I.Data.loadOrGenerate(Generator)));
+}
+
+Error InMemoryCAS::forEachRef(ObjectHandle Handle,
+ function_ref<Error(ObjectRef)> Callback) const {
+ auto &Node = getInMemoryObject(Handle);
+ for (const InMemoryObject *Ref : Node.getRefs())
+ if (Error E = Callback(toReference(*Ref)))
+ return E;
+ return Error::success();
+}
+
+std::unique_ptr<ObjectStore> cas::createInMemoryCAS() {
+ return std::make_unique<InMemoryCAS>();
+}
diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp
new file mode 100644
index 0000000..e0be50b
--- /dev/null
+++ b/llvm/lib/CAS/ObjectStore.cpp
@@ -0,0 +1,162 @@
+//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <optional>
+
+using namespace llvm;
+using namespace llvm::cas;
+
+void CASContext::anchor() {}
+void ObjectStore::anchor() {}
+
+LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); }
+
+std::string CASID::toString() const {
+ std::string S;
+ raw_string_ostream(S) << *this;
+ return S;
+}
+
+static void printReferenceBase(raw_ostream &OS, StringRef Kind,
+ uint64_t InternalRef, std::optional<CASID> ID) {
+ OS << Kind << "=" << InternalRef;
+ if (ID)
+ OS << "[" << *ID << "]";
+}
+
+void ReferenceBase::print(raw_ostream &OS, const ObjectHandle &This) const {
+ assert(this == &This);
+ printReferenceBase(OS, "object-handle", InternalRef, std::nullopt);
+}
+
+void ReferenceBase::print(raw_ostream &OS, const ObjectRef &This) const {
+ assert(this == &This);
+
+ std::optional<CASID> ID;
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+ if (CAS)
+ ID = CAS->getID(This);
+#endif
+ printReferenceBase(OS, "object-ref", InternalRef, ID);
+}
+
+Expected<ObjectHandle> ObjectStore::load(ObjectRef Ref) {
+ std::optional<ObjectHandle> Handle;
+ if (Error E = loadIfExists(Ref).moveInto(Handle))
+ return std::move(E);
+ if (!Handle)
+ return createStringError(errc::invalid_argument,
+ "missing object '" + getID(Ref).toString() + "'");
+ return *Handle;
+}
+
+std::unique_ptr<MemoryBuffer>
+ObjectStore::getMemoryBuffer(ObjectHandle Node, StringRef Name,
+ bool RequiresNullTerminator) {
+ return MemoryBuffer::getMemBuffer(
+ toStringRef(getData(Node, RequiresNullTerminator)), Name,
+ RequiresNullTerminator);
+}
+
+void ObjectStore::readRefs(ObjectHandle Node,
+ SmallVectorImpl<ObjectRef> &Refs) const {
+ consumeError(forEachRef(Node, [&Refs](ObjectRef Ref) -> Error {
+ Refs.push_back(Ref);
+ return Error::success();
+ }));
+}
+
+Expected<ObjectProxy> ObjectStore::getProxy(const CASID &ID) {
+ std::optional<ObjectRef> Ref = getReference(ID);
+ if (!Ref)
+ return createUnknownObjectError(ID);
+
+ return getProxy(*Ref);
+}
+
+Expected<ObjectProxy> ObjectStore::getProxy(ObjectRef Ref) {
+ std::optional<ObjectHandle> H;
+ if (Error E = load(Ref).moveInto(H))
+ return std::move(E);
+
+ return ObjectProxy::load(*this, Ref, *H);
+}
+
+Expected<std::optional<ObjectProxy>>
+ObjectStore::getProxyIfExists(ObjectRef Ref) {
+ std::optional<ObjectHandle> H;
+ if (Error E = loadIfExists(Ref).moveInto(H))
+ return std::move(E);
+ if (!H)
+ return std::nullopt;
+ return ObjectProxy::load(*this, Ref, *H);
+}
+
+Error ObjectStore::createUnknownObjectError(const CASID &ID) {
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "unknown object '" + ID.toString() + "'");
+}
+
+Expected<ObjectProxy> ObjectStore::createProxy(ArrayRef<ObjectRef> Refs,
+ StringRef Data) {
+ Expected<ObjectRef> Ref = store(Refs, arrayRefFromStringRef<char>(Data));
+ if (!Ref)
+ return Ref.takeError();
+ return getProxy(*Ref);
+}
+
+Expected<ObjectRef>
+ObjectStore::storeFromOpenFileImpl(sys::fs::file_t FD,
+ std::optional<sys::fs::file_status> Status) {
+ // TODO: For the on-disk CAS implementation use cloning to store it as a
+ // standalone file if the file-system supports it and the file is large.
+ uint64_t Size = Status ? Status->getSize() : -1;
+ auto Buffer = MemoryBuffer::getOpenFile(FD, /*Filename=*/"", Size);
+ if (!Buffer)
+ return errorCodeToError(Buffer.getError());
+
+ return store({}, arrayRefFromStringRef<char>((*Buffer)->getBuffer()));
+}
+
+Error ObjectStore::validateTree(ObjectRef Root) {
+ SmallDenseSet<ObjectRef> ValidatedRefs;
+ SmallVector<ObjectRef, 16> RefsToValidate;
+ RefsToValidate.push_back(Root);
+
+ while (!RefsToValidate.empty()) {
+ ObjectRef Ref = RefsToValidate.pop_back_val();
+ auto [I, Inserted] = ValidatedRefs.insert(Ref);
+ if (!Inserted)
+ continue; // already validated.
+ if (Error E = validate(getID(Ref)))
+ return E;
+ Expected<ObjectHandle> Obj = load(Ref);
+ if (!Obj)
+ return Obj.takeError();
+ if (Error E = forEachRef(*Obj, [&RefsToValidate](ObjectRef R) -> Error {
+ RefsToValidate.push_back(R);
+ return Error::success();
+ }))
+ return E;
+ }
+ return Error::success();
+}
+
+std::unique_ptr<MemoryBuffer>
+ObjectProxy::getMemoryBuffer(StringRef Name,
+ bool RequiresNullTerminator) const {
+ return CAS->getMemoryBuffer(H, Name, RequiresNullTerminator);
+}
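With the store in place, a round trip through the in-memory CAS looks roughly as follows (a sketch built only from the interfaces added above; where createInMemoryCAS is declared is an assumption):

    #include "llvm/ADT/StringExtras.h"
    #include "llvm/CAS/ObjectStore.h"
    using namespace llvm;
    using namespace llvm::cas;

    static Error roundTrip() {
      std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();

      // Store a leaf object with no references.
      StringRef Data = "hello";
      Expected<ObjectRef> Leaf =
          CAS->store({}, arrayRefFromStringRef<char>(Data));
      if (!Leaf)
        return Leaf.takeError();

      // IDs print as "llvmcas://<hex digest>" and can be re-parsed.
      Expected<CASID> ID = CAS->parseID(CAS->getID(*Leaf).toString());
      if (!ID)
        return ID.takeError();

      // Load the object back; validateTree re-hashes it and its references.
      Expected<ObjectProxy> Proxy = CAS->getProxy(*Leaf);
      if (!Proxy)
        return Proxy.takeError();

      return CAS->validateTree(*Leaf);
    }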
diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt
index a561830..a943297 100644
--- a/llvm/lib/CMakeLists.txt
+++ b/llvm/lib/CMakeLists.txt
@@ -9,6 +9,7 @@ add_subdirectory(FileCheck)
add_subdirectory(InterfaceStub)
add_subdirectory(IRPrinter)
add_subdirectory(IRReader)
+add_subdirectory(CAS)
add_subdirectory(CGData)
add_subdirectory(CodeGen)
add_subdirectory(CodeGenTypes)
diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp
index e7b9417..2ef96cc 100644
--- a/llvm/lib/CodeGen/Analysis.cpp
+++ b/llvm/lib/CodeGen/Analysis.cpp
@@ -69,18 +69,10 @@ unsigned llvm::ComputeLinearIndex(Type *Ty,
return CurIndex + 1;
}
-/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of
-/// EVTs that represent all the individual underlying
-/// non-aggregate types that comprise it.
-///
-/// If Offsets is non-null, it points to a vector to be filled in
-/// with the in-memory offsets of each of the individual values.
-///
-void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
- Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
- SmallVectorImpl<EVT> *MemVTs,
- SmallVectorImpl<TypeSize> *Offsets,
- TypeSize StartingOffset) {
+void llvm::ComputeValueTypes(const DataLayout &DL, Type *Ty,
+ SmallVectorImpl<Type *> &Types,
+ SmallVectorImpl<TypeSize> *Offsets,
+ TypeSize StartingOffset) {
assert((Ty->isScalableTy() == StartingOffset.isScalable() ||
StartingOffset.isZero()) &&
"Offset/TypeSize mismatch!");
@@ -90,15 +82,13 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
// us to support structs with scalable vectors for operations that don't
// need offsets.
const StructLayout *SL = Offsets ? DL.getStructLayout(STy) : nullptr;
- for (StructType::element_iterator EB = STy->element_begin(),
- EI = EB,
+ for (StructType::element_iterator EB = STy->element_begin(), EI = EB,
EE = STy->element_end();
EI != EE; ++EI) {
// Don't compute the element offset if we didn't get a StructLayout above.
TypeSize EltOffset =
SL ? SL->getElementOffset(EI - EB) : TypeSize::getZero();
- ComputeValueVTs(TLI, DL, *EI, ValueVTs, MemVTs, Offsets,
- StartingOffset + EltOffset);
+ ComputeValueTypes(DL, *EI, Types, Offsets, StartingOffset + EltOffset);
}
return;
}
@@ -107,21 +97,39 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
Type *EltTy = ATy->getElementType();
TypeSize EltSize = DL.getTypeAllocSize(EltTy);
for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
- ComputeValueVTs(TLI, DL, EltTy, ValueVTs, MemVTs, Offsets,
- StartingOffset + i * EltSize);
+ ComputeValueTypes(DL, EltTy, Types, Offsets,
+ StartingOffset + i * EltSize);
return;
}
// Interpret void as zero return values.
if (Ty->isVoidTy())
return;
- // Base case: we can get an EVT for this LLVM IR type.
- ValueVTs.push_back(TLI.getValueType(DL, Ty));
- if (MemVTs)
- MemVTs->push_back(TLI.getMemValueType(DL, Ty));
+ Types.push_back(Ty);
if (Offsets)
Offsets->push_back(StartingOffset);
}
+/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of
+/// EVTs that represent all the individual underlying
+/// non-aggregate types that comprise it.
+///
+/// If Offsets is non-null, it points to a vector to be filled in
+/// with the in-memory offsets of each of the individual values.
+///
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+ Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+ SmallVectorImpl<EVT> *MemVTs,
+ SmallVectorImpl<TypeSize> *Offsets,
+ TypeSize StartingOffset) {
+ SmallVector<Type *> Types;
+ ComputeValueTypes(DL, Ty, Types, Offsets, StartingOffset);
+ for (Type *Ty : Types) {
+ ValueVTs.push_back(TLI.getValueType(DL, Ty));
+ if (MemVTs)
+ MemVTs->push_back(TLI.getMemValueType(DL, Ty));
+ }
+}
+
void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
SmallVectorImpl<EVT> *MemVTs,
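A small illustration of the refactor (offsets assume a typical 64-bit DataLayout; only ComputeValueTypes and ComputeValueVTs come from the code above):

    // For Ty = { i32, [2 x float] }, the type-only walk yields:
    SmallVector<Type *> Types;
    SmallVector<TypeSize> Offsets;
    ComputeValueTypes(DL, Ty, Types, &Offsets, TypeSize::getZero());
    // Types   -> { i32, float, float }
    // Offsets -> { 0, 4, 8 }
    // ComputeValueVTs then maps each Type* through TLI.getValueType().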
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index c72b6e8..23a3543 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3657,6 +3657,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV,
break; // Error
}
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt: {
const DataLayout &DL = getDataLayout();
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 3f3d5dc9..278dd65 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -1915,7 +1915,6 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
// TODO: the "order" argument type is "int", not int32. So
// getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
- ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
Constant *OrderingVal =
ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
@@ -2012,7 +2011,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
if (CASExpected) {
AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
AllocaCASExpected->setAlignment(AllocaAlignment);
- Builder.CreateLifetimeStart(AllocaCASExpected, SizeVal64);
+ Builder.CreateLifetimeStart(AllocaCASExpected);
Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
Args.push_back(AllocaCASExpected);
}
@@ -2026,7 +2025,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
} else {
AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
AllocaValue->setAlignment(AllocaAlignment);
- Builder.CreateLifetimeStart(AllocaValue, SizeVal64);
+ Builder.CreateLifetimeStart(AllocaValue);
Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
Args.push_back(AllocaValue);
}
@@ -2036,7 +2035,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
if (!CASExpected && HasResult && !UseSizedLibcall) {
AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
AllocaResult->setAlignment(AllocaAlignment);
- Builder.CreateLifetimeStart(AllocaResult, SizeVal64);
+ Builder.CreateLifetimeStart(AllocaResult);
Args.push_back(AllocaResult);
}
@@ -2069,7 +2068,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
// And then, extract the results...
if (ValueOperand && !UseSizedLibcall)
- Builder.CreateLifetimeEnd(AllocaValue, SizeVal64);
+ Builder.CreateLifetimeEnd(AllocaValue);
if (CASExpected) {
// The final result from the CAS is {load of 'expected' alloca, bool result
@@ -2078,7 +2077,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
Value *V = PoisonValue::get(FinalResultTy);
Value *ExpectedOut = Builder.CreateAlignedLoad(
CASExpected->getType(), AllocaCASExpected, AllocaAlignment);
- Builder.CreateLifetimeEnd(AllocaCASExpected, SizeVal64);
+ Builder.CreateLifetimeEnd(AllocaCASExpected);
V = Builder.CreateInsertValue(V, ExpectedOut, 0);
V = Builder.CreateInsertValue(V, Result, 1);
I->replaceAllUsesWith(V);
@@ -2089,7 +2088,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
else {
V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
AllocaAlignment);
- Builder.CreateLifetimeEnd(AllocaResult, SizeVal64);
+ Builder.CreateLifetimeEnd(AllocaResult);
}
I->replaceAllUsesWith(V);
}
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index dcfd9aa..7292bc2 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -1787,10 +1787,18 @@ ReoptimizeBlock:
// below were performed for EH "FallThrough" blocks. Therefore, even if
// that appears not to be happening anymore, we should assume that it is
// possible and not remove the "!FallThrough()->isEHPad" condition below.
+ //
+ // Similarly, the analyzeBranch call does not consider callbr, which also
+ // introduces the possibility of infinite rotation, as there may be
+ // multiple successors of PrevBB. Thus we check for that case via
+ // FallThrough->isInlineAsmBrIndirectTarget().
+ // NOTE: Checking if PrevBB contains callbr is more precise, but much
+ // more expensive.
MachineBasicBlock *PrevTBB = nullptr, *PrevFBB = nullptr;
SmallVector<MachineOperand, 4> PrevCond;
- if (FallThrough != MF.end() &&
- !FallThrough->isEHPad() &&
+
+ if (FallThrough != MF.end() && !FallThrough->isEHPad() &&
+ !FallThrough->isInlineAsmBrIndirectTarget() &&
!TII->analyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) &&
PrevBB.isSuccessor(&*FallThrough)) {
MBB->moveAfter(&MF.back());
diff --git a/llvm/lib/CodeGen/CallingConvLower.cpp b/llvm/lib/CodeGen/CallingConvLower.cpp
index b71e781..df34331 100644
--- a/llvm/lib/CodeGen/CallingConvLower.cpp
+++ b/llvm/lib/CodeGen/CallingConvLower.cpp
@@ -89,7 +89,7 @@ CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Ins[i].VT;
ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
- if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this))
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, Ins[i].OrigTy, *this))
report_fatal_error("unable to allocate function argument #" + Twine(i));
}
}
@@ -102,7 +102,7 @@ bool CCState::CheckReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
- if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this))
+ if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, Outs[i].OrigTy, *this))
return false;
}
return true;
@@ -116,7 +116,7 @@ void CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
- if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this))
+ if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, Outs[i].OrigTy, *this))
report_fatal_error("unable to allocate function return #" + Twine(i));
}
}
@@ -129,7 +129,8 @@ void CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
for (unsigned i = 0; i != NumOps; ++i) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
- if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, Outs[i].OrigTy,
+ *this)) {
#ifndef NDEBUG
dbgs() << "Call operand #" << i << " has unhandled type "
<< ArgVT << '\n';
@@ -142,12 +143,13 @@ void CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
/// Same as above except it takes vectors of types and argument flags.
void CCState::AnalyzeCallOperands(SmallVectorImpl<MVT> &ArgVTs,
SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+ SmallVectorImpl<Type *> &OrigTys,
CCAssignFn Fn) {
unsigned NumOps = ArgVTs.size();
for (unsigned i = 0; i != NumOps; ++i) {
MVT ArgVT = ArgVTs[i];
ISD::ArgFlagsTy ArgFlags = Flags[i];
- if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, OrigTys[i], *this)) {
#ifndef NDEBUG
dbgs() << "Call operand #" << i << " has unhandled type "
<< ArgVT << '\n';
@@ -164,7 +166,7 @@ void CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
MVT VT = Ins[i].VT;
ISD::ArgFlagsTy Flags = Ins[i].Flags;
- if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this)) {
+ if (Fn(i, VT, VT, CCValAssign::Full, Flags, Ins[i].OrigTy, *this)) {
#ifndef NDEBUG
dbgs() << "Call result #" << i << " has unhandled type "
<< VT << '\n';
@@ -175,8 +177,8 @@ void CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
}
/// Same as above except it's specialized for calls that produce a single value.
-void CCState::AnalyzeCallResult(MVT VT, CCAssignFn Fn) {
- if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this)) {
+void CCState::AnalyzeCallResult(MVT VT, Type *OrigTy, CCAssignFn Fn) {
+ if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), OrigTy, *this)) {
#ifndef NDEBUG
dbgs() << "Call result has unhandled type "
<< VT << '\n';
@@ -213,7 +215,8 @@ void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCRegister> &Regs,
// location in memory.
bool HaveRegParm;
do {
- if (Fn(0, VT, VT, CCValAssign::Full, Flags, *this)) {
+ Type *OrigTy = EVT(VT).getTypeForEVT(Context);
+ if (Fn(0, VT, VT, CCValAssign::Full, Flags, OrigTy, *this)) {
#ifndef NDEBUG
dbgs() << "Call has unhandled type " << VT
<< " while computing remaining regparms\n";
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 9223739..0e40a92 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -377,7 +377,7 @@ public:
/// to be optimized again.
/// Note: Consider building time in this pass, when a BB updated, we need
/// to insert such BB into FreshBBs for huge function.
- SmallSet<BasicBlock *, 32> FreshBBs;
+ SmallPtrSet<BasicBlock *, 32> FreshBBs;
void releaseMemory() {
// Clear per function information.
@@ -1105,7 +1105,7 @@ bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
/// Replace all old uses with new ones, and push the updated BBs into FreshBBs.
static void replaceAllUsesWith(Value *Old, Value *New,
- SmallSet<BasicBlock *, 32> &FreshBBs,
+ SmallPtrSet<BasicBlock *, 32> &FreshBBs,
bool IsHuge) {
auto *OldI = dyn_cast<Instruction>(Old);
if (OldI) {
@@ -2135,7 +2135,7 @@ static bool isRemOfLoopIncrementWithLoopInvariant(
// Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
const LoopInfo *LI,
- SmallSet<BasicBlock *, 32> &FreshBBs,
+ SmallPtrSet<BasicBlock *, 32> &FreshBBs,
bool IsHuge) {
Value *AddOffset, *RemAmt, *AddInst;
PHINode *LoopIncrPN;
@@ -2534,11 +2534,10 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
/// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
///
/// If the transform is performed, return true and set ModifiedDT to true.
-static bool despeculateCountZeros(IntrinsicInst *CountZeros,
- LoopInfo &LI,
+static bool despeculateCountZeros(IntrinsicInst *CountZeros, LoopInfo &LI,
const TargetLowering *TLI,
const DataLayout *DL, ModifyDT &ModifiedDT,
- SmallSet<BasicBlock *, 32> &FreshBBs,
+ SmallPtrSet<BasicBlock *, 32> &FreshBBs,
bool IsHugeFunc) {
// If a zero input is undefined, it doesn't make sense to despeculate that.
if (match(CountZeros->getOperand(1), m_One()))
@@ -4351,7 +4350,7 @@ private:
PhiNodeSet &PhiNodesToMatch) {
SmallVector<PHIPair, 8> WorkList;
Matcher.insert({PHI, Candidate});
- SmallSet<PHINode *, 8> MatchedPHIs;
+ SmallPtrSet<PHINode *, 8> MatchedPHIs;
MatchedPHIs.insert(PHI);
WorkList.push_back({PHI, Candidate});
SmallSet<PHIPair, 8> Visited;
@@ -8635,7 +8634,7 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
}
static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
- SmallSet<BasicBlock *, 32> &FreshBBs,
+ SmallPtrSet<BasicBlock *, 32> &FreshBBs,
bool IsHugeFunc) {
// Try and convert
// %c = icmp ult %x, 8
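
All of the CodeGenPrepare edits above are a type swap from SmallSet to SmallPtrSet for sets keyed by pointers; as far as I know SmallSet over a pointer type is just a thin wrapper around SmallPtrSet, so naming the pointer set directly is clearer without changing behaviour. A minimal standalone illustration (not taken from the patch):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"
using namespace llvm;

static void trackFreshBlocks(BasicBlock *Updated, BasicBlock *Query) {
  SmallPtrSet<BasicBlock *, 32> FreshBBs; // same shape as the member above
  FreshBBs.insert(Updated);
  if (FreshBBs.contains(Query)) {
    // ... the block would be revisited by later optimizations ...
  }
}
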
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 9b2851e..cd21e25 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -67,6 +67,7 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
@@ -108,6 +109,42 @@ static bool isNeg(Value *V);
static Value *getNegOperand(Value *V);
namespace {
+struct ComplexValue {
+ Value *Real = nullptr;
+ Value *Imag = nullptr;
+
+ bool operator==(const ComplexValue &Other) const {
+ return Real == Other.Real && Imag == Other.Imag;
+ }
+};
+hash_code hash_value(const ComplexValue &Arg) {
+ return hash_combine(DenseMapInfo<Value *>::getHashValue(Arg.Real),
+ DenseMapInfo<Value *>::getHashValue(Arg.Imag));
+}
+} // end namespace
+typedef SmallVector<struct ComplexValue, 2> ComplexValues;
+
+namespace llvm {
+template <> struct DenseMapInfo<ComplexValue> {
+ static inline ComplexValue getEmptyKey() {
+ return {DenseMapInfo<Value *>::getEmptyKey(),
+ DenseMapInfo<Value *>::getEmptyKey()};
+ }
+ static inline ComplexValue getTombstoneKey() {
+ return {DenseMapInfo<Value *>::getTombstoneKey(),
+ DenseMapInfo<Value *>::getTombstoneKey()};
+ }
+ static unsigned getHashValue(const ComplexValue &Val) {
+ return hash_combine(DenseMapInfo<Value *>::getHashValue(Val.Real),
+ DenseMapInfo<Value *>::getHashValue(Val.Imag));
+ }
+ static bool isEqual(const ComplexValue &LHS, const ComplexValue &RHS) {
+ return LHS.Real == RHS.Real && LHS.Imag == RHS.Imag;
+ }
+};
+} // end namespace llvm
+
+namespace {
template <typename T, typename IterT>
std::optional<T> findCommonBetweenCollections(IterT A, IterT B) {
auto Common = llvm::find_if(A, [B](T I) { return llvm::is_contained(B, I); });
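
The hash_value overload and the DenseMapInfo specialization above are what allow ComplexValue, and presumably the ComplexValues vector built from it, to act as DenseMap keys; the CachedResult change later in this file depends on that. A minimal standalone sketch of the pattern, assuming the declarations above are in scope:

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Value.h"

static void cacheSketch(llvm::Value *R, llvm::Value *I) {
  // Lookups use DenseMapInfo<ComplexValue>::getHashValue and isEqual; the
  // empty and tombstone keys come from the same specialization.
  llvm::DenseMap<ComplexValue, unsigned> Cache;
  Cache[{R, I}] = 1;
  (void)Cache.contains({R, I});
}
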
@@ -145,7 +182,13 @@ struct ComplexDeinterleavingCompositeNode {
ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op,
Value *R, Value *I)
- : Operation(Op), Real(R), Imag(I) {}
+ : Operation(Op) {
+ Vals.push_back({R, I});
+ }
+
+ ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op,
+ ComplexValues &Other)
+ : Operation(Op), Vals(Other) {}
private:
friend class ComplexDeinterleavingGraph;
@@ -155,8 +198,7 @@ private:
public:
ComplexDeinterleavingOperation Operation;
- Value *Real;
- Value *Imag;
+ ComplexValues Vals;
// This two members are required exclusively for generating
// ComplexDeinterleavingOperation::Symmetric operations.
@@ -192,10 +234,12 @@ public:
};
OS << "- CompositeNode: " << this << "\n";
- OS << " Real: ";
- PrintValue(Real);
- OS << " Imag: ";
- PrintValue(Imag);
+ for (unsigned I = 0; I < Vals.size(); I++) {
+ OS << " Real(" << I << ") : ";
+ PrintValue(Vals[I].Real);
+ OS << " Imag(" << I << ") : ";
+ PrintValue(Vals[I].Imag);
+ }
OS << " ReplacementNode: ";
PrintValue(ReplacementNode);
OS << " Operation: " << (int)Operation << "\n";
@@ -233,14 +277,16 @@ public:
};
explicit ComplexDeinterleavingGraph(const TargetLowering *TL,
- const TargetLibraryInfo *TLI)
- : TL(TL), TLI(TLI) {}
+ const TargetLibraryInfo *TLI,
+ unsigned Factor)
+ : TL(TL), TLI(TLI), Factor(Factor) {}
private:
const TargetLowering *TL = nullptr;
const TargetLibraryInfo *TLI = nullptr;
+ unsigned Factor;
SmallVector<NodePtr> CompositeNodes;
- DenseMap<std::pair<Value *, Value *>, NodePtr> CachedResult;
+ DenseMap<ComplexValues, NodePtr> CachedResult;
SmallPtrSet<Instruction *, 16> FinalInstructions;
@@ -305,10 +351,25 @@ private:
I);
}
+ NodePtr prepareCompositeNode(ComplexDeinterleavingOperation Operation,
+ ComplexValues &Vals) {
+#ifndef NDEBUG
+ for (auto &V : Vals) {
+ assert(
+ ((Operation != ComplexDeinterleavingOperation::ReductionPHI &&
+ Operation != ComplexDeinterleavingOperation::ReductionOperation) ||
+ (V.Real && V.Imag)) &&
+ "Reduction related nodes must have Real and Imaginary parts");
+ }
+#endif
+ return std::make_shared<ComplexDeinterleavingCompositeNode>(Operation,
+ Vals);
+ }
+
NodePtr submitCompositeNode(NodePtr Node) {
CompositeNodes.push_back(Node);
- if (Node->Real)
- CachedResult[{Node->Real, Node->Imag}] = Node;
+ if (Node->Vals[0].Real)
+ CachedResult[Node->Vals] = Node;
return Node;
}
@@ -340,11 +401,17 @@ private:
/// 270: r: ar + bi
/// i: ai - br
NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
- NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
+ NodePtr identifySymmetricOperation(ComplexValues &Vals);
NodePtr identifyPartialReduction(Value *R, Value *I);
NodePtr identifyDotProduct(Value *Inst);
- NodePtr identifyNode(Value *R, Value *I);
+ NodePtr identifyNode(ComplexValues &Vals);
+
+ NodePtr identifyNode(Value *R, Value *I) {
+ ComplexValues Vals;
+ Vals.push_back({R, I});
+ return identifyNode(Vals);
+ }
/// Determine if a sum of complex numbers can be formed from \p RealAddends
/// and \p ImagAddens. If \p Accumulator is not null, add the result to it.
@@ -388,15 +455,16 @@ private:
/// operation:
/// * Using two shufflevectors with even indices for /pReal instruction and
/// odd indices for /pImag instructions (only for fixed-width vectors)
- /// * Using two extractvalue instructions applied to `vector.deinterleave2`
- /// intrinsic (for both fixed and scalable vectors)
- NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag);
+ /// * Using N extractvalue instructions applied to `vector.deinterleaveN`
+ /// intrinsics (for both fixed and scalable vectors) where N is a multiple of
+ /// 2.
+ NodePtr identifyDeinterleave(ComplexValues &Vals);
/// identifying the operation that represents a complex number repeated in a
/// Splat vector. There are two possible types of splats: ConstantExpr with
/// the opcode ShuffleVector and ShuffleVectorInstr. Both should have an
/// initialization mask with all values set to zero.
- NodePtr identifySplat(Value *Real, Value *Imag);
+ NodePtr identifySplat(ComplexValues &Vals);
NodePtr identifyPHINode(Instruction *Real, Instruction *Imag);
@@ -447,7 +515,7 @@ public:
bool runOnFunction(Function &F);
private:
- bool evaluateBasicBlock(BasicBlock *B);
+ bool evaluateBasicBlock(BasicBlock *B, unsigned Factor);
const TargetLowering *TL = nullptr;
const TargetLibraryInfo *TLI = nullptr;
@@ -500,7 +568,15 @@ bool ComplexDeinterleaving::runOnFunction(Function &F) {
bool Changed = false;
for (auto &B : F)
- Changed |= evaluateBasicBlock(&B);
+ Changed |= evaluateBasicBlock(&B, 2);
+
+ // TODO: Permit changes for both interleave factors in the same function.
+ if (!Changed) {
+ for (auto &B : F)
+ Changed |= evaluateBasicBlock(&B, 4);
+ }
+
+ // TODO: We can also support interleave factors of 6 and 8 if needed.
return Changed;
}
@@ -545,8 +621,8 @@ Value *getNegOperand(Value *V) {
return I->getOperand(1);
}
-bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) {
- ComplexDeinterleavingGraph Graph(TL, TLI);
+bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B, unsigned Factor) {
+ ComplexDeinterleavingGraph Graph(TL, TLI, Factor);
if (Graph.collectPotentialReductions(B))
Graph.identifyReductionNodes();
@@ -669,6 +745,7 @@ ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real,
Instruction *Imag) {
LLVM_DEBUG(dbgs() << "identifyPartialMul " << *Real << " / " << *Imag
<< "\n");
+
// Determine rotation
auto IsAdd = [](unsigned Op) {
return Op == Instruction::FAdd || Op == Instruction::Add;
@@ -865,43 +942,57 @@ static bool isInstructionPotentiallySymmetric(Instruction *I) {
}
ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
- Instruction *Imag) {
- if (Real->getOpcode() != Imag->getOpcode())
- return nullptr;
+ComplexDeinterleavingGraph::identifySymmetricOperation(ComplexValues &Vals) {
+ auto *FirstReal = cast<Instruction>(Vals[0].Real);
+ unsigned FirstOpc = FirstReal->getOpcode();
+ for (auto &V : Vals) {
+ auto *Real = cast<Instruction>(V.Real);
+ auto *Imag = cast<Instruction>(V.Imag);
+ if (Real->getOpcode() != FirstOpc || Imag->getOpcode() != FirstOpc)
+ return nullptr;
- if (!isInstructionPotentiallySymmetric(Real) ||
- !isInstructionPotentiallySymmetric(Imag))
- return nullptr;
+ if (!isInstructionPotentiallySymmetric(Real) ||
+ !isInstructionPotentiallySymmetric(Imag))
+ return nullptr;
- auto *R0 = Real->getOperand(0);
- auto *I0 = Imag->getOperand(0);
+ if (isa<FPMathOperator>(FirstReal))
+ if (Real->getFastMathFlags() != FirstReal->getFastMathFlags() ||
+ Imag->getFastMathFlags() != FirstReal->getFastMathFlags())
+ return nullptr;
+ }
- NodePtr Op0 = identifyNode(R0, I0);
+ ComplexValues OpVals;
+ for (auto &V : Vals) {
+ auto *R0 = cast<Instruction>(V.Real)->getOperand(0);
+ auto *I0 = cast<Instruction>(V.Imag)->getOperand(0);
+ OpVals.push_back({R0, I0});
+ }
+
+ NodePtr Op0 = identifyNode(OpVals);
NodePtr Op1 = nullptr;
if (Op0 == nullptr)
return nullptr;
- if (Real->isBinaryOp()) {
- auto *R1 = Real->getOperand(1);
- auto *I1 = Imag->getOperand(1);
- Op1 = identifyNode(R1, I1);
+ if (FirstReal->isBinaryOp()) {
+ OpVals.clear();
+ for (auto &V : Vals) {
+ auto *R1 = cast<Instruction>(V.Real)->getOperand(1);
+ auto *I1 = cast<Instruction>(V.Imag)->getOperand(1);
+ OpVals.push_back({R1, I1});
+ }
+ Op1 = identifyNode(OpVals);
if (Op1 == nullptr)
return nullptr;
}
- if (isa<FPMathOperator>(Real) &&
- Real->getFastMathFlags() != Imag->getFastMathFlags())
- return nullptr;
-
- auto Node = prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric,
- Real, Imag);
- Node->Opcode = Real->getOpcode();
- if (isa<FPMathOperator>(Real))
- Node->Flags = Real->getFastMathFlags();
+ auto Node =
+ prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric, Vals);
+ Node->Opcode = FirstReal->getOpcode();
+ if (isa<FPMathOperator>(FirstReal))
+ Node->Flags = FirstReal->getFastMathFlags();
Node->addOperand(Op0);
- if (Real->isBinaryOp())
+ if (FirstReal->isBinaryOp())
Node->addOperand(Op1);
return submitCompositeNode(Node);
@@ -909,7 +1000,6 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
ComplexDeinterleavingGraph::NodePtr
ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
-
if (!TL->isComplexDeinterleavingOperationSupported(
ComplexDeinterleavingOperation::CDot, V->getType())) {
LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving "
@@ -1054,65 +1144,77 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
}
ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
- auto It = CachedResult.find({R, I});
+ComplexDeinterleavingGraph::identifyNode(ComplexValues &Vals) {
+ auto It = CachedResult.find(Vals);
if (It != CachedResult.end()) {
LLVM_DEBUG(dbgs() << " - Folding to existing node\n");
return It->second;
}
- if (NodePtr CN = identifyPartialReduction(R, I))
- return CN;
-
- bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I);
- if (!IsReduction && R->getType() != I->getType())
- return nullptr;
+ if (Vals.size() == 1) {
+ assert(Factor == 2 && "Can only handle interleave factors of 2");
+ Value *R = Vals[0].Real;
+ Value *I = Vals[0].Imag;
+ if (NodePtr CN = identifyPartialReduction(R, I))
+ return CN;
+ bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I);
+ if (!IsReduction && R->getType() != I->getType())
+ return nullptr;
+ }
- if (NodePtr CN = identifySplat(R, I))
+ if (NodePtr CN = identifySplat(Vals))
return CN;
- auto *Real = dyn_cast<Instruction>(R);
- auto *Imag = dyn_cast<Instruction>(I);
- if (!Real || !Imag)
- return nullptr;
+ for (auto &V : Vals) {
+ auto *Real = dyn_cast<Instruction>(V.Real);
+ auto *Imag = dyn_cast<Instruction>(V.Imag);
+ if (!Real || !Imag)
+ return nullptr;
+ }
- if (NodePtr CN = identifyDeinterleave(Real, Imag))
+ if (NodePtr CN = identifyDeinterleave(Vals))
return CN;
- if (NodePtr CN = identifyPHINode(Real, Imag))
- return CN;
+ if (Vals.size() == 1) {
+ assert(Factor == 2 && "Can only handle interleave factors of 2");
+ auto *Real = dyn_cast<Instruction>(Vals[0].Real);
+ auto *Imag = dyn_cast<Instruction>(Vals[0].Imag);
+ if (NodePtr CN = identifyPHINode(Real, Imag))
+ return CN;
- if (NodePtr CN = identifySelectNode(Real, Imag))
- return CN;
+ if (NodePtr CN = identifySelectNode(Real, Imag))
+ return CN;
- auto *VTy = cast<VectorType>(Real->getType());
- auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
+ auto *VTy = cast<VectorType>(Real->getType());
+ auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
- bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported(
- ComplexDeinterleavingOperation::CMulPartial, NewVTy);
- bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported(
- ComplexDeinterleavingOperation::CAdd, NewVTy);
+ bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported(
+ ComplexDeinterleavingOperation::CMulPartial, NewVTy);
+ bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported(
+ ComplexDeinterleavingOperation::CAdd, NewVTy);
- if (HasCMulSupport && isInstructionPairMul(Real, Imag)) {
- if (NodePtr CN = identifyPartialMul(Real, Imag))
- return CN;
- }
+ if (HasCMulSupport && isInstructionPairMul(Real, Imag)) {
+ if (NodePtr CN = identifyPartialMul(Real, Imag))
+ return CN;
+ }
- if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) {
- if (NodePtr CN = identifyAdd(Real, Imag))
- return CN;
- }
+ if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) {
+ if (NodePtr CN = identifyAdd(Real, Imag))
+ return CN;
+ }
- if (HasCMulSupport && HasCAddSupport) {
- if (NodePtr CN = identifyReassocNodes(Real, Imag))
- return CN;
+ if (HasCMulSupport && HasCAddSupport) {
+ if (NodePtr CN = identifyReassocNodes(Real, Imag)) {
+ return CN;
+ }
+ }
}
- if (NodePtr CN = identifySymmetricOperation(Real, Imag))
+ if (NodePtr CN = identifySymmetricOperation(Vals))
return CN;
LLVM_DEBUG(dbgs() << " - Not recognised as a valid pattern.\n");
- CachedResult[{R, I}] = nullptr;
+ CachedResult[Vals] = nullptr;
return nullptr;
}
@@ -1256,9 +1358,10 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real,
return nullptr;
}
assert(FinalNode && "FinalNode can not be nullptr here");
+ assert(FinalNode->Vals.size() == 1);
// Set the Real and Imag fields of the final node and submit it
- FinalNode->Real = Real;
- FinalNode->Imag = Imag;
+ FinalNode->Vals[0].Real = Real;
+ FinalNode->Vals[0].Imag = Imag;
submitCompositeNode(FinalNode);
return FinalNode;
}
@@ -1381,7 +1484,7 @@ ComplexDeinterleavingGraph::identifyMultiplications(
auto NodeA = It->second;
auto NodeB = PMI.Node;
- auto IsMultiplicandReal = PMI.Common == NodeA->Real;
+ auto IsMultiplicandReal = PMI.Common == NodeA->Vals[0].Real;
// The following table illustrates the relationship between multiplications
// and rotations. If we consider the multiplication (X + iY) * (U + iV), we
// can see:
@@ -1423,10 +1526,10 @@ ComplexDeinterleavingGraph::identifyMultiplications(
LLVM_DEBUG({
dbgs() << "Identified partial multiplication (X, Y) * (U, V):\n";
- dbgs().indent(4) << "X: " << *NodeA->Real << "\n";
- dbgs().indent(4) << "Y: " << *NodeA->Imag << "\n";
- dbgs().indent(4) << "U: " << *NodeB->Real << "\n";
- dbgs().indent(4) << "V: " << *NodeB->Imag << "\n";
+ dbgs().indent(4) << "X: " << *NodeA->Vals[0].Real << "\n";
+ dbgs().indent(4) << "Y: " << *NodeA->Vals[0].Imag << "\n";
+ dbgs().indent(4) << "U: " << *NodeB->Vals[0].Real << "\n";
+ dbgs().indent(4) << "V: " << *NodeB->Vals[0].Imag << "\n";
dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n";
});
@@ -1595,10 +1698,13 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
ComplexDeinterleavingOperation::ReductionOperation ||
RootNode->Operation ==
ComplexDeinterleavingOperation::ReductionSingle);
+ assert(RootNode->Vals.size() == 1 &&
+ "Cannot handle reductions involving multiple complex values");
// Find out which part, Real or Imag, comes later, and only if we come to
// the latest part, add it to OrderedRoots.
- auto *R = cast<Instruction>(RootNode->Real);
- auto *I = RootNode->Imag ? cast<Instruction>(RootNode->Imag) : nullptr;
+ auto *R = cast<Instruction>(RootNode->Vals[0].Real);
+ auto *I = RootNode->Vals[0].Imag ? cast<Instruction>(RootNode->Vals[0].Imag)
+ : nullptr;
Instruction *ReplacementAnchor;
if (I)
@@ -1631,6 +1737,8 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) {
bool FoundPotentialReduction = false;
+ if (Factor != 2)
+ return false;
auto *Br = dyn_cast<BranchInst>(B->getTerminator());
if (!Br || Br->getNumSuccessors() != 2)
@@ -1682,6 +1790,8 @@ bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) {
}
void ComplexDeinterleavingGraph::identifyReductionNodes() {
+ assert(Factor == 2 && "Cannot handle multiple complex values");
+
SmallVector<bool> Processed(ReductionInfo.size(), false);
SmallVector<Instruction *> OperationInstruction;
for (auto &P : ReductionInfo)
@@ -1771,11 +1881,11 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
}
bool ComplexDeinterleavingGraph::checkNodes() {
-
bool FoundDeinterleaveNode = false;
for (NodePtr N : CompositeNodes) {
if (!N->areOperandsValid())
return false;
+
if (N->Operation == ComplexDeinterleavingOperation::Deinterleave)
FoundDeinterleaveNode = true;
}
@@ -1861,17 +1971,33 @@ bool ComplexDeinterleavingGraph::checkNodes() {
ComplexDeinterleavingGraph::NodePtr
ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) {
if (auto *Intrinsic = dyn_cast<IntrinsicInst>(RootI)) {
- if (Intrinsic->getIntrinsicID() != Intrinsic::vector_interleave2)
+ if (Intrinsic::getInterleaveIntrinsicID(Factor) !=
+ Intrinsic->getIntrinsicID())
return nullptr;
- auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(0));
- auto *Imag = dyn_cast<Instruction>(Intrinsic->getOperand(1));
- if (!Real || !Imag)
- return nullptr;
+ ComplexValues Vals;
+ for (unsigned I = 0; I < Factor; I += 2) {
+ auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(I));
+ auto *Imag = dyn_cast<Instruction>(Intrinsic->getOperand(I + 1));
+ if (!Real || !Imag)
+ return nullptr;
+ Vals.push_back({Real, Imag});
+ }
- return identifyNode(Real, Imag);
+ ComplexDeinterleavingGraph::NodePtr Node1 = identifyNode(Vals);
+ if (!Node1)
+ return nullptr;
+ return Node1;
}
+ // TODO: We could also add support for fixed-width interleave factors of 4
+ // and above, but currently for symmetric operations the interleaves and
+ // deinterleaves are already removed by VectorCombine. If we extend this to
+ // permit complex multiplications, reductions, etc. then we should also add
+ // support for fixed-width here.
+ if (Factor != 2)
+ return nullptr;
+
auto *SVI = dyn_cast<ShuffleVectorInst>(RootI);
if (!SVI)
return nullptr;
@@ -1890,22 +2016,52 @@ ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) {
}
ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real,
- Instruction *Imag) {
- Instruction *I = nullptr;
- Value *FinalValue = nullptr;
- if (match(Real, m_ExtractValue<0>(m_Instruction(I))) &&
- match(Imag, m_ExtractValue<1>(m_Specific(I))) &&
- match(I, m_Intrinsic<Intrinsic::vector_deinterleave2>(
- m_Value(FinalValue)))) {
+ComplexDeinterleavingGraph::identifyDeinterleave(ComplexValues &Vals) {
+ Instruction *II = nullptr;
+
+  // There must be at least one complex value.
+ auto CheckExtract = [&](Value *V, unsigned ExpectedIdx,
+ Instruction *ExpectedInsn) -> ExtractValueInst * {
+ auto *EVI = dyn_cast<ExtractValueInst>(V);
+ if (!EVI || EVI->getNumIndices() != 1 ||
+ EVI->getIndices()[0] != ExpectedIdx ||
+ !isa<Instruction>(EVI->getAggregateOperand()) ||
+ (ExpectedInsn && ExpectedInsn != EVI->getAggregateOperand()))
+ return nullptr;
+ return EVI;
+ };
+
+ for (unsigned Idx = 0; Idx < Vals.size(); Idx++) {
+ ExtractValueInst *RealEVI = CheckExtract(Vals[Idx].Real, Idx * 2, II);
+ if (RealEVI && Idx == 0)
+ II = cast<Instruction>(RealEVI->getAggregateOperand());
+ if (!RealEVI || !CheckExtract(Vals[Idx].Imag, (Idx * 2) + 1, II)) {
+ II = nullptr;
+ break;
+ }
+ }
+
+ if (auto *IntrinsicII = dyn_cast_or_null<IntrinsicInst>(II)) {
+ if (IntrinsicII->getIntrinsicID() !=
+ Intrinsic::getDeinterleaveIntrinsicID(2 * Vals.size()))
+ return nullptr;
+
+ // The remaining should match too.
NodePtr PlaceholderNode = prepareCompositeNode(
- llvm::ComplexDeinterleavingOperation::Deinterleave, Real, Imag);
- PlaceholderNode->ReplacementNode = FinalValue;
- FinalInstructions.insert(Real);
- FinalInstructions.insert(Imag);
+ llvm::ComplexDeinterleavingOperation::Deinterleave, Vals);
+ PlaceholderNode->ReplacementNode = II->getOperand(0);
+ for (auto &V : Vals) {
+ FinalInstructions.insert(cast<Instruction>(V.Real));
+ FinalInstructions.insert(cast<Instruction>(V.Imag));
+ }
return submitCompositeNode(PlaceholderNode);
}
+ if (Vals.size() != 1)
+ return nullptr;
+
+ Value *Real = Vals[0].Real;
+ Value *Imag = Vals[0].Imag;
auto *RealShuffle = dyn_cast<ShuffleVectorInst>(Real);
auto *ImagShuffle = dyn_cast<ShuffleVectorInst>(Imag);
if (!RealShuffle || !ImagShuffle) {
@@ -1999,7 +2155,7 @@ ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real,
}
ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) {
+ComplexDeinterleavingGraph::identifySplat(ComplexValues &Vals) {
auto IsSplat = [](Value *V) -> bool {
// Fixed-width vector with constants
if (isa<ConstantDataVector>(V))
@@ -2033,24 +2189,39 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) {
return all_equal(Mask) && Mask[0] == 0;
};
- if (!IsSplat(R) || !IsSplat(I))
- return nullptr;
-
- auto *Real = dyn_cast<Instruction>(R);
- auto *Imag = dyn_cast<Instruction>(I);
- if ((!Real && Imag) || (Real && !Imag))
- return nullptr;
+ // The splats must meet the following requirements:
+  // 1. They must either all be instructions or all be non-instruction values.
+ // 2. Non-constant splats must live in the same block.
+ if (auto *FirstValAsInstruction = dyn_cast<Instruction>(Vals[0].Real)) {
+ BasicBlock *FirstBB = FirstValAsInstruction->getParent();
+ for (auto &V : Vals) {
+ if (!IsSplat(V.Real) || !IsSplat(V.Imag))
+ return nullptr;
- if (Real && Imag) {
- // Non-constant splats should be in the same basic block
- if (Real->getParent() != Imag->getParent())
- return nullptr;
+ auto *Real = dyn_cast<Instruction>(V.Real);
+ auto *Imag = dyn_cast<Instruction>(V.Imag);
+ if (!Real || !Imag || Real->getParent() != FirstBB ||
+ Imag->getParent() != FirstBB)
+ return nullptr;
+ }
+ } else {
+ for (auto &V : Vals) {
+ if (!IsSplat(V.Real) || !IsSplat(V.Imag) || isa<Instruction>(V.Real) ||
+ isa<Instruction>(V.Imag))
+ return nullptr;
+ }
+ }
- FinalInstructions.insert(Real);
- FinalInstructions.insert(Imag);
+ for (auto &V : Vals) {
+ auto *Real = dyn_cast<Instruction>(V.Real);
+ auto *Imag = dyn_cast<Instruction>(V.Imag);
+ if (Real && Imag) {
+ FinalInstructions.insert(Real);
+ FinalInstructions.insert(Imag);
+ }
}
NodePtr PlaceholderNode =
- prepareCompositeNode(ComplexDeinterleavingOperation::Splat, R, I);
+ prepareCompositeNode(ComplexDeinterleavingOperation::Splat, Vals);
return submitCompositeNode(PlaceholderNode);
}
@@ -2186,24 +2357,35 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
llvm_unreachable("Deinterleave node should already have ReplacementNode");
break;
case ComplexDeinterleavingOperation::Splat: {
- auto *R = dyn_cast<Instruction>(Node->Real);
- auto *I = dyn_cast<Instruction>(Node->Imag);
+ SmallVector<Value *> Ops;
+ for (auto &V : Node->Vals) {
+ Ops.push_back(V.Real);
+ Ops.push_back(V.Imag);
+ }
+ auto *R = dyn_cast<Instruction>(Node->Vals[0].Real);
+ auto *I = dyn_cast<Instruction>(Node->Vals[0].Imag);
if (R && I) {
// Splats that are not constant are interleaved where they are located
- Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode();
+ Instruction *InsertPoint = R;
+ for (auto V : Node->Vals) {
+ if (InsertPoint->comesBefore(cast<Instruction>(V.Real)))
+ InsertPoint = cast<Instruction>(V.Real);
+ if (InsertPoint->comesBefore(cast<Instruction>(V.Imag)))
+ InsertPoint = cast<Instruction>(V.Imag);
+ }
+ InsertPoint = InsertPoint->getNextNode();
IRBuilder<> IRB(InsertPoint);
- ReplacementNode = IRB.CreateVectorInterleave({Node->Real, Node->Imag});
+ ReplacementNode = IRB.CreateVectorInterleave(Ops);
} else {
- ReplacementNode =
- Builder.CreateVectorInterleave({Node->Real, Node->Imag});
+ ReplacementNode = Builder.CreateVectorInterleave(Ops);
}
break;
}
case ComplexDeinterleavingOperation::ReductionPHI: {
// If Operation is ReductionPHI, a new empty PHINode is created.
// It is filled later when the ReductionOperation is processed.
- auto *OldPHI = cast<PHINode>(Node->Real);
- auto *VTy = cast<VectorType>(Node->Real->getType());
+ auto *OldPHI = cast<PHINode>(Node->Vals[0].Real);
+ auto *VTy = cast<VectorType>(Node->Vals[0].Real->getType());
auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHIIt());
OldToNewPHI[OldPHI] = NewPHI;
@@ -2219,8 +2401,8 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
processReductionOperation(ReplacementNode, Node);
break;
case ComplexDeinterleavingOperation::ReductionSelect: {
- auto *MaskReal = cast<Instruction>(Node->Real)->getOperand(0);
- auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0);
+ auto *MaskReal = cast<Instruction>(Node->Vals[0].Real)->getOperand(0);
+ auto *MaskImag = cast<Instruction>(Node->Vals[0].Imag)->getOperand(0);
auto *A = replaceNode(Builder, Node->Operands[0]);
auto *B = replaceNode(Builder, Node->Operands[1]);
auto *NewMask = Builder.CreateVectorInterleave({MaskReal, MaskImag});
@@ -2237,7 +2419,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
void ComplexDeinterleavingGraph::processReductionSingle(
Value *OperationReplacement, RawNodePtr Node) {
- auto *Real = cast<Instruction>(Node->Real);
+ auto *Real = cast<Instruction>(Node->Vals[0].Real);
auto *OldPHI = ReductionInfo[Real].first;
auto *NewPHI = OldToNewPHI[OldPHI];
auto *VTy = cast<VectorType>(Real->getType());
@@ -2269,8 +2451,8 @@ void ComplexDeinterleavingGraph::processReductionSingle(
void ComplexDeinterleavingGraph::processReductionOperation(
Value *OperationReplacement, RawNodePtr Node) {
- auto *Real = cast<Instruction>(Node->Real);
- auto *Imag = cast<Instruction>(Node->Imag);
+ auto *Real = cast<Instruction>(Node->Vals[0].Real);
+ auto *Imag = cast<Instruction>(Node->Vals[0].Imag);
auto *OldPHIReal = ReductionInfo[Real].first;
auto *OldPHIImag = ReductionInfo[Imag].first;
auto *NewPHI = OldToNewPHI[OldPHIReal];
@@ -2318,15 +2500,15 @@ void ComplexDeinterleavingGraph::replaceNodes() {
if (RootNode->Operation ==
ComplexDeinterleavingOperation::ReductionOperation) {
- auto *RootReal = cast<Instruction>(RootNode->Real);
- auto *RootImag = cast<Instruction>(RootNode->Imag);
+ auto *RootReal = cast<Instruction>(RootNode->Vals[0].Real);
+ auto *RootImag = cast<Instruction>(RootNode->Vals[0].Imag);
ReductionInfo[RootReal].first->removeIncomingValue(BackEdge);
ReductionInfo[RootImag].first->removeIncomingValue(BackEdge);
DeadInstrRoots.push_back(RootReal);
DeadInstrRoots.push_back(RootImag);
} else if (RootNode->Operation ==
ComplexDeinterleavingOperation::ReductionSingle) {
- auto *RootInst = cast<Instruction>(RootNode->Real);
+ auto *RootInst = cast<Instruction>(RootNode->Vals[0].Real);
auto &Info = ReductionInfo[RootInst];
Info.first->removeIncomingValue(BackEdge);
DeadInstrRoots.push_back(Info.second);
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index d8e3f5f..753c656 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -508,8 +508,7 @@ bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
IRBuilder<> Builder(VPI.getParent(), VPI.getIterator());
Value *FactorConst = Builder.getInt32(StaticElemCount.getKnownMinValue());
Value *VScale = Builder.CreateVScale(Int32Ty, "vscale");
- MaxEVL = Builder.CreateMul(VScale, FactorConst, "scalable_size",
- /*NUW*/ true, /*NSW*/ false);
+ MaxEVL = Builder.CreateNUWMul(VScale, FactorConst, "scalable_size");
} else {
MaxEVL = ConstantInt::get(Int32Ty, StaticElemCount.getFixedValue(), false);
}
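
For reference, the two builder spellings produce the same instruction; the dedicated helper just avoids the easy-to-misread trailing booleans. A small self-contained sketch (the wrapper function is illustrative, not from the patch):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *scalableSize(IRBuilder<> &Builder, Value *VScale,
                           Value *FactorConst) {
  // Equivalent to CreateMul(VScale, FactorConst, "scalable_size",
  //                         /*HasNUW=*/true, /*HasNSW=*/false).
  return Builder.CreateNUWMul(VScale, FactorConst, "scalable_size");
}
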
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 0f3ec8b..90a18b86 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -1099,7 +1099,7 @@ bool CallLowering::checkReturn(CCState &CCInfo,
CCAssignFn *Fn) const {
for (unsigned I = 0, E = Outs.size(); I < E; ++I) {
MVT VT = MVT::getVT(Outs[I].Ty);
- if (Fn(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo))
+ if (Fn(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], Outs[I].Ty, CCInfo))
return false;
}
return true;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index e84ba91..8163dea 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1821,10 +1821,29 @@ bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI,
return false;
}
+ // Reassociating nuw additions preserves nuw. If both original G_PTR_ADDs are
+ // inbounds, reaching the same result in one G_PTR_ADD is also inbounds.
+ // The nusw constraints are satisfied because imm1+imm2 cannot exceed the
+ // largest signed integer that fits into the index type, which is the maximum
+ // size of allocated objects according to the IR Language Reference.
+ unsigned PtrAddFlags = MI.getFlags();
+ unsigned LHSPtrAddFlags = Add2Def->getFlags();
+ bool IsNoUWrap = PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::NoUWrap;
+ bool IsInBounds =
+ PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::InBounds;
+ unsigned Flags = 0;
+ if (IsNoUWrap)
+ Flags |= MachineInstr::MIFlag::NoUWrap;
+ if (IsInBounds) {
+ Flags |= MachineInstr::MIFlag::InBounds;
+ Flags |= MachineInstr::MIFlag::NoUSWrap;
+ }
+
// Pass the combined immediate to the apply function.
MatchInfo.Imm = AMNew.BaseOffs;
MatchInfo.Base = Base;
MatchInfo.Bank = getRegBank(Imm2);
+ MatchInfo.Flags = Flags;
return true;
}
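
The same flag computation recurs in the reassociation combines below, so here is a standalone restatement of the bit logic this hunk applies (the helper name is mine, not the patch's):

#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// nuw survives only if both original G_PTR_ADDs had it; inbounds survives only
// if both had it, and for the folded constant offset it also implies nusw, as
// the comment above explains.
static unsigned combineFoldedPtrAddFlags(unsigned OuterFlags,
                                         unsigned InnerFlags) {
  unsigned Flags = 0;
  if (OuterFlags & InnerFlags & MachineInstr::MIFlag::NoUWrap)
    Flags |= MachineInstr::MIFlag::NoUWrap;
  if (OuterFlags & InnerFlags & MachineInstr::MIFlag::InBounds)
    Flags |= MachineInstr::MIFlag::InBounds | MachineInstr::MIFlag::NoUSWrap;
  return Flags;
}
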
@@ -1838,6 +1857,7 @@ void CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI,
Observer.changingInstr(MI);
MI.getOperand(1).setReg(MatchInfo.Base);
MI.getOperand(2).setReg(NewOffset.getReg(0));
+ MI.setFlags(MatchInfo.Flags);
Observer.changedInstr(MI);
}
@@ -4871,14 +4891,34 @@ bool CombinerHelper::matchReassocConstantInnerRHS(GPtrAdd &MI,
if (!C2)
return false;
+ // If both additions are nuw, the reassociated additions are also nuw.
+ // If the original G_PTR_ADD is additionally nusw, X and C are both not
+ // negative, so BASE+X is between BASE and BASE+(X+C). The new G_PTR_ADDs are
+ // therefore also nusw.
+ // If the original G_PTR_ADD is additionally inbounds (which implies nusw),
+ // the new G_PTR_ADDs are then also inbounds.
+ unsigned PtrAddFlags = MI.getFlags();
+ unsigned AddFlags = RHS->getFlags();
+ bool IsNoUWrap = PtrAddFlags & AddFlags & MachineInstr::MIFlag::NoUWrap;
+ bool IsNoUSWrap = IsNoUWrap && (PtrAddFlags & MachineInstr::MIFlag::NoUSWrap);
+ bool IsInBounds = IsNoUWrap && (PtrAddFlags & MachineInstr::MIFlag::InBounds);
+ unsigned Flags = 0;
+ if (IsNoUWrap)
+ Flags |= MachineInstr::MIFlag::NoUWrap;
+ if (IsNoUSWrap)
+ Flags |= MachineInstr::MIFlag::NoUSWrap;
+ if (IsInBounds)
+ Flags |= MachineInstr::MIFlag::InBounds;
+
MatchInfo = [=, &MI](MachineIRBuilder &B) {
LLT PtrTy = MRI.getType(MI.getOperand(0).getReg());
auto NewBase =
- Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg());
+ Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg(), Flags);
Observer.changingInstr(MI);
MI.getOperand(1).setReg(NewBase.getReg(0));
MI.getOperand(2).setReg(RHS->getOperand(2).getReg());
+ MI.setFlags(Flags);
Observer.changedInstr(MI);
};
return !reassociationCanBreakAddressingModePattern(MI);
@@ -4897,6 +4937,25 @@ bool CombinerHelper::matchReassocConstantInnerLHS(GPtrAdd &MI,
return false;
auto *LHSPtrAdd = cast<GPtrAdd>(LHS);
+
+ // Reassociating nuw additions preserves nuw. If both original G_PTR_ADDs are
+ // nuw and inbounds (which implies nusw), the offsets are both non-negative,
+ // so the new G_PTR_ADDs are also inbounds.
+ unsigned PtrAddFlags = MI.getFlags();
+ unsigned LHSPtrAddFlags = LHSPtrAdd->getFlags();
+ bool IsNoUWrap = PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::NoUWrap;
+ bool IsNoUSWrap = IsNoUWrap && (PtrAddFlags & LHSPtrAddFlags &
+ MachineInstr::MIFlag::NoUSWrap);
+ bool IsInBounds = IsNoUWrap && (PtrAddFlags & LHSPtrAddFlags &
+ MachineInstr::MIFlag::InBounds);
+ unsigned Flags = 0;
+ if (IsNoUWrap)
+ Flags |= MachineInstr::MIFlag::NoUWrap;
+ if (IsNoUSWrap)
+ Flags |= MachineInstr::MIFlag::NoUSWrap;
+ if (IsInBounds)
+ Flags |= MachineInstr::MIFlag::InBounds;
+
MatchInfo = [=, &MI](MachineIRBuilder &B) {
// When we change LHSPtrAdd's offset register we might cause it to use a reg
// before its def. Sink the instruction so the outer PTR_ADD to ensure this
@@ -4907,9 +4966,11 @@ bool CombinerHelper::matchReassocConstantInnerLHS(GPtrAdd &MI,
auto NewCst = B.buildConstant(MRI.getType(RHSReg), LHSCstOff->Value);
Observer.changingInstr(MI);
MI.getOperand(2).setReg(NewCst.getReg(0));
+ MI.setFlags(Flags);
Observer.changedInstr(MI);
Observer.changingInstr(*LHSPtrAdd);
LHSPtrAdd->getOperand(2).setReg(RHSReg);
+ LHSPtrAdd->setFlags(Flags);
Observer.changedInstr(*LHSPtrAdd);
};
return !reassociationCanBreakAddressingModePattern(MI);
@@ -4933,11 +4994,30 @@ bool CombinerHelper::matchReassocFoldConstantsInSubTree(
if (!C2)
return false;
+ // Reassociating nuw additions preserves nuw. If both original G_PTR_ADDs are
+ // inbounds, reaching the same result in one G_PTR_ADD is also inbounds.
+ // The nusw constraints are satisfied because imm1+imm2 cannot exceed the
+ // largest signed integer that fits into the index type, which is the maximum
+ // size of allocated objects according to the IR Language Reference.
+ unsigned PtrAddFlags = MI.getFlags();
+ unsigned LHSPtrAddFlags = LHSPtrAdd->getFlags();
+ bool IsNoUWrap = PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::NoUWrap;
+ bool IsInBounds =
+ PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::InBounds;
+ unsigned Flags = 0;
+ if (IsNoUWrap)
+ Flags |= MachineInstr::MIFlag::NoUWrap;
+ if (IsInBounds) {
+ Flags |= MachineInstr::MIFlag::InBounds;
+ Flags |= MachineInstr::MIFlag::NoUSWrap;
+ }
+
MatchInfo = [=, &MI](MachineIRBuilder &B) {
auto NewCst = B.buildConstant(MRI.getType(Src2Reg), *C1 + *C2);
Observer.changingInstr(MI);
MI.getOperand(1).setReg(LHSSrc1);
MI.getOperand(2).setReg(NewCst.getReg(0));
+ MI.setFlags(Flags);
Observer.changedInstr(MI);
};
return !reassociationCanBreakAddressingModePattern(MI);
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index bbfae57..8424a81 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2209,7 +2209,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
unsigned Op = ID == Intrinsic::lifetime_start ? TargetOpcode::LIFETIME_START
: TargetOpcode::LIFETIME_END;
- const AllocaInst *AI = dyn_cast<AllocaInst>(CI.getArgOperand(1));
+ const AllocaInst *AI = dyn_cast<AllocaInst>(CI.getArgOperand(0));
if (!AI || !AI->isStaticAlloca())
return true;
@@ -2522,6 +2522,9 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
Opc = ID == Intrinsic::vector_reduce_fadd
? TargetOpcode::G_VECREDUCE_SEQ_FADD
: TargetOpcode::G_VECREDUCE_SEQ_FMUL;
+ if (!MRI->getType(VecSrc).isVector())
+ Opc = ID == Intrinsic::vector_reduce_fadd ? TargetOpcode::G_FADD
+ : TargetOpcode::G_FMUL;
MIRBuilder.buildInstr(Opc, {Dst}, {ScalarSrc, VecSrc},
MachineInstr::copyFlagsFromInstruction(CI));
return true;
@@ -2556,6 +2559,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
getOrCreateVReg(*ConstantInt::getTrue(CI.getType())));
return true;
case Intrinsic::amdgcn_cs_chain:
+ case Intrinsic::amdgcn_call_whole_wave:
return translateCallBase(CI, MIRBuilder);
case Intrinsic::fptrunc_round: {
uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);
@@ -2786,11 +2790,14 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
if (CI.isInlineAsm())
return translateInlineAsm(CI, MIRBuilder);
- diagnoseDontCall(CI);
-
Intrinsic::ID ID = F ? F->getIntrinsicID() : Intrinsic::not_intrinsic;
- if (!F || ID == Intrinsic::not_intrinsic)
- return translateCallBase(CI, MIRBuilder);
+ if (!F || ID == Intrinsic::not_intrinsic) {
+ if (translateCallBase(CI, MIRBuilder)) {
+ diagnoseDontCall(CI);
+ return true;
+ }
+ return false;
+ }
assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic");
@@ -3513,7 +3520,7 @@ void IRTranslator::finishPendingPhis() {
Verifier.setCurrentInst(PI);
#endif // ifndef NDEBUG
- SmallSet<const MachineBasicBlock *, 16> SeenPreds;
+ SmallPtrSet<const MachineBasicBlock *, 16> SeenPreds;
for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) {
auto IRPred = PI->getIncomingBlock(i);
ArrayRef<Register> ValRegs = getOrCreateVRegs(*PI->getIncomingValue(i));
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index d9d3569..008c188 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -5574,12 +5574,19 @@ LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx,
unsigned NewElemCount =
NarrowTy.getSizeInBits() / SrcTy.getScalarSizeInBits();
- LLT SrcNarrowTy = LLT::fixed_vector(NewElemCount, SrcTy.getElementType());
-
- // Split the Src and Dst Reg into smaller registers
SmallVector<Register> SrcVRegs, BitcastVRegs;
- if (extractGCDType(SrcVRegs, DstTy, SrcNarrowTy, SrcReg) != SrcNarrowTy)
- return UnableToLegalize;
+ if (NewElemCount == 1) {
+ LLT SrcNarrowTy = SrcTy.getElementType();
+
+ auto Unmerge = MIRBuilder.buildUnmerge(SrcNarrowTy, SrcReg);
+ getUnmergeResults(SrcVRegs, *Unmerge);
+ } else {
+ LLT SrcNarrowTy = LLT::fixed_vector(NewElemCount, SrcTy.getElementType());
+
+ // Split the Src and Dst Reg into smaller registers
+ if (extractGCDType(SrcVRegs, DstTy, SrcNarrowTy, SrcReg) != SrcNarrowTy)
+ return UnableToLegalize;
+ }
// Build new smaller bitcast instructions
// Not supporting Leftover types for now but will have to
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 8955dd0..e41fd81 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -1869,8 +1869,10 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI,
case TargetOpcode::G_FSHR:
case TargetOpcode::G_SMAX:
case TargetOpcode::G_SMIN:
+ case TargetOpcode::G_SCMP:
case TargetOpcode::G_UMAX:
case TargetOpcode::G_UMIN:
+ case TargetOpcode::G_UCMP:
case TargetOpcode::G_PTRMASK:
case TargetOpcode::G_SADDO:
case TargetOpcode::G_SSUBO:
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 5e50898..93f6e39 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -268,13 +268,16 @@ static Value *getMaskOperand(IntrinsicInst *II) {
}
}
-// Return the corresponded deinterleaved mask, or nullptr if there is no valid
-// mask.
-static Value *getMask(Value *WideMask, unsigned Factor,
- ElementCount LeafValueEC);
-
-static Value *getMask(Value *WideMask, unsigned Factor,
- VectorType *LeafValueTy) {
+// Return a pair of
+// (1) The corresponding deinterleaved mask, or nullptr if there is no valid
+// mask.
+// (2) A gap mask: some wide masks effectively skip certain fields, and the
+// inactive lanes of this element mark the fields that are skipped (i.e.
+// "gaps").
+static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
+ ElementCount LeafValueEC);
+
+static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
+ VectorType *LeafValueTy) {
return getMask(WideMask, Factor, LeafValueTy->getElementCount());
}
@@ -379,22 +382,25 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
Value *Mask = nullptr;
+ auto GapMask = APInt::getAllOnes(Factor);
if (LI) {
LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
} else {
// Check mask operand. Handle both all-true/false and interleaved mask.
- Mask = getMask(getMaskOperand(II), Factor, VecTy);
+ std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, VecTy);
if (!Mask)
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: "
<< *Load << "\n");
+ LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
+ << " and actual factor " << GapMask.popcount() << "\n");
}
// Try to create target specific intrinsics to replace the load and
// shuffles.
if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles,
- Indices, Factor))
+ Indices, Factor, GapMask))
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
@@ -536,10 +542,15 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
} else {
// Check mask operand. Handle both all-true/false and interleaved mask.
unsigned LaneMaskLen = NumStoredElements / Factor;
- Mask = getMask(getMaskOperand(II), Factor,
- ElementCount::getFixed(LaneMaskLen));
+ APInt GapMask(Factor, 0);
+ std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor,
+ ElementCount::getFixed(LaneMaskLen));
if (!Mask)
return false;
+      // Gap masks are not yet supported for stores. However, the IR may
+      // already have been changed at this point, hence we return true here.
+ if (GapMask.popcount() != Factor)
+ return true;
LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
<< *Store << "\n");
@@ -556,34 +567,97 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
return true;
}
-static Value *getMask(Value *WideMask, unsigned Factor,
- ElementCount LeafValueEC) {
+// A wide mask <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0> could be used to skip the
+// last field in a factor-of-three interleaved store or deinterleaved load (in
+// which case LeafMaskLen is 4). Such a (wide) mask is also known as a gap
+// mask. This helper function tries to detect this pattern and clears the
+// corresponding bits in GapMask, so its popcount gives the factor actually
+// being accessed -- 2 in this example.
+static void getGapMask(const Constant &MaskConst, unsigned Factor,
+ unsigned LeafMaskLen, APInt &GapMask) {
+ assert(GapMask.getBitWidth() == Factor);
+ for (unsigned F = 0U; F < Factor; ++F) {
+ bool AllZero = true;
+ for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) {
+ Constant *C = MaskConst.getAggregateElement(F + Idx * Factor);
+ if (!C->isZeroValue()) {
+ AllZero = false;
+ break;
+ }
+ }
+ // All mask bits on this field are zero, skipping it.
+ if (AllZero)
+ GapMask.clearBit(F);
+ }
+}
+
+static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
+ ElementCount LeafValueEC) {
+ auto GapMask = APInt::getAllOnes(Factor);
+
if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID());
- F && F == Factor && llvm::all_equal(IMI->args())) {
- return IMI->getArgOperand(0);
+ F && F == Factor) {
+ Value *RefArg = nullptr;
+ // Check if all the intrinsic arguments are the same, except those that
+ // are zeros, which we mark as gaps in the gap mask.
+ for (auto [Idx, Arg] : enumerate(IMI->args())) {
+ if (auto *C = dyn_cast<Constant>(Arg); C && C->isZeroValue()) {
+ GapMask.clearBit(Idx);
+ continue;
+ }
+
+ if (!RefArg)
+ RefArg = Arg;
+ else if (RefArg != Arg)
+ return {nullptr, GapMask};
+ }
+
+      // In the very rare case that all the intrinsic arguments are zeros, we
+      // still want to return an all-zeros constant instead of nullptr.
+ return {RefArg ? RefArg : IMI->getArgOperand(0), GapMask};
}
}
+ // Masks that are assembled from bitwise AND.
+ if (auto *AndOp = dyn_cast<BinaryOperator>(WideMask);
+ AndOp && AndOp->getOpcode() == Instruction::And) {
+ auto [MaskLHS, GapMaskLHS] =
+ getMask(AndOp->getOperand(0), Factor, LeafValueEC);
+ auto [MaskRHS, GapMaskRHS] =
+ getMask(AndOp->getOperand(1), Factor, LeafValueEC);
+ if (!MaskLHS || !MaskRHS)
+ return {nullptr, GapMask};
+ // Using IRBuilder here so that any trivial constants could be folded right
+ // away.
+ return {IRBuilder<>(AndOp).CreateAnd(MaskLHS, MaskRHS),
+ GapMaskLHS & GapMaskRHS};
+ }
+
if (auto *ConstMask = dyn_cast<Constant>(WideMask)) {
if (auto *Splat = ConstMask->getSplatValue())
// All-ones or all-zeros mask.
- return ConstantVector::getSplat(LeafValueEC, Splat);
+ return {ConstantVector::getSplat(LeafValueEC, Splat), GapMask};
if (LeafValueEC.isFixed()) {
unsigned LeafMaskLen = LeafValueEC.getFixedValue();
+ // First, check if we use a gap mask to skip some of the factors / fields.
+ getGapMask(*ConstMask, Factor, LeafMaskLen, GapMask);
+
SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr);
// If this is a fixed-length constant mask, each lane / leaf has to
// use the same mask. This is done by checking if every group with Factor
// number of elements in the interleaved mask has homogeneous values.
for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) {
+ if (!GapMask[Idx % Factor])
+ continue;
Constant *C = ConstMask->getAggregateElement(Idx);
if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C)
- return nullptr;
+ return {nullptr, GapMask};
LeafMask[Idx / Factor] = C;
}
- return ConstantVector::get(LeafMask);
+ return {ConstantVector::get(LeafMask), GapMask};
}
}
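
As a sanity check on the gap-mask logic above, here is a standalone restatement of the detection on a plain bool vector; the names and the bitset encoding are mine, not the patch's:

#include <cstdint>
#include <vector>

// Clears the bit of every field whose lanes are all inactive in the wide mask.
uint64_t computeGapMask(const std::vector<bool> &WideMask, unsigned Factor) {
  unsigned LeafMaskLen = WideMask.size() / Factor;
  uint64_t GapMask = (uint64_t(1) << Factor) - 1; // all fields active
  for (unsigned F = 0; F < Factor; ++F) {
    bool AllZero = true;
    for (unsigned Idx = 0; Idx < LeafMaskLen; ++Idx)
      AllZero &= !WideMask[F + Idx * Factor];
    if (AllZero)
      GapMask &= ~(uint64_t(1) << F); // field F is a gap
  }
  return GapMask;
}

// With Factor = 3 and the wide mask <1,1,0, 1,1,0, 1,1,0, 1,1,0> this returns
// 0b011: the third field is skipped, so the effective factor is 2.
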
@@ -603,12 +677,13 @@ static Value *getMask(Value *WideMask, unsigned Factor,
auto *LeafMaskTy =
VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC);
IRBuilder<> Builder(SVI);
- return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0),
- uint64_t(0));
+ return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0),
+ uint64_t(0)),
+ GapMask};
}
}
- return nullptr;
+ return {nullptr, GapMask};
}
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
@@ -639,9 +714,16 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
return false;
// Check mask operand. Handle both all-true/false and interleaved mask.
- Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
+ APInt GapMask(Factor, 0);
+ std::tie(Mask, GapMask) =
+ getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
if (!Mask)
return false;
+  // Gap masks are not yet supported when deinterleaving via intrinsics.
+  // However, the IR may already have been changed at this point, hence we
+  // return true here.
+ if (GapMask.popcount() != Factor)
+ return true;
LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave"
<< " intrinsic " << *DI << " and factor = "
@@ -680,10 +762,16 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
II->getIntrinsicID() != Intrinsic::vp_store)
return false;
// Check mask operand. Handle both all-true/false and interleaved mask.
- Mask = getMask(getMaskOperand(II), Factor,
- cast<VectorType>(InterleaveValues[0]->getType()));
+ APInt GapMask(Factor, 0);
+ std::tie(Mask, GapMask) =
+ getMask(getMaskOperand(II), Factor,
+ cast<VectorType>(InterleaveValues[0]->getType()));
if (!Mask)
return false;
+  // Gap masks are not yet supported when interleaving via intrinsics.
+  // However, the IR may already have been changed at this point, hence we
+  // return true here.
+ if (GapMask.popcount() != Factor)
+ return true;
LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave"
<< " intrinsic " << *IntII << " and factor = "
diff --git a/llvm/lib/CodeGen/LiveVariables.cpp b/llvm/lib/CodeGen/LiveVariables.cpp
index 1f23418..c5dfdda 100644
--- a/llvm/lib/CodeGen/LiveVariables.cpp
+++ b/llvm/lib/CodeGen/LiveVariables.cpp
@@ -213,11 +213,7 @@ void LiveVariables::HandleVirtRegDef(Register Reg, MachineInstr &MI) {
}
/// FindLastPartialDef - Return the last partial def of the specified register.
-/// Also returns the sub-registers that're defined by the instruction.
-MachineInstr *
-LiveVariables::FindLastPartialDef(Register Reg,
- SmallSet<Register, 4> &PartDefRegs) {
- Register LastDefReg = 0;
+MachineInstr *LiveVariables::FindLastPartialDef(Register Reg) {
unsigned LastDefDist = 0;
MachineInstr *LastDef = nullptr;
for (MCPhysReg SubReg : TRI->subregs(Reg)) {
@@ -226,7 +222,6 @@ LiveVariables::FindLastPartialDef(Register Reg,
continue;
unsigned Dist = DistanceMap[Def];
if (Dist > LastDefDist) {
- LastDefReg = SubReg;
LastDef = Def;
LastDefDist = Dist;
}
@@ -235,14 +230,6 @@ LiveVariables::FindLastPartialDef(Register Reg,
if (!LastDef)
return nullptr;
- PartDefRegs.insert(LastDefReg);
- for (MachineOperand &MO : LastDef->all_defs()) {
- if (MO.getReg() == 0)
- continue;
- Register DefReg = MO.getReg();
- if (TRI->isSubRegister(Reg, DefReg))
- PartDefRegs.insert_range(TRI->subregs_inclusive(DefReg));
- }
return LastDef;
}
@@ -261,27 +248,11 @@ void LiveVariables::HandlePhysRegUse(Register Reg, MachineInstr &MI) {
// ...
// = EAX
// All of the sub-registers must have been defined before the use of Reg!
- SmallSet<Register, 4> PartDefRegs;
- MachineInstr *LastPartialDef = FindLastPartialDef(Reg, PartDefRegs);
+ MachineInstr *LastPartialDef = FindLastPartialDef(Reg);
// If LastPartialDef is NULL, it must be using a livein register.
if (LastPartialDef) {
- LastPartialDef->addOperand(MachineOperand::CreateReg(Reg, true/*IsDef*/,
- true/*IsImp*/));
- PhysRegDef[Reg.id()] = LastPartialDef;
- SmallSet<MCPhysReg, 8> Processed;
- for (MCPhysReg SubReg : TRI->subregs(Reg)) {
- if (Processed.count(SubReg))
- continue;
- if (PartDefRegs.count(SubReg))
- continue;
- // This part of Reg was defined before the last partial def. It's killed
- // here.
- LastPartialDef->addOperand(MachineOperand::CreateReg(SubReg,
- false/*IsDef*/,
- true/*IsImp*/));
- PhysRegDef[SubReg] = LastPartialDef;
- Processed.insert_range(TRI->subregs(SubReg));
- }
+ LastPartialDef->addOperand(
+ MachineOperand::CreateReg(Reg, /*IsDef=*/true, /*IsImp=*/true));
}
} else if (LastDef && !PhysRegUse[Reg.id()] &&
!LastDef->findRegisterDefOperand(Reg, /*TRI=*/nullptr))
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 3e99e57..bb70e78 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -124,6 +124,11 @@ public:
bool initializeFrameInfo(PerFunctionMIParsingState &PFS,
const yaml::MachineFunction &YamlMF);
+ bool initializeSaveRestorePoints(
+ PerFunctionMIParsingState &PFS,
+ const std::vector<yaml::SaveRestorePointEntry> &YamlSRPoints,
+ SmallVectorImpl<MachineBasicBlock *> &SaveRestorePoints);
+
bool initializeCallSiteInfo(PerFunctionMIParsingState &PFS,
const yaml::MachineFunction &YamlMF);
@@ -529,7 +534,7 @@ void MIRParserImpl::setupDebugValueTracking(
unsigned MaxInstrNum = 0;
for (auto &MBB : MF)
for (auto &MI : MBB)
- MaxInstrNum = std::max((unsigned)MI.peekDebugInstrNum(), MaxInstrNum);
+ MaxInstrNum = std::max(MI.peekDebugInstrNum(), MaxInstrNum);
MF.setDebugInstrNumberingCount(MaxInstrNum);
// Load any substitutions.
@@ -867,18 +872,14 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS,
MFI.setHasTailCall(YamlMFI.HasTailCall);
MFI.setCalleeSavedInfoValid(YamlMFI.IsCalleeSavedInfoValid);
MFI.setLocalFrameSize(YamlMFI.LocalFrameSize);
- if (!YamlMFI.SavePoint.Value.empty()) {
- MachineBasicBlock *MBB = nullptr;
- if (parseMBBReference(PFS, MBB, YamlMFI.SavePoint))
- return true;
- MFI.setSavePoint(MBB);
- }
- if (!YamlMFI.RestorePoint.Value.empty()) {
- MachineBasicBlock *MBB = nullptr;
- if (parseMBBReference(PFS, MBB, YamlMFI.RestorePoint))
- return true;
- MFI.setRestorePoint(MBB);
- }
+ SmallVector<MachineBasicBlock *, 4> SavePoints;
+ if (initializeSaveRestorePoints(PFS, YamlMFI.SavePoints, SavePoints))
+ return true;
+ MFI.setSavePoints(SavePoints);
+ SmallVector<MachineBasicBlock *, 4> RestorePoints;
+ if (initializeSaveRestorePoints(PFS, YamlMFI.RestorePoints, RestorePoints))
+ return true;
+ MFI.setRestorePoints(RestorePoints);
std::vector<CalleeSavedInfo> CSIInfo;
// Initialize the fixed frame objects.
@@ -1093,6 +1094,21 @@ bool MIRParserImpl::initializeConstantPool(PerFunctionMIParsingState &PFS,
return false;
}
+// Return true if a basic block reference was incorrectly specified in the MIR.
+bool MIRParserImpl::initializeSaveRestorePoints(
+ PerFunctionMIParsingState &PFS,
+ const std::vector<yaml::SaveRestorePointEntry> &YamlSRPoints,
+ SmallVectorImpl<MachineBasicBlock *> &SaveRestorePoints) {
+ MachineBasicBlock *MBB = nullptr;
+ for (const yaml::SaveRestorePointEntry &Entry : YamlSRPoints) {
+ if (parseMBBReference(PFS, MBB, Entry.Point.Value))
+ return true;
+ SaveRestorePoints.push_back(MBB);
+ }
+
+ return false;
+}
+
bool MIRParserImpl::initializeJumpTableInfo(PerFunctionMIParsingState &PFS,
const yaml::MachineJumpTable &YamlJTI) {
MachineJumpTableInfo *JTI = PFS.MF.getOrCreateJumpTableInfo(YamlJTI.Kind);
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index ce1834a..7cc9192 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -150,6 +150,10 @@ static void convertMJTI(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI,
const MachineJumpTableInfo &JTI);
static void convertMFI(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI,
const MachineFrameInfo &MFI);
+static void
+convertSRPoints(ModuleSlotTracker &MST,
+ std::vector<yaml::SaveRestorePointEntry> &YamlSRPoints,
+ ArrayRef<MachineBasicBlock *> SaveRestorePoints);
static void convertStackObjects(yaml::MachineFunction &YMF,
const MachineFunction &MF,
ModuleSlotTracker &MST, MFPrintState &State);
@@ -355,14 +359,10 @@ static void convertMFI(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI,
YamlMFI.HasTailCall = MFI.hasTailCall();
YamlMFI.IsCalleeSavedInfoValid = MFI.isCalleeSavedInfoValid();
YamlMFI.LocalFrameSize = MFI.getLocalFrameSize();
- if (MFI.getSavePoint()) {
- raw_string_ostream StrOS(YamlMFI.SavePoint.Value);
- StrOS << printMBBReference(*MFI.getSavePoint());
- }
- if (MFI.getRestorePoint()) {
- raw_string_ostream StrOS(YamlMFI.RestorePoint.Value);
- StrOS << printMBBReference(*MFI.getRestorePoint());
- }
+ if (!MFI.getSavePoints().empty())
+ convertSRPoints(MST, YamlMFI.SavePoints, MFI.getSavePoints());
+ if (!MFI.getRestorePoints().empty())
+ convertSRPoints(MST, YamlMFI.RestorePoints, MFI.getRestorePoints());
}
static void convertEntryValueObjects(yaml::MachineFunction &YMF,
@@ -616,6 +616,21 @@ static void convertMCP(yaml::MachineFunction &MF,
}
}
+static void
+convertSRPoints(ModuleSlotTracker &MST,
+ std::vector<yaml::SaveRestorePointEntry> &YamlSRPoints,
+ ArrayRef<MachineBasicBlock *> SRPoints) {
+  for (const MachineBasicBlock *MBB : SRPoints) {
+    SmallString<16> Str;
+    raw_svector_ostream StrOS(Str);
+    StrOS << printMBBReference(*MBB);
+    yaml::SaveRestorePointEntry Entry;
+    Entry.Point = StrOS.str().str();
+    YamlSRPoints.push_back(Entry);
+  }
+}
+
static void convertMJTI(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI,
const MachineJumpTableInfo &JTI) {
YamlJTI.Kind = JTI.getEntryKind();
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 742de11..e359831 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -490,7 +490,7 @@ private:
SmallSetVector<MachineInstr *, 8> MaybeDeadCopies;
/// Multimap tracking debug users in current BB
- DenseMap<MachineInstr *, SmallSet<MachineInstr *, 2>> CopyDbgUsers;
+ DenseMap<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> CopyDbgUsers;
CopyTracker Tracker;
diff --git a/llvm/lib/CodeGen/MachineDebugify.cpp b/llvm/lib/CodeGen/MachineDebugify.cpp
index 1a20fe5..307f494 100644
--- a/llvm/lib/CodeGen/MachineDebugify.cpp
+++ b/llvm/lib/CodeGen/MachineDebugify.cpp
@@ -87,7 +87,7 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI,
// Do this by introducing debug uses of each register definition. If that is
// not possible (e.g. we have a phi or a meta instruction), emit a constant.
uint64_t NextImm = 0;
- SmallSet<DILocalVariable *, 16> VarSet;
+ SmallPtrSet<DILocalVariable *, 16> VarSet;
const MCInstrDesc &DbgValDesc = TII.get(TargetOpcode::DBG_VALUE);
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator FirstNonPHIIt = MBB.getFirstNonPHI();
diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp
index e4b9938..a8306b2 100644
--- a/llvm/lib/CodeGen/MachineFrameInfo.cpp
+++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -244,6 +244,22 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
}
OS << "\n";
}
+  OS << "save/restore points:\n";
+
+  if (!SavePoints.empty()) {
+    OS << "save points:\n";
+    for (const MachineBasicBlock *MBB : SavePoints)
+      OS << printMBBReference(*MBB) << "\n";
+  } else
+    OS << "save points are empty\n";
+
+  if (!RestorePoints.empty()) {
+    OS << "restore points:\n";
+    for (const MachineBasicBlock *MBB : RestorePoints)
+      OS << printMBBReference(*MBB) << "\n";
+  } else
+    OS << "restore points are empty\n";
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index ec40f6a..82ba596 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -154,17 +154,17 @@ void ilist_alloc_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) {
MBB->getParent()->deleteMachineBasicBlock(MBB);
}
-static inline Align getFnStackAlignment(const TargetSubtargetInfo *STI,
- const Function &F) {
+static inline Align getFnStackAlignment(const TargetSubtargetInfo &STI,
+ const Function &F) {
if (auto MA = F.getFnStackAlign())
return *MA;
- return STI->getFrameLowering()->getStackAlign();
+ return STI.getFrameLowering()->getStackAlign();
}
MachineFunction::MachineFunction(Function &F, const TargetMachine &Target,
const TargetSubtargetInfo &STI, MCContext &Ctx,
unsigned FunctionNum)
- : F(F), Target(Target), STI(&STI), Ctx(Ctx) {
+ : F(F), Target(Target), STI(STI), Ctx(Ctx) {
FunctionNumber = FunctionNum;
init();
}
@@ -195,7 +195,7 @@ void MachineFunction::init() {
// We can realign the stack if the target supports it and the user hasn't
// explicitly asked us not to.
- bool CanRealignSP = STI->getFrameLowering()->isStackRealignable() &&
+ bool CanRealignSP = STI.getFrameLowering()->isStackRealignable() &&
!F.hasFnAttribute("no-realign-stack");
bool ForceRealignSP = F.hasFnAttribute(Attribute::StackAlignment) ||
F.hasFnAttribute("stackrealign");
@@ -209,11 +209,11 @@ void MachineFunction::init() {
FrameInfo->ensureMaxAlignment(*F.getFnStackAlign());
ConstantPool = new (Allocator) MachineConstantPool(getDataLayout());
- Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
+ Alignment = STI.getTargetLowering()->getMinFunctionAlignment();
if (!F.getAlign() && !F.hasOptSize())
Alignment = std::max(Alignment,
- STI->getTargetLowering()->getPrefFunctionAlignment());
+ STI.getTargetLowering()->getPrefFunctionAlignment());
// -fsanitize=function and -fsanitize=kcfi instrument indirect function calls
// to load a type hash before the function label. Ensure functions are aligned
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index 4da0184..d9e8484 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -94,6 +94,22 @@ static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI,
return DebugLoc();
}
+/// Return true if \p Reg is fully covered by the given local-def sets:
+/// LocalDefsV holds locally defined virtual registers, and LocalDefsP marks
+/// locally defined physical registers by register unit (one bit per RegUnit).
+static bool containsReg(const SmallSetVector<Register, 32> &LocalDefsV,
+                        const BitVector &LocalDefsP, Register Reg,
+                        const TargetRegisterInfo *TRI) {
+ if (Reg.isPhysical()) {
+ for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
+ if (!LocalDefsP[Unit])
+ return false;
+
+ return true;
+ }
+ return LocalDefsV.contains(Reg);
+}
+
/// finalizeBundle - Finalize a machine instruction bundle which includes
/// a sequence of instructions starting from FirstMI to LastMI (exclusive).
/// This routine adds a BUNDLE instruction to represent the bundle, it adds
@@ -115,6 +131,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
Bundle.prepend(MIB);
SmallSetVector<Register, 32> LocalDefs;
+ BitVector LocalDefsP(TRI->getNumRegUnits());
SmallSet<Register, 8> DeadDefSet;
SmallSet<Register, 16> KilledDefSet;
SmallSetVector<Register, 8> ExternUses;
@@ -130,7 +147,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
if (!Reg)
continue;
- if (LocalDefs.contains(Reg)) {
+ if (containsReg(LocalDefs, LocalDefsP, Reg, TRI)) {
MO.setIsInternalRead();
if (MO.isKill()) {
// Internal def is now killed.
@@ -165,8 +182,10 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
}
}
- if (!MO.isDead() && Reg.isPhysical())
- LocalDefs.insert_range(TRI->subregs(Reg));
+ if (!MO.isDead() && Reg.isPhysical()) {
+ for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
+ LocalDefsP.set(Unit);
+ }
}
// Set FrameSetup/FrameDestroy for the bundle. If any of the instructions
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 90005bd..3a9651c 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -3466,9 +3466,9 @@ bool SMSchedule::onlyHasLoopCarriedOutputOrOrderPreds(
}
/// Determine transitive dependences of unpipelineable instructions
-SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
+SmallPtrSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
- SmallSet<SUnit *, 8> DoNotPipeline;
+ SmallPtrSet<SUnit *, 8> DoNotPipeline;
SmallVector<SUnit *, 8> Worklist;
for (auto &SU : SSD->SUnits)
@@ -3498,7 +3498,7 @@ SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
// and ensure that they are in stage 0. If unable to do so, return false.
bool SMSchedule::normalizeNonPipelinedInstructions(
SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
- SmallSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);
+ SmallPtrSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);
int NewLastCycle = INT_MIN;
for (SUnit &SU : SSD->SUnits) {
diff --git a/llvm/lib/CodeGen/MacroFusion.cpp b/llvm/lib/CodeGen/MacroFusion.cpp
index 975a3fe..1db5301 100644
--- a/llvm/lib/CodeGen/MacroFusion.cpp
+++ b/llvm/lib/CodeGen/MacroFusion.cpp
@@ -79,7 +79,7 @@ bool llvm::fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU,
FirstSU.ParentClusterIdx = Clusters.size();
SecondSU.ParentClusterIdx = Clusters.size();
- SmallSet<SUnit *, 8> Cluster{{&FirstSU, &SecondSU}};
+ SmallPtrSet<SUnit *, 8> Cluster{{&FirstSU, &SecondSU}};
Clusters.push_back(Cluster);
// TODO - If we want to chain more than two instructions, we need to create
diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp
index a93a89e..34a9d5d 100644
--- a/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/llvm/lib/CodeGen/PHIElimination.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -72,6 +73,7 @@ class PHIEliminationImpl {
LiveIntervals *LIS = nullptr;
MachineLoopInfo *MLI = nullptr;
MachineDominatorTree *MDT = nullptr;
+ MachinePostDominatorTree *PDT = nullptr;
/// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions
/// in predecessor basic blocks.
@@ -123,17 +125,22 @@ public:
auto *MLIWrapper = P->getAnalysisIfAvailable<MachineLoopInfoWrapperPass>();
auto *MDTWrapper =
P->getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+ auto *PDTWrapper =
+ P->getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
LV = LVWrapper ? &LVWrapper->getLV() : nullptr;
LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
MLI = MLIWrapper ? &MLIWrapper->getLI() : nullptr;
MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
+ PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
}
PHIEliminationImpl(MachineFunction &MF, MachineFunctionAnalysisManager &AM)
: LV(AM.getCachedResult<LiveVariablesAnalysis>(MF)),
LIS(AM.getCachedResult<LiveIntervalsAnalysis>(MF)),
MLI(AM.getCachedResult<MachineLoopAnalysis>(MF)),
- MDT(AM.getCachedResult<MachineDominatorTreeAnalysis>(MF)), MFAM(&AM) {}
+ MDT(AM.getCachedResult<MachineDominatorTreeAnalysis>(MF)),
+ PDT(AM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF)),
+ MFAM(&AM) {}
bool run(MachineFunction &MF);
};
@@ -172,6 +179,7 @@ PHIEliminationPass::run(MachineFunction &MF,
PA.preserve<LiveVariablesAnalysis>();
PA.preserve<SlotIndexesAnalysis>();
PA.preserve<MachineDominatorTreeAnalysis>();
+ PA.preserve<MachinePostDominatorTreeAnalysis>();
PA.preserve<MachineLoopAnalysis>();
return PA;
}
@@ -197,6 +205,7 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<SlotIndexesWrapperPass>();
AU.addPreserved<LiveIntervalsWrapperPass>();
AU.addPreserved<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
AU.addPreserved<MachineLoopInfoWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -204,15 +213,8 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
bool PHIEliminationImpl::run(MachineFunction &MF) {
MRI = &MF.getRegInfo();
- MachineDominatorTree *MDT = nullptr;
- if (P) {
- auto *MDTWrapper =
- P->getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
- MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
- } else {
- MDT = MFAM->getCachedResult<MachineDominatorTreeAnalysis>(MF);
- }
- MachineDomTreeUpdater MDTU(MDT, MachineDomTreeUpdater::UpdateStrategy::Lazy);
+ MachineDomTreeUpdater MDTU(MDT, PDT,
+ MachineDomTreeUpdater::UpdateStrategy::Lazy);
bool Changed = false;
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 8de2c48..96c9cde6 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -145,7 +145,7 @@ static bool lowerObjCCall(Function &F, RTLIB::LibcallImpl NewFn,
// FIXME: When RuntimeLibcalls is an analysis, check if the function is really
// supported, and go through RTLIB::Libcall.
- const char *NewFnName = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(NewFn);
+ StringRef NewFnName = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(NewFn);
// If we haven't already looked up this function, check to see if the
// program already contains a function with this name.
@@ -587,12 +587,14 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
break;
case Intrinsic::exp:
case Intrinsic::exp2:
+ case Intrinsic::log:
Changed |= forEachCall(F, [&](CallInst *CI) {
Type *Ty = CI->getArgOperand(0)->getType();
if (!isa<ScalableVectorType>(Ty))
return false;
const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering();
unsigned Op = TL->IntrinsicIDToISD(F.getIntrinsicID());
+ assert(Op != ISD::DELETED_NODE && "unsupported intrinsic");
if (!TL->isOperationExpand(Op, EVT::getEVT(Ty)))
return false;
return lowerUnaryVectorIntrinsicAsLoop(M, CI);
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index f66f546..8fc0748 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -351,8 +351,8 @@ bool PEIImpl::run(MachineFunction &MF) {
delete RS;
SaveBlocks.clear();
RestoreBlocks.clear();
- MFI.setSavePoint(nullptr);
- MFI.setRestorePoint(nullptr);
+ MFI.setSavePoints({});
+ MFI.setRestorePoints({});
return true;
}
@@ -423,16 +423,18 @@ void PEIImpl::calculateCallFrameInfo(MachineFunction &MF) {
/// callee-saved registers, and placing prolog and epilog code.
void PEIImpl::calculateSaveRestoreBlocks(MachineFunction &MF) {
const MachineFrameInfo &MFI = MF.getFrameInfo();
-
// Even when we do not change any CSR, we still want to insert the
// prologue and epilogue of the function.
// So set the save points for those.
// Use the points found by shrink-wrapping, if any.
- if (MFI.getSavePoint()) {
- SaveBlocks.push_back(MFI.getSavePoint());
- assert(MFI.getRestorePoint() && "Both restore and save must be set");
- MachineBasicBlock *RestoreBlock = MFI.getRestorePoint();
+ if (!MFI.getSavePoints().empty()) {
+ assert(MFI.getSavePoints().size() == 1 &&
+ "Multiple save points are not yet supported!");
+ SaveBlocks.push_back(MFI.getSavePoints().front());
+ assert(MFI.getRestorePoints().size() == 1 &&
+ "Multiple restore points are not yet supported!");
+ MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front();
// If RestoreBlock does not have any successor and is not a return block
// then the end point is unreachable and we do not need to insert any
// epilogue.
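For context on the surrounding hunks: MachineFrameInfo now carries lists of save/restore points, but prolog/epilog insertion still assumes shrink-wrapping produced at most one of each, so it asserts on the list size and takes the front element. A minimal sketch of that pattern, using only the accessors visible in this patch (the helper name getSingleSavePoint is illustrative, not an LLVM API):

    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineFrameInfo.h"
    #include <cassert>

    using namespace llvm;

    // Collapse the list-valued API back to the old single-block view while
    // multiple save points remain unsupported.
    static MachineBasicBlock *getSingleSavePoint(const MachineFrameInfo &MFI) {
      assert(MFI.getSavePoints().size() < 2 &&
             "Multiple save points not yet supported!");
      return MFI.getSavePoints().empty() ? nullptr
                                         : MFI.getSavePoints().front();
    }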
@@ -558,7 +560,11 @@ static void updateLiveness(MachineFunction &MF) {
SmallPtrSet<MachineBasicBlock *, 8> Visited;
SmallVector<MachineBasicBlock *, 8> WorkList;
MachineBasicBlock *Entry = &MF.front();
- MachineBasicBlock *Save = MFI.getSavePoint();
+
+ assert(MFI.getSavePoints().size() < 2 &&
+ "Multiple save points not yet supported!");
+ MachineBasicBlock *Save =
+ MFI.getSavePoints().empty() ? nullptr : MFI.getSavePoints().front();
if (!Save)
Save = Entry;
@@ -569,7 +575,10 @@ static void updateLiveness(MachineFunction &MF) {
}
Visited.insert(Save);
- MachineBasicBlock *Restore = MFI.getRestorePoint();
+ assert(MFI.getRestorePoints().size() < 2 &&
+ "Multiple restore points not yet supported!");
+ MachineBasicBlock *Restore =
+ MFI.getRestorePoints().empty() ? nullptr : MFI.getRestorePoints().front();
if (Restore)
// By construction Restore cannot be visited, otherwise it
// means there exists a path to Restore that does not go
@@ -1550,7 +1559,7 @@ void PEIImpl::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF,
// If this instruction has a FrameIndex operand, we need to
// use that target machine register info object to eliminate
// it.
- TRI.eliminateFrameIndex(MI, SPAdj, i);
+ TRI.eliminateFrameIndex(MI, SPAdj, i, RS);
// Reset the iterator if we were at the beginning of the BB.
if (AtBeginning) {
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 66a206c..804480c 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -474,6 +474,13 @@ int RegAllocFastImpl::getStackSpaceFor(Register VirtReg) {
const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
unsigned Size = TRI->getSpillSize(RC);
Align Alignment = TRI->getSpillAlign(RC);
+
+ const MachineFunction &MF = MRI->getMF();
+ auto &ST = MF.getSubtarget();
+ Align CurrentAlign = ST.getFrameLowering()->getStackAlign();
+ if (Alignment > CurrentAlign && !TRI->canRealignStack(MF))
+ Alignment = CurrentAlign;
+
int FrameIdx = MFI->CreateSpillStackObject(Size, Alignment);
// Assign the slot.
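The block added above keeps a spill slot from requesting more alignment than the frame can actually provide: when the register class wants more than the target's stack alignment and the function may not realign its stack, the request is clamped. A standalone sketch of that decision, with illustrative names rather than LLVM API:

    #include <cstdint>

    // Pick the alignment actually used for a spill slot (both values in
    // bytes, powers of two). Mirrors the guard above in plain integers.
    static uint64_t chooseSpillAlign(uint64_t RequestedAlign,
                                     uint64_t StackAlign,
                                     bool CanRealignStack) {
      if (RequestedAlign > StackAlign && !CanRealignStack)
        return StackAlign;
      return RequestedAlign;
    }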
diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp
index ca51b67..5f37890 100644
--- a/llvm/lib/CodeGen/RegisterPressure.cpp
+++ b/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -1001,7 +1001,7 @@ static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
++CritIdx;
if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == i) {
- int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].getUnitInc();
+ int PDiff = (int)PNew - CriticalPSets[CritIdx].getUnitInc();
if (PDiff > 0) {
Delta.CriticalMax = PressureChange(i);
Delta.CriticalMax.setUnitInc(PDiff);
@@ -1191,7 +1191,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
++CritIdx;
if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {
- int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc();
+ int CritInc = (int)MNew - CriticalPSets[CritIdx].getUnitInc();
if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) {
Delta.CriticalMax = PressureChange(PSetID);
Delta.CriticalMax.setUnitInc(CritInc);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7341914..8446045 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -401,7 +401,7 @@ namespace {
SDValue PromoteExtend(SDValue Op);
bool PromoteLoad(SDValue Op);
- SDValue foldShiftToAvg(SDNode *N);
+ SDValue foldShiftToAvg(SDNode *N, const SDLoc &DL);
// Fold `a bitwiseop (~b +/- c)` -> `a bitwiseop ~(b -/+ c)`
SDValue foldBitwiseOpWithNeg(SDNode *N, const SDLoc &DL, EVT VT);
@@ -10983,7 +10983,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
if (SDValue NarrowLoad = reduceLoadWidth(N))
return NarrowLoad;
- if (SDValue AVG = foldShiftToAvg(N))
+ if (SDValue AVG = foldShiftToAvg(N, DL))
return AVG;
return SDValue();
@@ -11256,7 +11256,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
if (SDValue MULH = combineShiftToMULH(N, DL, DAG, TLI))
return MULH;
- if (SDValue AVG = foldShiftToAvg(N))
+ if (SDValue AVG = foldShiftToAvg(N, DL))
return AVG;
return SDValue();
@@ -11772,51 +11772,36 @@ static SDValue combineMinNumMaxNumImpl(const SDLoc &DL, EVT VT, SDValue LHS,
}
}
-SDValue DAGCombiner::foldShiftToAvg(SDNode *N) {
+// Convert (sr[al] (add n[su]w x, y)) -> (avgfloor[su] x, y)
+SDValue DAGCombiner::foldShiftToAvg(SDNode *N, const SDLoc &DL) {
const unsigned Opcode = N->getOpcode();
-
- // Convert (sr[al] (add n[su]w x, y)) -> (avgfloor[su] x, y)
if (Opcode != ISD::SRA && Opcode != ISD::SRL)
return SDValue();
- unsigned FloorISD = 0;
- auto VT = N->getValueType(0);
- bool IsUnsigned = false;
-
- // Decide wether signed or unsigned.
- switch (Opcode) {
- case ISD::SRA:
- if (!hasOperation(ISD::AVGFLOORS, VT))
- return SDValue();
- FloorISD = ISD::AVGFLOORS;
- break;
- case ISD::SRL:
- IsUnsigned = true;
- if (!hasOperation(ISD::AVGFLOORU, VT))
- return SDValue();
- FloorISD = ISD::AVGFLOORU;
- break;
- default:
- return SDValue();
- }
+ EVT VT = N->getValueType(0);
+ bool IsUnsigned = Opcode == ISD::SRL;
// Captured values.
SDValue A, B, Add;
// Match floor average as it is common to both floor/ceil avgs.
- if (!sd_match(N, m_BinOp(Opcode,
- m_AllOf(m_Value(Add), m_Add(m_Value(A), m_Value(B))),
- m_One())))
- return SDValue();
+ if (sd_match(N, m_BinOp(Opcode,
+ m_AllOf(m_Value(Add), m_Add(m_Value(A), m_Value(B))),
+ m_One()))) {
+ // Decide whether signed or unsigned.
+ unsigned FloorISD = IsUnsigned ? ISD::AVGFLOORU : ISD::AVGFLOORS;
+ if (!hasOperation(FloorISD, VT))
+ return SDValue();
- // Can't optimize adds that may wrap.
- if (IsUnsigned && !Add->getFlags().hasNoUnsignedWrap())
- return SDValue();
+ // Can't optimize adds that may wrap.
+ if ((IsUnsigned && !Add->getFlags().hasNoUnsignedWrap()) ||
+ (!IsUnsigned && !Add->getFlags().hasNoSignedWrap()))
+ return SDValue();
- if (!IsUnsigned && !Add->getFlags().hasNoSignedWrap())
- return SDValue();
+ return DAG.getNode(FloorISD, DL, N->getValueType(0), {A, B});
+ }
- return DAG.getNode(FloorISD, SDLoc(N), N->getValueType(0), {A, B});
+ return SDValue();
}
SDValue DAGCombiner::foldBitwiseOpWithNeg(SDNode *N, const SDLoc &DL, EVT VT) {
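A worked example of why foldShiftToAvg insists on the no-wrap flags: if the inner add may wrap, the shifted result and AVGFLOORU genuinely differ, so the combine would be a miscompile. A small standalone C++ check of the unsigned case (illustrative only, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t X = 200, Y = 100;
      // What the original i8 code computes when the add wraps:
      // (200 + 100) wraps to 44, and 44 >> 1 == 22.
      uint8_t Shifted = static_cast<uint8_t>(static_cast<uint8_t>(X + Y) >> 1);
      // What AVGFLOORU would compute: floor((200 + 100) / 2) == 150.
      uint8_t Avg = static_cast<uint8_t>((static_cast<unsigned>(X) + Y) >> 1);
      std::printf("shifted=%u avg=%u\n", static_cast<unsigned>(Shifted),
                  static_cast<unsigned>(Avg));
      return 0;
    }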
@@ -12843,22 +12828,21 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) {
SDLoc DL(HG);
EVT MemVT = HG->getMemoryVT();
+ EVT DataVT = Index.getValueType();
MachineMemOperand *MMO = HG->getMemOperand();
ISD::MemIndexType IndexType = HG->getIndexType();
if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
return Chain;
- SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index,
- HG->getScale(), HG->getIntID()};
- if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL))
+ if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL) ||
+ refineIndexType(Index, IndexType, DataVT, DAG)) {
+ SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index,
+ HG->getScale(), HG->getIntID()};
return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
MMO, IndexType);
+ }
- EVT DataVT = Index.getValueType();
- if (refineIndexType(Index, IndexType, DataVT, DAG))
- return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
- MMO, IndexType);
return SDValue();
}
@@ -16343,6 +16327,42 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
DAG, DL);
}
break;
+ case ISD::AVGFLOORS:
+ case ISD::AVGFLOORU:
+ case ISD::AVGCEILS:
+ case ISD::AVGCEILU:
+ case ISD::ABDS:
+ case ISD::ABDU:
+ // (trunc (avg a, b)) -> (avg (trunc a), (trunc b))
+ // (trunc (abdu/abds a, b)) -> (abdu/abds (trunc a), (trunc b))
+ if (!LegalOperations && N0.hasOneUse() &&
+ TLI.isOperationLegal(N0.getOpcode(), VT)) {
+ EVT TruncVT = VT;
+ unsigned SrcBits = SrcVT.getScalarSizeInBits();
+ unsigned TruncBits = TruncVT.getScalarSizeInBits();
+
+ SDValue A = N0.getOperand(0);
+ SDValue B = N0.getOperand(1);
+ bool CanFold = false;
+
+ if (N0.getOpcode() == ISD::AVGFLOORU || N0.getOpcode() == ISD::AVGCEILU ||
+ N0.getOpcode() == ISD::ABDU) {
+ APInt UpperBits = APInt::getBitsSetFrom(SrcBits, TruncBits);
+ CanFold = DAG.MaskedValueIsZero(B, UpperBits) &&
+ DAG.MaskedValueIsZero(A, UpperBits);
+ } else {
+ unsigned NeededBits = SrcBits - TruncBits;
+ CanFold = DAG.ComputeNumSignBits(B) > NeededBits &&
+ DAG.ComputeNumSignBits(A) > NeededBits;
+ }
+
+ if (CanFold) {
+ SDValue NewA = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, A);
+ SDValue NewB = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, B);
+ return DAG.getNode(N0.getOpcode(), DL, TruncVT, NewA, NewB);
+ }
+ }
+ break;
}
return SDValue();
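Similarly, the new TRUNCATE cases above only narrow an average or absolute difference when the result provably matches: the unsigned opcodes require the bits above the truncated width to be known zero in both operands, and the signed ones require more sign bits than the number of bits dropped. A compile-time counterexample for the unsigned guard (illustrative only):

    #include <cstdint>

    // AVGFLOORU over i16 truncated to i8, versus AVGFLOORU over already
    // truncated i8 operands.
    constexpr uint8_t truncOfAvg(uint16_t A, uint16_t B) {
      return static_cast<uint8_t>((unsigned(A) + B) / 2);
    }
    constexpr uint8_t avgOfTrunc(uint16_t A, uint16_t B) {
      return static_cast<uint8_t>((unsigned(uint8_t(A)) + uint8_t(B)) / 2);
    }

    // Safe when the bits above bit 7 are known zero in both operands...
    static_assert(truncOfAvg(0x0042, 0x0010) == avgOfTrunc(0x0042, 0x0010),
                  "upper bits clear: narrowing preserves the result");
    // ...but not in general: 0x0100 has a bit above the truncated width.
    static_assert(truncOfAvg(0x0100, 0x0000) != avgOfTrunc(0x0100, 0x0000),
                  "upper bit set: narrowing would change the result");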
@@ -25987,7 +26007,10 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
// Combine an extract of an extract into a single extract_subvector.
// ext (ext X, C), 0 --> ext X, C
if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) {
- if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
+ // The index has to be a multiple of the new result type's known minimum
+ // vector length.
+ if (V.getConstantOperandVal(1) % NVT.getVectorMinNumElements() == 0 &&
+ TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
V.getConstantOperandVal(1)) &&
TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) {
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, V.getOperand(0),
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index fb9eff9..9467ba1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -729,9 +729,7 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,
assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
- ArgListEntry Entry;
- Entry.Val = V;
- Entry.Ty = V->getType();
+ ArgListEntry Entry(V);
Entry.setAttributes(CI, ArgI);
Args.push_back(Entry);
}
@@ -978,9 +976,7 @@ bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol,
assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
- ArgListEntry Entry;
- Entry.Val = V;
- Entry.Ty = V->getType();
+ ArgListEntry Entry(V);
Entry.setAttributes(CI, ArgI);
Args.push_back(Entry);
}
@@ -1012,17 +1008,16 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
MVT RegisterVT = TLI.getRegisterType(CLI.RetTy->getContext(), VT);
unsigned NumRegs = TLI.getNumRegisters(CLI.RetTy->getContext(), VT);
for (unsigned i = 0; i != NumRegs; ++i) {
- ISD::InputArg MyFlags;
- MyFlags.VT = RegisterVT;
- MyFlags.ArgVT = VT;
- MyFlags.Used = CLI.IsReturnValueUsed;
+ ISD::ArgFlagsTy Flags;
if (CLI.RetSExt)
- MyFlags.Flags.setSExt();
+ Flags.setSExt();
if (CLI.RetZExt)
- MyFlags.Flags.setZExt();
+ Flags.setZExt();
if (CLI.IsInReg)
- MyFlags.Flags.setInReg();
- CLI.Ins.push_back(MyFlags);
+ Flags.setInReg();
+ ISD::InputArg Ret(Flags, RegisterVT, VT, CLI.RetTy, CLI.IsReturnValueUsed,
+ ISD::InputArg::NoArgIndex, 0);
+ CLI.Ins.push_back(Ret);
}
}
@@ -1117,7 +1112,6 @@ bool FastISel::lowerCall(const CallInst *CI) {
Type *RetTy = CI->getType();
ArgListTy Args;
- ArgListEntry Entry;
Args.reserve(CI->arg_size());
for (auto i = CI->arg_begin(), e = CI->arg_end(); i != e; ++i) {
@@ -1127,9 +1121,7 @@ bool FastISel::lowerCall(const CallInst *CI) {
if (V->getType()->isEmptyTy())
continue;
- Entry.Val = V;
- Entry.Ty = V->getType();
-
+ ArgListEntry Entry(V);
// Skip the first return-type Attribute to get to params.
Entry.setAttributes(CI, i - CI->arg_begin());
Args.push_back(Entry);
@@ -1148,9 +1140,12 @@ bool FastISel::lowerCall(const CallInst *CI) {
CLI.setCallee(RetTy, FuncTy, CI->getCalledOperand(), std::move(Args), *CI)
.setTailCall(IsTailCall);
- diagnoseDontCall(*CI);
+ if (lowerCallTo(CLI)) {
+ diagnoseDontCall(*CI);
+ return true;
+ }
- return lowerCallTo(CLI);
+ return false;
}
bool FastISel::selectCall(const User *I) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 8c8daef..1a63518 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -81,12 +81,11 @@ static unsigned countOperands(SDNode *Node, unsigned NumExpUses,
/// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an
/// implicit physical register output.
-void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
- Register SrcReg, VRBaseMapType &VRBaseMap) {
+void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
+ VRBaseMapType &VRBaseMap) {
Register VRBase;
if (SrcReg.isVirtual()) {
// Just use the input register directly!
- SDValue Op(Node, ResNo);
if (IsClone)
VRBaseMap.erase(Op);
bool isNew = VRBaseMap.insert(std::make_pair(Op, SrcReg)).second;
@@ -99,17 +98,15 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
// the CopyToReg'd destination register instead of creating a new vreg.
bool MatchReg = true;
const TargetRegisterClass *UseRC = nullptr;
- MVT VT = Node->getSimpleValueType(ResNo);
+ MVT VT = Op.getSimpleValueType();
// Stick to the preferred register classes for legal types.
if (TLI->isTypeLegal(VT))
- UseRC = TLI->getRegClassFor(VT, Node->isDivergent());
+ UseRC = TLI->getRegClassFor(VT, Op->isDivergent());
- for (SDNode *User : Node->users()) {
+ for (SDNode *User : Op->users()) {
bool Match = true;
- if (User->getOpcode() == ISD::CopyToReg &&
- User->getOperand(2).getNode() == Node &&
- User->getOperand(2).getResNo() == ResNo) {
+ if (User->getOpcode() == ISD::CopyToReg && User->getOperand(2) == Op) {
Register DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
if (DestReg.isVirtual()) {
VRBase = DestReg;
@@ -118,10 +115,8 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
Match = false;
} else {
for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
- SDValue Op = User->getOperand(i);
- if (Op.getNode() != Node || Op.getResNo() != ResNo)
+ if (User->getOperand(i) != Op)
continue;
- MVT VT = Node->getSimpleValueType(Op.getResNo());
if (VT == MVT::Other || VT == MVT::Glue)
continue;
Match = false;
@@ -170,11 +165,11 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
} else {
// Create the reg, emit the copy.
VRBase = MRI->createVirtualRegister(DstRC);
- BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
- VRBase).addReg(SrcReg);
+ BuildMI(*MBB, InsertPos, Op.getDebugLoc(), TII->get(TargetOpcode::COPY),
+ VRBase)
+ .addReg(SrcReg);
}
- SDValue Op(Node, ResNo);
if (IsClone)
VRBaseMap.erase(Op);
bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
@@ -1170,7 +1165,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
continue;
// This implicitly defined physreg has a use.
UsedRegs.push_back(Reg);
- EmitCopyFromReg(Node, i, IsClone, Reg, VRBaseMap);
+ EmitCopyFromReg(SDValue(Node, i), IsClone, Reg, VRBaseMap);
}
}
@@ -1178,7 +1173,9 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
if (Node->getValueType(Node->getNumValues()-1) == MVT::Glue) {
for (SDNode *F = Node->getGluedUser(); F; F = F->getGluedUser()) {
if (F->getOpcode() == ISD::CopyFromReg) {
- UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg());
+ Register Reg = cast<RegisterSDNode>(F->getOperand(1))->getReg();
+ if (Reg.isPhysical())
+ UsedRegs.push_back(Reg);
continue;
} else if (F->getOpcode() == ISD::CopyToReg) {
// Skip CopyToReg nodes that are internal to the glue chain.
@@ -1281,7 +1278,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
}
case ISD::CopyFromReg: {
Register SrcReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
- EmitCopyFromReg(Node, 0, IsClone, SrcReg, VRBaseMap);
+ EmitCopyFromReg(SDValue(Node, 0), IsClone, SrcReg, VRBaseMap);
break;
}
case ISD::EH_LABEL:
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
index 16d754c..b465de8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -48,8 +48,8 @@ private:
/// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an
/// implicit physical register output.
- void EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
- Register SrcReg, VRBaseMapType &VRBaseMap);
+ void EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
+ VRBaseMapType &VRBaseMap);
void CreateVirtualRegisters(SDNode *Node,
MachineInstrBuilder &MIB,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index ba0ab23..bcfc2c5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2181,12 +2181,10 @@ SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
bool isSigned) {
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
for (const SDValue &Op : Node->op_values()) {
EVT ArgVT = Op.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- Entry.Node = Op;
- Entry.Ty = ArgTy;
+ TargetLowering::ArgListEntry Entry(Op, ArgTy);
Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, isSigned);
Entry.IsZExt = !Entry.IsSExt;
Args.push_back(Entry);
@@ -2325,11 +2323,9 @@ SDValue SelectionDAGLegalize::ExpandBitCountingLibCall(
EVT IntVT =
EVT::getIntegerVT(*DAG.getContext(), DAG.getLibInfo().getIntSize());
- TargetLowering::ArgListEntry Arg;
EVT ArgVT = Op.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- Arg.Node = Op;
- Arg.Ty = ArgTy;
+ TargetLowering::ArgListEntry Arg(Op, ArgTy);
Arg.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, /*IsSigned=*/false);
Arg.IsZExt = !Arg.IsSExt;
@@ -2370,12 +2366,10 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
for (const SDValue &Op : Node->op_values()) {
EVT ArgVT = Op.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- Entry.Node = Op;
- Entry.Ty = ArgTy;
+ TargetLowering::ArgListEntry Entry(Op, ArgTy);
Entry.IsSExt = isSigned;
Entry.IsZExt = !isSigned;
Args.push_back(Entry);
@@ -2383,8 +2377,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
// Also pass the return address of the remainder.
SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
- Entry.Node = FIPtr;
- Entry.Ty = PointerType::getUnqual(RetTy->getContext());
+ TargetLowering::ArgListEntry Entry(
+ FIPtr, PointerType::getUnqual(RetTy->getContext()));
Entry.IsSExt = isSigned;
Entry.IsZExt = !isSigned;
Args.push_back(Entry);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 2cad36e..83bb1df 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -197,7 +197,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_Unary(SDNode *N, RTLIB::Libcall LC) {
SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
TargetLowering::MakeLibCallOptions CallOptions;
EVT OpVT = N->getOperand(0 + Offset).getValueType();
- CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+ CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0));
std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
CallOptions, SDLoc(N),
Chain);
@@ -218,7 +218,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_Binary(SDNode *N, RTLIB::Libcall LC) {
TargetLowering::MakeLibCallOptions CallOptions;
EVT OpsVT[2] = { N->getOperand(0 + Offset).getValueType(),
N->getOperand(1 + Offset).getValueType() };
- CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0));
std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Ops,
CallOptions, SDLoc(N),
Chain);
@@ -558,7 +558,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
EVT OpsVT[3] = { N->getOperand(0 + Offset).getValueType(),
N->getOperand(1 + Offset).getValueType(),
N->getOperand(2 + Offset).getValueType() };
- CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0));
std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG,
GetFPLibCall(N->getValueType(0),
RTLIB::FMA_F32,
@@ -642,7 +642,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
TargetLowering::MakeLibCallOptions CallOptions;
EVT OpVT = N->getOperand(IsStrict ? 1 : 0).getValueType();
- CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+ CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0));
std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
CallOptions, SDLoc(N),
Chain);
@@ -658,7 +658,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) {
SDValue Op = N->getOperand(0);
TargetLowering::MakeLibCallOptions CallOptions;
EVT OpsVT[1] = { N->getOperand(0).getValueType() };
- CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0));
SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, Op,
CallOptions, SDLoc(N)).first;
if (N->getValueType(0) == MVT::f32)
@@ -694,7 +694,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
TargetLowering::MakeLibCallOptions CallOptions;
EVT OpVT = N->getOperand(IsStrict ? 1 : 0).getValueType();
- CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+ CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0));
std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
CallOptions, SDLoc(N),
Chain);
@@ -742,7 +742,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_ExpOp(SDNode *N) {
TargetLowering::MakeLibCallOptions CallOptions;
EVT OpsVT[2] = { N->getOperand(0 + Offset).getValueType(),
N->getOperand(1 + Offset).getValueType() };
- CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0));
std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Ops,
CallOptions, SDLoc(N),
Chain);
@@ -779,7 +779,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FFREXP(SDNode *N) {
// TODO: setTypeListBeforeSoften can't properly express multiple return types,
// but we only really need to handle the 0th one for softening anyway.
- CallOptions.setTypeListBeforeSoften({OpsVT}, VT0, true)
+ CallOptions.setTypeListBeforeSoften({OpsVT}, VT0)
.setOpsTypeOverrides(CallOpsTypeOverrides);
auto [ReturnVal, Chain] = TLI.makeLibCall(DAG, LC, NVT0, Ops, CallOptions, DL,
@@ -828,7 +828,7 @@ bool DAGTypeLegalizer::SoftenFloatRes_UnaryWithTwoFPResults(
TargetLowering::MakeLibCallOptions CallOptions;
// TODO: setTypeListBeforeSoften can't properly express multiple return types,
// but since both returns have the same type it should be okay.
- CallOptions.setTypeListBeforeSoften({OpsVT}, VT, true)
+ CallOptions.setTypeListBeforeSoften({OpsVT}, VT)
.setOpsTypeOverrides(CallOpsTypeOverrides);
auto [ReturnVal, Chain] = TLI.makeLibCall(DAG, LC, NVT, Ops, CallOptions, DL,
@@ -1100,7 +1100,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
NVT, N->getOperand(IsStrict ? 1 : 0));
TargetLowering::MakeLibCallOptions CallOptions;
CallOptions.setIsSigned(Signed);
- CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+ CallOptions.setTypeListBeforeSoften(SVT, RVT);
std::pair<SDValue, SDValue> Tmp =
TLI.makeLibCall(DAG, LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
Op, CallOptions, dl, Chain);
@@ -1222,7 +1222,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
Op = GetSoftenedFloat(Op);
TargetLowering::MakeLibCallOptions CallOptions;
- CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+ CallOptions.setTypeListBeforeSoften(SVT, RVT);
std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, RVT, Op,
CallOptions, SDLoc(N),
Chain);
@@ -1298,7 +1298,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) {
Op = GetSoftenedFloat(Op);
SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
TargetLowering::MakeLibCallOptions CallOptions;
- CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+ CallOptions.setTypeListBeforeSoften(SVT, RVT);
std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
CallOptions, dl, Chain);
@@ -1453,7 +1453,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_Unary(SDNode *N, RTLIB::Libcall LC) {
SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
TargetLowering::MakeLibCallOptions CallOptions;
EVT OpVT = N->getOperand(0 + Offset).getValueType();
- CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+ CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0));
std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
CallOptions, SDLoc(N),
Chain);
@@ -1551,6 +1551,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break;
case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break;
+ case ISD::AssertNoFPClass: ExpandFloatRes_AssertNoFPClass(N, Lo, Hi); break;
case ISD::FABS: ExpandFloatRes_FABS(N, Lo, Hi); break;
case ISD::STRICT_FMINNUM:
case ISD::FMINNUM: ExpandFloatRes_FMINNUM(N, Lo, Hi); break;
@@ -1966,6 +1967,13 @@ void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi);
}
+void DAGTypeLegalizer::ExpandFloatRes_AssertNoFPClass(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // TODO: Handle ppcf128 by preserving AssertNoFPClass for one of the halves.

+ GetExpandedFloat(N->getOperand(0), Lo, Hi);
+}
+
void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo,
SDValue &Hi) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
@@ -3559,7 +3567,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FP_ROUND(SDNode *N) {
SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
Op = GetSoftenedFloat(Op);
TargetLowering::MakeLibCallOptions CallOptions;
- CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+ CallOptions.setTypeListBeforeSoften(SVT, RVT);
std::pair<SDValue, SDValue> Tmp =
TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, SDLoc(N), Chain);
if (IsStrict)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a5bd97a..90d62e6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -5260,20 +5260,18 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
MachinePointerInfo());
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
for (const SDValue &Op : N->op_values()) {
EVT ArgVT = Op.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- Entry.Node = Op;
- Entry.Ty = ArgTy;
+ TargetLowering::ArgListEntry Entry(Op, ArgTy);
Entry.IsSExt = true;
Entry.IsZExt = false;
Args.push_back(Entry);
}
// Also pass the address of the overflow check.
- Entry.Node = Temp;
- Entry.Ty = PointerType::getUnqual(PtrTy->getContext());
+ TargetLowering::ArgListEntry Entry(
+ Temp, PointerType::getUnqual(PtrTy->getContext()));
Entry.IsSExt = true;
Entry.IsZExt = false;
Args.push_back(Entry);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 63544e6..33fa301 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -681,6 +681,7 @@ private:
SDNode *N, RTLIB::Libcall LC, std::optional<unsigned> CallRetResNo = {});
// clang-format off
+ void ExpandFloatRes_AssertNoFPClass(SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FABS (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FACOS (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FASIN (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index d2ecc133..2ca9895 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -2223,17 +2223,13 @@ bool VectorLegalizer::tryExpandVecMathCall(SDNode *Node, RTLIB::Libcall LC,
SDLoc DL(Node);
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.IsSExt = false;
- Entry.IsZExt = false;
unsigned OpNum = 0;
for (auto &VFParam : OptVFInfo->Shape.Parameters) {
if (VFParam.ParamKind == VFParamKind::GlobalPredicate) {
EVT MaskVT = TLI.getSetCCResultType(DAG.getDataLayout(), *Ctx, VT);
- Entry.Node = DAG.getBoolConstant(true, DL, MaskVT, VT);
- Entry.Ty = MaskVT.getTypeForEVT(*Ctx);
- Args.push_back(Entry);
+ Args.emplace_back(DAG.getBoolConstant(true, DL, MaskVT, VT),
+ MaskVT.getTypeForEVT(*Ctx));
continue;
}
@@ -2241,9 +2237,7 @@ bool VectorLegalizer::tryExpandVecMathCall(SDNode *Node, RTLIB::Libcall LC,
if (VFParam.ParamKind != VFParamKind::Vector)
return false;
- Entry.Node = Node->getOperand(OpNum++);
- Entry.Ty = Ty;
- Args.push_back(Entry);
+ Args.emplace_back(Node->getOperand(OpNum++), Ty);
}
// Emit a call to the vector function.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b9e72c9..23102d3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1371,7 +1371,7 @@ void SelectionDAG::init(MachineFunction &NewMF,
const TargetLibraryInfo *LibraryInfo,
UniformityInfo *NewUA, ProfileSummaryInfo *PSIin,
BlockFrequencyInfo *BFIin, MachineModuleInfo &MMIin,
- FunctionVarLocs const *VarLocs, bool HasDivergency) {
+ FunctionVarLocs const *VarLocs) {
MF = &NewMF;
SDAGISelPass = PassPtr;
ORE = &NewORE;
@@ -1384,7 +1384,6 @@ void SelectionDAG::init(MachineFunction &NewMF,
BFI = BFIin;
MMI = &MMIin;
FnVarLocs = VarLocs;
- DivergentTarget = HasDivergency;
}
SelectionDAG::~SelectionDAG() {
@@ -2331,8 +2330,7 @@ SDValue SelectionDAG::getRegister(Register Reg, EVT VT) {
return SDValue(E, 0);
auto *N = newSDNode<RegisterSDNode>(Reg, VTs);
- N->SDNodeBits.IsDivergent =
- DivergentTarget && TLI->isSDNodeSourceOfDivergence(N, FLI, UA);
+ N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, UA);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -2578,18 +2576,12 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
}
TargetLowering::ArgListTy Args;
- auto AddArgListEntry = [&](SDValue Node, Type *Ty) {
- TargetLowering::ArgListEntry Entry{};
- Entry.Ty = Ty;
- Entry.Node = Node;
- Args.push_back(Entry);
- };
// Pass the arguments.
for (const SDValue &Op : Node->op_values()) {
EVT ArgVT = Op.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(Ctx);
- AddArgListEntry(Op, ArgTy);
+ Args.emplace_back(Op, ArgTy);
}
// Pass the output pointers.
@@ -2601,7 +2593,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
EVT ResVT = Node->getValueType(ResNo);
SDValue ResultPtr = ST ? ST->getBasePtr() : CreateStackTemporary(ResVT);
ResultPtrs[ResNo] = ResultPtr;
- AddArgListEntry(ResultPtr, PointerTy);
+ Args.emplace_back(ResultPtr, PointerTy);
}
SDLoc DL(Node);
@@ -2610,7 +2602,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
if (VD && VD->isMasked()) {
EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), Ctx, VT);
SDValue Mask = getBoolConstant(true, DL, MaskVT, VT);
- AddArgListEntry(Mask, MaskVT.getTypeForEVT(Ctx));
+ Args.emplace_back(Mask, MaskVT.getTypeForEVT(Ctx));
}
Type *RetType = CallRetResNo.has_value()
@@ -5462,6 +5454,83 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
}
return true;
+ case ISD::EXTRACT_SUBVECTOR: {
+ SDValue Src = Op.getOperand(0);
+ if (Src.getValueType().isScalableVector())
+ break;
+ uint64_t Idx = Op.getConstantOperandVal(1);
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx);
+ return isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrcElts, PoisonOnly,
+ Depth + 1);
+ }
+
+ case ISD::INSERT_SUBVECTOR: {
+ if (Op.getValueType().isScalableVector())
+ break;
+ SDValue Src = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ uint64_t Idx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ APInt DemandedSrcElts = DemandedElts;
+ DemandedSrcElts.clearBits(Idx, Idx + NumSubElts);
+
+ if (!!DemandedSubElts && !isGuaranteedNotToBeUndefOrPoison(
+ Sub, DemandedSubElts, PoisonOnly, Depth + 1))
+ return false;
+ if (!!DemandedSrcElts && !isGuaranteedNotToBeUndefOrPoison(
+ Src, DemandedSrcElts, PoisonOnly, Depth + 1))
+ return false;
+ return true;
+ }
+
+ case ISD::EXTRACT_VECTOR_ELT: {
+ SDValue Src = Op.getOperand(0);
+ auto *IndexC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.isFixedLengthVector() && IndexC &&
+ IndexC->getAPIntValue().ult(SrcVT.getVectorNumElements())) {
+ APInt DemandedSrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
+ IndexC->getZExtValue());
+ return isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrcElts, PoisonOnly,
+ Depth + 1);
+ }
+ break;
+ }
+
+ case ISD::INSERT_VECTOR_ELT: {
+ SDValue InVec = Op.getOperand(0);
+ SDValue InVal = Op.getOperand(1);
+ SDValue EltNo = Op.getOperand(2);
+ EVT VT = InVec.getValueType();
+ auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
+ if (IndexC && VT.isFixedLengthVector() &&
+ IndexC->getAPIntValue().ult(VT.getVectorNumElements())) {
+ if (DemandedElts[IndexC->getZExtValue()] &&
+ !isGuaranteedNotToBeUndefOrPoison(InVal, PoisonOnly, Depth + 1))
+ return false;
+ APInt InVecDemandedElts = DemandedElts;
+ InVecDemandedElts.clearBit(IndexC->getZExtValue());
+ if (!!InVecDemandedElts &&
+ !isGuaranteedNotToBeUndefOrPoison(InVec, InVecDemandedElts,
+ PoisonOnly, Depth + 1))
+ return false;
+ return true;
+ }
+ break;
+ }
+
+ case ISD::SCALAR_TO_VECTOR:
+ // Check upper (known undef) elements.
+ if (DemandedElts.ugt(1) && !PoisonOnly)
+ return false;
+ // Check element zero.
+ if (DemandedElts[0] && !isGuaranteedNotToBeUndefOrPoison(
+ Op.getOperand(0), PoisonOnly, Depth + 1))
+ return false;
+ return true;
+
case ISD::SPLAT_VECTOR:
return isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), PoisonOnly,
Depth + 1);
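The vector cases added above all work by reshaping the demanded-elements mask before recursing into an operand. A standalone model of the EXTRACT_SUBVECTOR and INSERT_SUBVECTOR bookkeeping using plain bitmasks, where bit i stands for element i (the real code uses APInt with zext/shl, extractBits, and clearBits):

    #include <cassert>
    #include <cstdint>

    int main() {
      // EXTRACT_SUBVECTOR: a 4-element result taken from an 8-element source
      // at index 2. Demanding result elements {1, 3} means demanding source
      // elements {3, 5}: widen the mask to the source width, then shift.
      uint32_t DemandedElts = 0b1010;
      unsigned Idx = 2;
      uint32_t DemandedSrcElts = DemandedElts << Idx;
      assert(DemandedSrcElts == 0b101000u);

      // INSERT_SUBVECTOR: a 2-element subvector inserted at index 4 of an
      // 8-element vector. The inserted lanes are answered by Sub; the other
      // demanded lanes are answered by the original Src.
      uint32_t Demanded = 0b11110000;
      unsigned InsIdx = 4, NumSubElts = 2;
      uint32_t SubMask = ((1u << NumSubElts) - 1u) << InsIdx; // lanes 4..5
      uint32_t DemandedSub = (Demanded & SubMask) >> InsIdx;  // 0b11
      uint32_t DemandedSrc = Demanded & ~SubMask;             // lanes 6..7
      assert(DemandedSub == 0b11u && DemandedSrc == 0b11000000u);
      return 0;
    }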
@@ -5484,6 +5553,52 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
return true;
}
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+    // The shift-amount operand is checked by canCreateUndefOrPoison, so it
+    // is enough to check operand 0 when Op can't create undef/poison.
+ return !canCreateUndefOrPoison(Op, DemandedElts, PoisonOnly,
+ /*ConsiderFlags*/ true, Depth) &&
+ isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedElts,
+ PoisonOnly, Depth + 1);
+
+ case ISD::BSWAP:
+ case ISD::CTPOP:
+ case ISD::BITREVERSE:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::TRUNCATE:
+ case ISD::VSELECT: {
+    // If Op can't create undef/poison and none of its operands are
+    // undef/poison, then Op is never undef/poison. Unlike the more general
+    // check below (outside the switch), this handles elementwise operations,
+    // for which the DemandedElts mask is valid for all operands.
+ return !canCreateUndefOrPoison(Op, DemandedElts, PoisonOnly,
+ /*ConsiderFlags*/ true, Depth) &&
+ all_of(Op->ops(), [&](SDValue V) {
+ return isGuaranteedNotToBeUndefOrPoison(V, DemandedElts,
+ PoisonOnly, Depth + 1);
+ });
+ }
+
// TODO: Search for noundef attributes from library functions.
// TODO: Pointers dereferenced by ISD::LOAD/STORE ops are noundef.
@@ -5549,8 +5664,10 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::ABDS:
case ISD::SMIN:
case ISD::SMAX:
+ case ISD::SCMP:
case ISD::UMIN:
case ISD::UMAX:
+ case ISD::UCMP:
case ISD::AND:
case ISD::XOR:
case ISD::ROTL:
@@ -5630,7 +5747,11 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::FDIV:
case ISD::FREM:
case ISD::FCOPYSIGN:
+ case ISD::FMA:
+ case ISD::FMAD:
case ISD::FP_EXTEND:
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
// No poison except from flags (which is handled above)
return false;
@@ -8896,18 +9017,11 @@ SelectionDAG::getMemcmp(SDValue Chain, const SDLoc &dl, SDValue Mem0,
if (!LibCallName)
return {};
- // Emit a library call.
- auto GetEntry = [](Type *Ty, SDValue &SDV) {
- TargetLowering::ArgListEntry E;
- E.Ty = Ty;
- E.Node = SDV;
- return E;
- };
-
PointerType *PT = PointerType::getUnqual(*getContext());
TargetLowering::ArgListTy Args = {
- GetEntry(PT, Mem0), GetEntry(PT, Mem1),
- GetEntry(getDataLayout().getIntPtrType(*getContext()), Size)};
+ {Mem0, PT},
+ {Mem1, PT},
+ {Size, getDataLayout().getIntPtrType(*getContext())}};
TargetLowering::CallLoweringInfo CLI(*this);
bool IsTailCall = false;
@@ -8978,13 +9092,10 @@ SDValue SelectionDAG::getMemcpy(
// Emit a library call.
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = PointerType::getUnqual(*getContext());
- Entry.Node = Dst; Args.push_back(Entry);
- Entry.Node = Src; Args.push_back(Entry);
-
- Entry.Ty = getDataLayout().getIntPtrType(*getContext());
- Entry.Node = Size; Args.push_back(Entry);
+ Type *PtrTy = PointerType::getUnqual(*getContext());
+ Args.emplace_back(Dst, PtrTy);
+ Args.emplace_back(Src, PtrTy);
+ Args.emplace_back(Size, getDataLayout().getIntPtrType(*getContext()));
// FIXME: pass in SDLoc
TargetLowering::CallLoweringInfo CLI(*this);
bool IsTailCall = false;
@@ -9022,17 +9133,10 @@ SDValue SelectionDAG::getAtomicMemcpy(SDValue Chain, const SDLoc &dl,
MachinePointerInfo SrcPtrInfo) {
// Emit a library call.
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = getDataLayout().getIntPtrType(*getContext());
- Entry.Node = Dst;
- Args.push_back(Entry);
-
- Entry.Node = Src;
- Args.push_back(Entry);
-
- Entry.Ty = SizeTy;
- Entry.Node = Size;
- Args.push_back(Entry);
+ Type *ArgTy = getDataLayout().getIntPtrType(*getContext());
+ Args.emplace_back(Dst, ArgTy);
+ Args.emplace_back(Src, ArgTy);
+ Args.emplace_back(Size, SizeTy);
RTLIB::Libcall LibraryCall =
RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElemSz);
@@ -9095,13 +9199,10 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
// Emit a library call.
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = PointerType::getUnqual(*getContext());
- Entry.Node = Dst; Args.push_back(Entry);
- Entry.Node = Src; Args.push_back(Entry);
-
- Entry.Ty = getDataLayout().getIntPtrType(*getContext());
- Entry.Node = Size; Args.push_back(Entry);
+ Type *PtrTy = PointerType::getUnqual(*getContext());
+ Args.emplace_back(Dst, PtrTy);
+ Args.emplace_back(Src, PtrTy);
+ Args.emplace_back(Size, getDataLayout().getIntPtrType(*getContext()));
// FIXME: pass in SDLoc
TargetLowering::CallLoweringInfo CLI(*this);
@@ -9139,17 +9240,10 @@ SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl,
MachinePointerInfo SrcPtrInfo) {
// Emit a library call.
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = getDataLayout().getIntPtrType(*getContext());
- Entry.Node = Dst;
- Args.push_back(Entry);
-
- Entry.Node = Src;
- Args.push_back(Entry);
-
- Entry.Ty = SizeTy;
- Entry.Node = Size;
- Args.push_back(Entry);
+ Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext());
+ Args.emplace_back(Dst, IntPtrTy);
+ Args.emplace_back(Src, IntPtrTy);
+ Args.emplace_back(Size, SizeTy);
RTLIB::Libcall LibraryCall =
RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElemSz);
@@ -9226,28 +9320,20 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
const char *BzeroName = getTargetLoweringInfo().getLibcallName(RTLIB::BZERO);
- // Helper function to create an Entry from Node and Type.
- const auto CreateEntry = [](SDValue Node, Type *Ty) {
- TargetLowering::ArgListEntry Entry;
- Entry.Node = Node;
- Entry.Ty = Ty;
- return Entry;
- };
-
bool UseBZero = isNullConstant(Src) && BzeroName;
// If zeroing out and bzero is present, use it.
if (UseBZero) {
TargetLowering::ArgListTy Args;
- Args.push_back(CreateEntry(Dst, PointerType::getUnqual(Ctx)));
- Args.push_back(CreateEntry(Size, DL.getIntPtrType(Ctx)));
+ Args.emplace_back(Dst, PointerType::getUnqual(Ctx));
+ Args.emplace_back(Size, DL.getIntPtrType(Ctx));
CLI.setLibCallee(
TLI->getLibcallCallingConv(RTLIB::BZERO), Type::getVoidTy(Ctx),
getExternalSymbol(BzeroName, TLI->getPointerTy(DL)), std::move(Args));
} else {
TargetLowering::ArgListTy Args;
- Args.push_back(CreateEntry(Dst, PointerType::getUnqual(Ctx)));
- Args.push_back(CreateEntry(Src, Src.getValueType().getTypeForEVT(Ctx)));
- Args.push_back(CreateEntry(Size, DL.getIntPtrType(Ctx)));
+ Args.emplace_back(Dst, PointerType::getUnqual(Ctx));
+ Args.emplace_back(Src, Src.getValueType().getTypeForEVT(Ctx));
+ Args.emplace_back(Size, DL.getIntPtrType(Ctx));
CLI.setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
Dst.getValueType().getTypeForEVT(Ctx),
getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
@@ -9276,18 +9362,9 @@ SDValue SelectionDAG::getAtomicMemset(SDValue Chain, const SDLoc &dl,
MachinePointerInfo DstPtrInfo) {
// Emit a library call.
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = getDataLayout().getIntPtrType(*getContext());
- Entry.Node = Dst;
- Args.push_back(Entry);
-
- Entry.Ty = Type::getInt8Ty(*getContext());
- Entry.Node = Value;
- Args.push_back(Entry);
-
- Entry.Ty = SizeTy;
- Entry.Node = Size;
- Args.push_back(Entry);
+ Args.emplace_back(Dst, getDataLayout().getIntPtrType(*getContext()));
+ Args.emplace_back(Value, Type::getInt8Ty(*getContext()));
+ Args.emplace_back(Size, SizeTy);
RTLIB::Libcall LibraryCall =
RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElemSz);
@@ -12264,8 +12341,6 @@ static bool gluePropagatesDivergence(const SDNode *Node) {
}
bool SelectionDAG::calculateDivergence(SDNode *N) {
- if (!DivergentTarget)
- return false;
if (TLI->isSDNodeAlwaysUniform(N)) {
assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, UA) &&
"Conflicting divergence information!");
@@ -12285,8 +12360,6 @@ bool SelectionDAG::calculateDivergence(SDNode *N) {
}
void SelectionDAG::updateDivergence(SDNode *N) {
- if (!DivergentTarget)
- return;
SmallVector<SDNode *, 16> Worklist(1, N);
do {
N = Worklist.pop_back_val();
@@ -13847,20 +13920,16 @@ void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
Ops[I].setInitial(Vals[I]);
EVT VT = Ops[I].getValueType();
- // Take care of the Node's operands iff target has divergence
// Skip Chain. It does not carry divergence.
- if (DivergentTarget && VT != MVT::Other &&
+ if (VT != MVT::Other &&
(VT != MVT::Glue || gluePropagatesDivergence(Ops[I].getNode())) &&
Ops[I].getNode()->isDivergent()) {
- // Node is going to be divergent if at least one of its operand is
- // divergent, unless it belongs to the "AlwaysUniform" exemptions.
IsDivergent = true;
}
}
Node->NumOperands = Vals.size();
Node->OperandList = Ops;
- // Check the divergence of the Node itself.
- if (DivergentTarget && !TLI->isSDNodeAlwaysUniform(Node)) {
+ if (!TLI->isSDNodeAlwaysUniform(Node)) {
IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, UA);
Node->SDNodeBits.IsDivergent = IsDivergent;
}
@@ -13950,10 +14019,7 @@ SDValue SelectionDAG::makeStateFunctionCall(unsigned LibFunc, SDValue Ptr,
const SDLoc &DLoc) {
assert(InChain.getValueType() == MVT::Other && "Expected token chain");
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Node = Ptr;
- Entry.Ty = Ptr.getValueType().getTypeForEVT(*getContext());
- Args.push_back(Entry);
+ Args.emplace_back(Ptr, Ptr.getValueType().getTypeForEVT(*getContext()));
RTLIB::Libcall LC = static_cast<RTLIB::Libcall>(LibFunc);
SDValue Callee = getExternalSymbol(TLI->getLibcallName(LC),
TLI->getPointerTy(getDataLayout()));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index f5f5c14..901f10d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1837,11 +1837,8 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
getValue(CPA->getDiscriminator()));
}
- if (isa<ConstantPointerNull>(C)) {
- unsigned AS = V->getType()->getPointerAddressSpace();
- return DAG.getConstant(0, getCurSDLoc(),
- TLI.getPointerTy(DAG.getDataLayout(), AS));
- }
+ if (isa<ConstantPointerNull>(C))
+ return DAG.getConstant(0, getCurSDLoc(), VT);
if (match(C, m_VScale()))
return DAG.getVScale(getCurSDLoc(), VT, APInt(VT.getSizeInBits(), 1));
@@ -2211,9 +2208,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
MVT::Other, Chains);
} else if (I.getNumOperands() != 0) {
- SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs);
- unsigned NumValues = ValueVTs.size();
+ SmallVector<Type *, 4> Types;
+ ComputeValueTypes(DL, I.getOperand(0)->getType(), Types);
+ unsigned NumValues = Types.size();
if (NumValues) {
SDValue RetOp = getValue(I.getOperand(0));
@@ -2233,7 +2230,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
bool RetInReg = F->getAttributes().hasRetAttr(Attribute::InReg);
for (unsigned j = 0; j != NumValues; ++j) {
- EVT VT = ValueVTs[j];
+ EVT VT = TLI.getValueType(DL, Types[j]);
if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind);
@@ -2273,8 +2270,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
Flags.setNoExt();
for (unsigned i = 0; i < NumParts; ++i) {
- Outs.push_back(ISD::OutputArg(
- Flags, Parts[i].getValueType().getSimpleVT(), VT, 0, 0));
+ Outs.push_back(ISD::OutputArg(Flags,
+ Parts[i].getValueType().getSimpleVT(),
+ VT, Types[j], 0, 0));
OutVals.push_back(Parts[i]);
}
}
@@ -2292,6 +2290,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
Flags.setSwiftError();
Outs.push_back(ISD::OutputArg(Flags, /*vt=*/TLI.getPointerTy(DL),
/*argvt=*/EVT(TLI.getPointerTy(DL)),
+ PointerType::getUnqual(*DAG.getContext()),
/*origidx=*/1, /*partOffs=*/0));
// Create SDNode for the swifterror virtual register.
OutVals.push_back(
@@ -3107,9 +3106,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
assert(FnTy->getNumParams() == 1 && "Invalid function signature");
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Node = GuardVal;
- Entry.Ty = FnTy->getParamType(0);
+ TargetLowering::ArgListEntry Entry(GuardVal, FnTy->getParamType(0));
if (GuardCheckFn->hasParamAttribute(0, Attribute::AttrKind::InReg))
Entry.IsInReg = true;
Args.push_back(Entry);
@@ -3206,9 +3203,7 @@ void SelectionDAGBuilder::visitSPDescriptorFailure(
assert(FnTy->getNumParams() == 1 && "Invalid function signature");
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Node = GuardVal;
- Entry.Ty = FnTy->getParamType(0);
+ TargetLowering::ArgListEntry Entry(GuardVal, FnTy->getParamType(0));
if (GuardCheckFn->hasParamAttribute(0, Attribute::AttrKind::InReg))
Entry.IsInReg = true;
Args.push_back(Entry);
@@ -3578,7 +3573,7 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB;
// Update machine-CFG edges with unique successors.
- SmallSet<BasicBlock*, 32> Done;
+ SmallPtrSet<BasicBlock *, 32> Done;
for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) {
BasicBlock *BB = I.getSuccessor(i);
bool Inserted = Done.insert(BB).second;
@@ -3977,6 +3972,11 @@ void SelectionDAGBuilder::visitSIToFP(const User &I) {
setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N));
}
+void SelectionDAGBuilder::visitPtrToAddr(const User &I) {
+ // FIXME: this is not correct for pointers with addr width != pointer width
+ visitPtrToInt(I);
+}
+
void SelectionDAGBuilder::visitPtrToInt(const User &I) {
// What to do depends on the size of the integer and the size of the pointer.
// We can either truncate, zero extend, or no-op, accordingly.
@@ -4902,9 +4902,8 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
// extract the splat value and use it as a uniform base.
// In all other cases the function returns 'false'.
static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index,
- ISD::MemIndexType &IndexType, SDValue &Scale,
- SelectionDAGBuilder *SDB, const BasicBlock *CurBB,
- uint64_t ElemSize) {
+ SDValue &Scale, SelectionDAGBuilder *SDB,
+ const BasicBlock *CurBB, uint64_t ElemSize) {
SelectionDAG& DAG = SDB->DAG;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const DataLayout &DL = DAG.getDataLayout();
@@ -4922,7 +4921,6 @@ static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index,
ElementCount NumElts = cast<VectorType>(Ptr->getType())->getElementCount();
EVT VT = EVT::getVectorVT(*DAG.getContext(), TLI.getPointerTy(DL), NumElts);
Index = DAG.getConstant(0, SDB->getCurSDLoc(), VT);
- IndexType = ISD::SIGNED_SCALED;
Scale = DAG.getTargetConstant(1, SDB->getCurSDLoc(), TLI.getPointerTy(DL));
return true;
}
@@ -4952,7 +4950,6 @@ static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index,
Base = SDB->getValue(BasePtr);
Index = SDB->getValue(IndexVal);
- IndexType = ISD::SIGNED_SCALED;
Scale =
DAG.getTargetConstant(ScaleVal, SDB->getCurSDLoc(), TLI.getPointerTy(DL));
@@ -4974,9 +4971,8 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
SDValue Base;
SDValue Index;
- ISD::MemIndexType IndexType;
SDValue Scale;
- bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+ bool UniformBase = getUniformBase(Ptr, Base, Index, Scale, this,
I.getParent(), VT.getScalarStoreSize());
unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
@@ -4986,8 +4982,8 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
- IndexType = ISD::SIGNED_SCALED;
- Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+ Scale =
+ DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
EVT IdxVT = Index.getValueType();
@@ -4999,7 +4995,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
SDValue Ops[] = { getMemoryRoot(), Src0, Mask, Base, Index, Scale };
SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl,
- Ops, MMO, IndexType, false);
+ Ops, MMO, ISD::SIGNED_SCALED, false);
DAG.setRoot(Scatter);
setValue(&I, Scatter);
}
@@ -5092,9 +5088,8 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
SDValue Root = DAG.getRoot();
SDValue Base;
SDValue Index;
- ISD::MemIndexType IndexType;
SDValue Scale;
- bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+ bool UniformBase = getUniformBase(Ptr, Base, Index, Scale, this,
I.getParent(), VT.getScalarStoreSize());
unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
@@ -5105,8 +5100,8 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
- IndexType = ISD::SIGNED_SCALED;
- Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+ Scale =
+ DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
EVT IdxVT = Index.getValueType();
@@ -5117,8 +5112,9 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
}
SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale };
- SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl,
- Ops, MMO, IndexType, ISD::NON_EXTLOAD);
+ SDValue Gather =
+ DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl, Ops, MMO,
+ ISD::SIGNED_SCALED, ISD::NON_EXTLOAD);
PendingLoads.push_back(Gather.getValue(1));
setValue(&I, Gather);
@@ -6431,9 +6427,8 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
SDValue Root = DAG.getRoot();
SDValue Base;
SDValue Index;
- ISD::MemIndexType IndexType;
SDValue Scale;
- bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+ bool UniformBase = getUniformBase(Ptr, Base, Index, Scale, this,
I.getParent(), VT.getScalarStoreSize());
unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
@@ -6446,7 +6441,6 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
- IndexType = ISD::SIGNED_SCALED;
Scale =
DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
@@ -6462,7 +6456,7 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
SDValue Ops[] = {Root, Inc, Mask, Base, Index, Scale, ID};
SDValue Histogram = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), VT, sdl,
- Ops, MMO, IndexType);
+ Ops, MMO, ISD::SIGNED_SCALED);
setValue(&I, Histogram);
DAG.setRoot(Histogram);
@@ -7514,10 +7508,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
}
TargetLowering::ArgListTy Args;
if (Intrinsic == Intrinsic::ubsantrap) {
- Args.push_back(TargetLoweringBase::ArgListEntry());
- Args[0].Val = I.getArgOperand(0);
- Args[0].Node = getValue(Args[0].Val);
- Args[0].Ty = Args[0].Val->getType();
+ Value *Arg = I.getArgOperand(0);
+ Args.emplace_back(Arg, getValue(Arg));
}
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -7597,7 +7589,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
if (TM.getOptLevel() == CodeGenOptLevel::None)
return;
- const AllocaInst *LifetimeObject = dyn_cast<AllocaInst>(I.getArgOperand(1));
+ const AllocaInst *LifetimeObject = dyn_cast<AllocaInst>(I.getArgOperand(0));
if (!LifetimeObject)
return;
@@ -7946,9 +7938,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
Args.reserve(3);
for (unsigned Idx : {2, 3, 1}) {
- TargetLowering::ArgListEntry Arg;
- Arg.Node = getValue(I.getOperand(Idx));
- Arg.Ty = I.getOperand(Idx)->getType();
+ TargetLowering::ArgListEntry Arg(getValue(I.getOperand(Idx)),
+ I.getOperand(Idx)->getType());
Arg.setAttributes(&I, Idx);
Args.push_back(Arg);
}
@@ -7959,9 +7950,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
// Forward the flags and any additional arguments.
for (unsigned Idx = 4; Idx < I.arg_size(); ++Idx) {
- TargetLowering::ArgListEntry Arg;
- Arg.Node = getValue(I.getOperand(Idx));
- Arg.Ty = I.getOperand(Idx)->getType();
+ TargetLowering::ArgListEntry Arg(getValue(I.getOperand(Idx)),
+ I.getOperand(Idx)->getType());
Arg.setAttributes(&I, Idx);
Args.push_back(Arg);
}
@@ -7983,6 +7973,42 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
HasTailCall = true;
return;
}
+ case Intrinsic::amdgcn_call_whole_wave: {
+ TargetLowering::ArgListTy Args;
+
+ // The first argument is the callee. Skip it when assembling the call args.
+ for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) {
+ TargetLowering::ArgListEntry Arg(getValue(I.getArgOperand(Idx)),
+ I.getArgOperand(Idx)->getType());
+ Arg.setAttributes(&I, Idx);
+ Args.push_back(Arg);
+ }
+
+ SDValue ConvControlToken;
+ if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+ auto *Token = Bundle->Inputs[0].get();
+ ConvControlToken = getValue(Token);
+ }
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(getCurSDLoc())
+ .setChain(getRoot())
+ .setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(),
+ getValue(I.getArgOperand(0)), std::move(Args))
+ .setTailCall(false)
+ .setIsPreallocated(
+ I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0)
+ .setConvergent(I.isConvergent())
+ .setConvergenceControlToken(ConvControlToken);
+ CLI.CB = &I;
+
+ std::pair<SDValue, SDValue> Result =
+ lowerInvokable(CLI, /*EHPadBB=*/nullptr);
+
+ if (Result.first.getNode())
+ setValue(&I, Result.first);
+ return;
+ }
case Intrinsic::ptrmask: {
SDValue Ptr = getValue(I.getOperand(0));
SDValue Mask = getValue(I.getOperand(1));
@@ -8487,14 +8513,12 @@ void SelectionDAGBuilder::visitVPGather(
MachinePointerInfo(AS), MachineMemOperand::MOLoad,
LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
SDValue Base, Index, Scale;
- ISD::MemIndexType IndexType;
- bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale,
- this, VPIntrin.getParent(),
- VT.getScalarStoreSize());
+ bool UniformBase =
+ getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(),
+ VT.getScalarStoreSize());
if (!UniformBase) {
Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(PtrOperand);
- IndexType = ISD::SIGNED_SCALED;
Scale = DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout()));
}
EVT IdxVT = Index.getValueType();
@@ -8506,7 +8530,7 @@ void SelectionDAGBuilder::visitVPGather(
LD = DAG.getGatherVP(
DAG.getVTList(VT, MVT::Other), VT, DL,
{DAG.getRoot(), Base, Index, Scale, OpValues[1], OpValues[2]}, MMO,
- IndexType);
+ ISD::SIGNED_SCALED);
PendingLoads.push_back(LD.getValue(1));
setValue(&VPIntrin, LD);
}
@@ -8550,16 +8574,13 @@ void SelectionDAGBuilder::visitVPScatter(
MachinePointerInfo(AS), MachineMemOperand::MOStore,
LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
SDValue Base, Index, Scale;
- ISD::MemIndexType IndexType;
- bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale,
- this, VPIntrin.getParent(),
- VT.getScalarStoreSize());
+ bool UniformBase =
+ getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(),
+ VT.getScalarStoreSize());
if (!UniformBase) {
Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(PtrOperand);
- IndexType = ISD::SIGNED_SCALED;
- Scale =
- DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout()));
+ Scale = DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout()));
}
EVT IdxVT = Index.getValueType();
EVT EltTy = IdxVT.getVectorElementType();
@@ -8570,7 +8591,7 @@ void SelectionDAGBuilder::visitVPScatter(
ST = DAG.getScatterVP(DAG.getVTList(MVT::Other), VT, DL,
{getMemoryRoot(), OpValues[0], Base, Index, Scale,
OpValues[2], OpValues[3]},
- MMO, IndexType);
+ MMO, ISD::SIGNED_SCALED);
DAG.setRoot(ST);
setValue(&VPIntrin, ST);
}
@@ -8912,7 +8933,6 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
}
for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) {
- TargetLowering::ArgListEntry Entry;
const Value *V = *I;
// Skip empty types
@@ -8920,8 +8940,7 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
continue;
SDValue ArgNode = getValue(V);
- Entry.Node = ArgNode; Entry.Ty = V->getType();
-
+ TargetLowering::ArgListEntry Entry(ArgNode, V->getType());
Entry.setAttributes(&CB, I - CB.arg_begin());
// Use swifterror virtual register as input to the call.
@@ -8945,11 +8964,8 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
// If call site has a cfguardtarget operand bundle, create and add an
// additional ArgListEntry.
if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_cfguardtarget)) {
- TargetLowering::ArgListEntry Entry;
Value *V = Bundle->Inputs[0];
- SDValue ArgNode = getValue(V);
- Entry.Node = ArgNode;
- Entry.Ty = V->getType();
+ TargetLowering::ArgListEntry Entry(V, getValue(V));
Entry.IsCFGuardTarget = true;
Args.push_back(Entry);
}
@@ -10612,9 +10628,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo(
assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
- TargetLowering::ArgListEntry Entry;
- Entry.Node = getValue(V);
- Entry.Ty = V->getType();
+ TargetLowering::ArgListEntry Entry(getValue(V), V->getType());
Entry.setAttributes(Call, ArgI);
Args.push_back(Entry);
}
@@ -10974,27 +10988,42 @@ static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
/// migrated to using LowerCall, this hook should be integrated into SDISel.
std::pair<SDValue, SDValue>
TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
+ LLVMContext &Context = CLI.RetTy->getContext();
+
// Handle the incoming return values from the call.
CLI.Ins.clear();
- SmallVector<EVT, 4> RetTys;
+ SmallVector<Type *, 4> RetOrigTys;
SmallVector<TypeSize, 4> Offsets;
auto &DL = CLI.DAG.getDataLayout();
- ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets);
+ ComputeValueTypes(DL, CLI.OrigRetTy, RetOrigTys, &Offsets);
+
+ SmallVector<EVT, 4> RetVTs;
+ if (CLI.RetTy != CLI.OrigRetTy) {
+ assert(RetOrigTys.size() == 1 &&
+ "Only supported for non-aggregate returns");
+ RetVTs.push_back(getValueType(DL, CLI.RetTy));
+ } else {
+ for (Type *Ty : RetOrigTys)
+ RetVTs.push_back(getValueType(DL, Ty));
+ }
if (CLI.IsPostTypeLegalization) {
// If we are lowering a libcall after legalization, split the return type.
- SmallVector<EVT, 4> OldRetTys;
+ SmallVector<Type *, 4> OldRetOrigTys;
+ SmallVector<EVT, 4> OldRetVTs;
SmallVector<TypeSize, 4> OldOffsets;
- RetTys.swap(OldRetTys);
+ RetOrigTys.swap(OldRetOrigTys);
+ RetVTs.swap(OldRetVTs);
Offsets.swap(OldOffsets);
- for (size_t i = 0, e = OldRetTys.size(); i != e; ++i) {
- EVT RetVT = OldRetTys[i];
+ for (size_t i = 0, e = OldRetVTs.size(); i != e; ++i) {
+ EVT RetVT = OldRetVTs[i];
uint64_t Offset = OldOffsets[i];
- MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), RetVT);
- unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), RetVT);
+ MVT RegisterVT = getRegisterType(Context, RetVT);
+ unsigned NumRegs = getNumRegisters(Context, RetVT);
unsigned RegisterVTByteSZ = RegisterVT.getSizeInBits() / 8;
- RetTys.append(NumRegs, RegisterVT);
+ RetOrigTys.append(NumRegs, OldRetOrigTys[i]);
+ RetVTs.append(NumRegs, RegisterVT);
for (unsigned j = 0; j != NumRegs; ++j)
Offsets.push_back(TypeSize::getFixed(Offset + j * RegisterVTByteSZ));
}
@@ -11005,7 +11034,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
bool CanLowerReturn =
this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(),
- CLI.IsVarArg, Outs, CLI.RetTy->getContext(), CLI.RetTy);
+ CLI.IsVarArg, Outs, Context, CLI.RetTy);
SDValue DemoteStackSlot;
int DemoteStackIdx = -100;
@@ -11018,30 +11047,16 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
MachineFunction &MF = CLI.DAG.getMachineFunction();
DemoteStackIdx =
MF.getFrameInfo().CreateStackObject(TySize, Alignment, false);
- Type *StackSlotPtrType =
- PointerType::get(CLI.RetTy->getContext(), DL.getAllocaAddrSpace());
+ Type *StackSlotPtrType = PointerType::get(Context, DL.getAllocaAddrSpace());
DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getFrameIndexTy(DL));
- ArgListEntry Entry;
- Entry.Node = DemoteStackSlot;
- Entry.Ty = StackSlotPtrType;
- Entry.IsSExt = false;
- Entry.IsZExt = false;
- Entry.IsInReg = false;
+ ArgListEntry Entry(DemoteStackSlot, StackSlotPtrType);
Entry.IsSRet = true;
- Entry.IsNest = false;
- Entry.IsByVal = false;
- Entry.IsByRef = false;
- Entry.IsReturned = false;
- Entry.IsSwiftSelf = false;
- Entry.IsSwiftAsync = false;
- Entry.IsSwiftError = false;
- Entry.IsCFGuardTarget = false;
Entry.Alignment = Alignment;
CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
CLI.NumFixedArgs += 1;
CLI.getArgs()[0].IndirectType = CLI.RetTy;
- CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());
+ CLI.RetTy = CLI.OrigRetTy = Type::getVoidTy(Context);
// sret demotion isn't compatible with tail-calls, since the sret argument
// points into the callers stack frame.
@@ -11049,36 +11064,32 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
} else {
bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
CLI.RetTy, CLI.CallConv, CLI.IsVarArg, DL);
- for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+ for (unsigned I = 0, E = RetVTs.size(); I != E; ++I) {
ISD::ArgFlagsTy Flags;
if (NeedsRegBlock) {
Flags.setInConsecutiveRegs();
- if (I == RetTys.size() - 1)
+ if (I == RetVTs.size() - 1)
Flags.setInConsecutiveRegsLast();
}
- EVT VT = RetTys[I];
- MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
- CLI.CallConv, VT);
- unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
- CLI.CallConv, VT);
+ EVT VT = RetVTs[I];
+ MVT RegisterVT = getRegisterTypeForCallingConv(Context, CLI.CallConv, VT);
+ unsigned NumRegs =
+ getNumRegistersForCallingConv(Context, CLI.CallConv, VT);
for (unsigned i = 0; i != NumRegs; ++i) {
- ISD::InputArg MyFlags;
- MyFlags.Flags = Flags;
- MyFlags.VT = RegisterVT;
- MyFlags.ArgVT = VT;
- MyFlags.Used = CLI.IsReturnValueUsed;
+ ISD::InputArg Ret(Flags, RegisterVT, VT, RetOrigTys[I],
+ CLI.IsReturnValueUsed, ISD::InputArg::NoArgIndex, 0);
if (CLI.RetTy->isPointerTy()) {
- MyFlags.Flags.setPointer();
- MyFlags.Flags.setPointerAddrSpace(
+ Ret.Flags.setPointer();
+ Ret.Flags.setPointerAddrSpace(
cast<PointerType>(CLI.RetTy)->getAddressSpace());
}
if (CLI.RetSExt)
- MyFlags.Flags.setSExt();
+ Ret.Flags.setSExt();
if (CLI.RetZExt)
- MyFlags.Flags.setZExt();
+ Ret.Flags.setZExt();
if (CLI.IsInReg)
- MyFlags.Flags.setInReg();
- CLI.Ins.push_back(MyFlags);
+ Ret.Flags.setInReg();
+ CLI.Ins.push_back(Ret);
}
}
}
@@ -11088,11 +11099,12 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
if (supportSwiftError()) {
for (const ArgListEntry &Arg : Args) {
if (Arg.IsSwiftError) {
- ISD::InputArg MyFlags;
- MyFlags.VT = getPointerTy(DL);
- MyFlags.ArgVT = EVT(getPointerTy(DL));
- MyFlags.Flags.setSwiftError();
- CLI.Ins.push_back(MyFlags);
+ ISD::ArgFlagsTy Flags;
+ Flags.setSwiftError();
+ ISD::InputArg Ret(Flags, getPointerTy(DL), EVT(getPointerTy(DL)),
+ PointerType::getUnqual(Context),
+ /*Used=*/true, ISD::InputArg::NoArgIndex, 0);
+ CLI.Ins.push_back(Ret);
}
}
}
@@ -11101,18 +11113,24 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
CLI.Outs.clear();
CLI.OutVals.clear();
for (unsigned i = 0, e = Args.size(); i != e; ++i) {
- SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);
+ SmallVector<Type *, 4> OrigArgTys;
+ ComputeValueTypes(DL, Args[i].OrigTy, OrigArgTys);
// FIXME: Split arguments if CLI.IsPostTypeLegalization
Type *FinalType = Args[i].Ty;
if (Args[i].IsByVal)
FinalType = Args[i].IndirectType;
bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
FinalType, CLI.CallConv, CLI.IsVarArg, DL);
- for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues;
+ for (unsigned Value = 0, NumValues = OrigArgTys.size(); Value != NumValues;
++Value) {
- EVT VT = ValueVTs[Value];
- Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext());
+ Type *OrigArgTy = OrigArgTys[Value];
+ Type *ArgTy = OrigArgTy;
+ if (Args[i].Ty != Args[i].OrigTy) {
+ assert(Value == 0 && "Only supported for non-aggregate arguments");
+ ArgTy = Args[i].Ty;
+ }
+
+ EVT VT = getValueType(DL, ArgTy);
SDValue Op = SDValue(Args[i].Node.getNode(),
Args[i].Node.getResNo() + Value);
ISD::ArgFlagsTy Flags;
@@ -11125,10 +11143,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
if (i >= CLI.NumFixedArgs)
Flags.setVarArg();
- if (Args[i].Ty->isPointerTy()) {
+ if (ArgTy->isPointerTy()) {
Flags.setPointer();
- Flags.setPointerAddrSpace(
- cast<PointerType>(Args[i].Ty)->getAddressSpace());
+ Flags.setPointerAddrSpace(cast<PointerType>(ArgTy)->getAddressSpace());
}
if (Args[i].IsZExt)
Flags.setZExt();
@@ -11202,10 +11219,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
if (NeedsRegBlock)
Flags.setInConsecutiveRegs();
- MVT PartVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
- CLI.CallConv, VT);
- unsigned NumParts = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
- CLI.CallConv, VT);
+ MVT PartVT = getRegisterTypeForCallingConv(Context, CLI.CallConv, VT);
+ unsigned NumParts =
+ getNumRegistersForCallingConv(Context, CLI.CallConv, VT);
SmallVector<SDValue, 4> Parts(NumParts);
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
@@ -11222,7 +11238,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
(CLI.RetTy->isPointerTy() && Args[i].Ty->isPointerTy() &&
CLI.RetTy->getPointerAddressSpace() ==
Args[i].Ty->getPointerAddressSpace())) &&
- RetTys.size() == NumValues && "unexpected use of 'returned'");
+ RetVTs.size() == NumValues && "unexpected use of 'returned'");
// Before passing 'returned' to the target lowering code, ensure that
// either the register MVT and the actual EVT are the same size or that
// the return value and argument are extended in the same way; in these
@@ -11247,7 +11263,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// For scalable vectors the scalable part is currently handled
// by individual targets, so we just use the known minimum size here.
ISD::OutputArg MyFlags(
- Flags, Parts[j].getValueType().getSimpleVT(), VT, i,
+ Flags, Parts[j].getValueType().getSimpleVT(), VT, OrigArgTy, i,
j * Parts[j].getValueType().getStoreSize().getKnownMinValue());
if (NumParts > 1 && j == 0)
MyFlags.Flags.setSplit();
@@ -11303,7 +11319,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// hidden sret parameter.
MVT PtrVT = getPointerTy(DL, DL.getAllocaAddrSpace());
- unsigned NumValues = RetTys.size();
+ unsigned NumValues = RetVTs.size();
ReturnValues.resize(NumValues);
SmallVector<SDValue, 4> Chains(NumValues);
@@ -11316,7 +11332,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
DemoteStackSlot, CLI.DAG.getConstant(Offsets[i], CLI.DL, PtrVT),
CLI.DL, SDNodeFlags::NoUnsignedWrap);
SDValue L = CLI.DAG.getLoad(
- RetTys[i], CLI.DL, CLI.Chain, Add,
+ RetVTs[i], CLI.DL, CLI.Chain, Add,
MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(),
DemoteStackIdx, Offsets[i]),
HiddenSRetAlign);
@@ -11334,11 +11350,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
else if (CLI.RetZExt)
AssertOp = ISD::AssertZext;
unsigned CurReg = 0;
- for (EVT VT : RetTys) {
- MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
- CLI.CallConv, VT);
- unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
- CLI.CallConv, VT);
+ for (EVT VT : RetVTs) {
+ MVT RegisterVT = getRegisterTypeForCallingConv(Context, CLI.CallConv, VT);
+ unsigned NumRegs =
+ getNumRegistersForCallingConv(Context, CLI.CallConv, VT);
ReturnValues.push_back(getCopyFromParts(
CLI.DAG, CLI.DL, &InVals[CurReg], NumRegs, RegisterVT, VT, nullptr,
@@ -11354,7 +11369,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
}
SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL,
- CLI.DAG.getVTList(RetTys), ReturnValues);
+ CLI.DAG.getVTList(RetVTs), ReturnValues);
return std::make_pair(Res, CLI.Chain);
}
@@ -11625,7 +11640,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
ISD::ArgFlagsTy Flags;
Flags.setSRet();
MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVT);
- ISD::InputArg RetArg(Flags, RegisterVT, ValueVT, true,
+ ISD::InputArg RetArg(Flags, RegisterVT, ValueVT, F.getReturnType(), true,
ISD::InputArg::NoArgIndex, 0);
Ins.push_back(RetArg);
}
@@ -11640,8 +11655,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// Set up the incoming argument description vector.
for (const Argument &Arg : F.args()) {
unsigned ArgNo = Arg.getArgNo();
- SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
+ SmallVector<Type *, 4> Types;
+ ComputeValueTypes(DAG.getDataLayout(), Arg.getType(), Types);
bool isArgValueUsed = !Arg.use_empty();
unsigned PartBase = 0;
Type *FinalType = Arg.getType();
@@ -11649,17 +11664,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
FinalType = Arg.getParamByValType();
bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters(
FinalType, F.getCallingConv(), F.isVarArg(), DL);
- for (unsigned Value = 0, NumValues = ValueVTs.size();
- Value != NumValues; ++Value) {
- EVT VT = ValueVTs[Value];
- Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+ for (unsigned Value = 0, NumValues = Types.size(); Value != NumValues;
+ ++Value) {
+ Type *ArgTy = Types[Value];
+ EVT VT = TLI->getValueType(DL, ArgTy);
ISD::ArgFlagsTy Flags;
-
- if (Arg.getType()->isPointerTy()) {
+ if (ArgTy->isPointerTy()) {
Flags.setPointer();
- Flags.setPointerAddrSpace(
- cast<PointerType>(Arg.getType())->getAddressSpace());
+ Flags.setPointerAddrSpace(cast<PointerType>(ArgTy)->getAddressSpace());
}
if (Arg.hasAttribute(Attribute::ZExt))
Flags.setZExt();
@@ -11763,7 +11776,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// are responsible for handling scalable vector arguments and
// return values.
ISD::InputArg MyFlags(
- Flags, RegisterVT, VT, isArgValueUsed, ArgNo,
+ Flags, RegisterVT, VT, ArgTy, isArgValueUsed, ArgNo,
PartBase + i * RegisterVT.getStoreSize().getKnownMinValue());
if (NumRegs > 1 && i == 0)
MyFlags.Flags.setSplit();
@@ -12737,17 +12750,22 @@ static Register FollowCopyChain(MachineRegisterInfo &MRI, Register Reg) {
assert(MI->getOpcode() == TargetOpcode::COPY &&
"start of copy chain MUST be COPY");
Reg = MI->getOperand(1).getReg();
+
+ // The register copied by the first copy must be virtual.
+ assert(Reg.isVirtual() && "expected COPY of virtual register");
MI = MRI.def_begin(Reg)->getParent();
+
// There may be an optional second copy.
if (MI->getOpcode() == TargetOpcode::COPY) {
assert(Reg.isVirtual() && "expected COPY of virtual register");
Reg = MI->getOperand(1).getReg();
assert(Reg.isPhysical() && "expected COPY of physical register");
- MI = MRI.def_begin(Reg)->getParent();
+ } else {
+ // The start of the chain must be an INLINEASM_BR.
+ assert(MI->getOpcode() == TargetOpcode::INLINEASM_BR &&
+ "end of copy chain MUST be INLINEASM_BR");
}
- // The start of the chain must be an INLINEASM_BR.
- assert(MI->getOpcode() == TargetOpcode::INLINEASM_BR &&
- "end of copy chain MUST be INLINEASM_BR");
+
return Reg;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index c251755..e0835e6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -574,6 +574,7 @@ private:
void visitFPToSI(const User &I);
void visitUIToFP(const User &I);
void visitSIToFP(const User &I);
+ void visitPtrToAddr(const User &I);
void visitPtrToInt(const User &I);
void visitIntToPtr(const User &I);
void visitBitCast(const User &I);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 26071ed..ece50ed 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -480,10 +480,7 @@ void SelectionDAGISel::initializeAnalysisResults(
MachineModuleInfo &MMI =
MAMP.getCachedResult<MachineModuleAnalysis>(*Fn.getParent())->getMMI();
- TTI = &FAM.getResult<TargetIRAnalysis>(Fn);
-
- CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs,
- TTI->hasBranchDivergence(&Fn));
+ CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs);
// Now get the optional analyzes if we want to.
// This is based on the possibly changed OptLevel (after optnone is taken
@@ -501,6 +498,10 @@ void SelectionDAGISel::initializeAnalysisResults(
BatchAA = std::nullopt;
SP = &FAM.getResult<SSPLayoutAnalysis>(Fn);
+
+#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
+ TTI = &FAM.getResult<TargetIRAnalysis>(Fn);
+#endif
}
void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
@@ -536,10 +537,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
MachineModuleInfo &MMI =
MFP.getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
- TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
-
- CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs,
- TTI->hasBranchDivergence(&Fn));
+ CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs);
// Now get the optional analyzes if we want to.
// This is based on the possibly changed OptLevel (after optnone is taken
@@ -558,6 +556,10 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
BatchAA = std::nullopt;
SP = &MFP.getAnalysis<StackProtector>().getLayoutInfo();
+
+#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
+ TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
+#endif
}
bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 80aeefe..46a5e44 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -1258,7 +1258,7 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
if (Record.type == RecordType::Spill) {
unsigned Index = Record.payload.FI;
- SDValue SpillSlot = DAG.getTargetFrameIndex(Index, getFrameIndexTy());
+ SDValue SpillSlot = DAG.getFrameIndex(Index, getFrameIndexTy());
// All the reloads are independent and are reading memory only modified by
// statepoints (i.e. no other aliasing stores); informing SelectionDAG of
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e235d14..402a012 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -162,14 +162,17 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
TargetLowering::ArgListTy Args;
Args.reserve(Ops.size());
- TargetLowering::ArgListEntry Entry;
ArrayRef<Type *> OpsTypeOverrides = CallOptions.OpsTypeOverrides;
for (unsigned i = 0; i < Ops.size(); ++i) {
SDValue NewOp = Ops[i];
- Entry.Node = NewOp;
- Entry.Ty = i < OpsTypeOverrides.size() && OpsTypeOverrides[i]
+ Type *Ty = i < OpsTypeOverrides.size() && OpsTypeOverrides[i]
? OpsTypeOverrides[i]
- : Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+ : NewOp.getValueType().getTypeForEVT(*DAG.getContext());
+ TargetLowering::ArgListEntry Entry(NewOp, Ty);
+ if (CallOptions.IsSoften)
+ Entry.OrigTy =
+ CallOptions.OpsVTBeforeSoften[i].getTypeForEVT(*DAG.getContext());
+
Entry.IsSExt =
shouldSignExtendTypeInLibCall(Entry.Ty, CallOptions.IsSigned);
Entry.IsZExt = !Entry.IsSExt;
@@ -189,18 +192,21 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+ Type *OrigRetTy = RetTy;
TargetLowering::CallLoweringInfo CLI(DAG);
bool signExtend = shouldSignExtendTypeInLibCall(RetTy, CallOptions.IsSigned);
bool zeroExtend = !signExtend;
- if (CallOptions.IsSoften &&
- !shouldExtendTypeInLibCall(CallOptions.RetVTBeforeSoften)) {
- signExtend = zeroExtend = false;
+ if (CallOptions.IsSoften) {
+ OrigRetTy = CallOptions.RetVTBeforeSoften.getTypeForEVT(*DAG.getContext());
+ if (!shouldExtendTypeInLibCall(CallOptions.RetVTBeforeSoften))
+ signExtend = zeroExtend = false;
}
CLI.setDebugLoc(dl)
.setChain(InChain)
- .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setLibCallee(getLibcallCallingConv(LC), RetTy, OrigRetTy, Callee,
+ std::move(Args))
.setNoReturn(CallOptions.DoesNotReturn)
.setDiscardResult(!CallOptions.IsReturnValueUsed)
.setIsPostTypeLegalization(CallOptions.IsPostTypeLegalization)
@@ -420,7 +426,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
TargetLowering::MakeLibCallOptions CallOptions;
EVT OpsVT[2] = { OldLHS.getValueType(),
OldRHS.getValueType() };
- CallOptions.setTypeListBeforeSoften(OpsVT, RetVT, true);
+ CallOptions.setTypeListBeforeSoften(OpsVT, RetVT);
auto Call = makeLibCall(DAG, LC1, RetVT, Ops, CallOptions, dl, Chain);
NewLHS = Call.first;
NewRHS = DAG.getConstant(0, dl, RetVT);
@@ -5125,10 +5131,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
!ISD::isUnsignedIntSetCC(Cond))) &&
isTypeDesirableForOp(ISD::SETCC, N0.getOperand(0).getValueType())) {
EVT NewVT = N0.getOperand(0).getValueType();
- SDValue NewConst = DAG.getConstant(ISD::isSignedIntSetCC(Cond)
- ? C1.sext(NewVT.getSizeInBits())
- : C1.zext(NewVT.getSizeInBits()),
- dl, NewVT);
+ SDValue NewConst = DAG.getConstant(
+ (N0->getFlags().hasNoSignedWrap() && !ISD::isUnsignedIntSetCC(Cond))
+ ? C1.sext(NewVT.getSizeInBits())
+ : C1.zext(NewVT.getSizeInBits()),
+ dl, NewVT);
return DAG.getSetCC(dl, VT, N0.getOperand(0), NewConst, Cond);
}
@@ -10712,7 +10719,6 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
SDLoc dl(GA);
ArgListTy Args;
- ArgListEntry Entry;
const GlobalValue *GV =
cast<GlobalValue>(GA->getGlobal()->stripPointerCastsAndAliases());
SmallString<32> NameString("__emutls_v.");
@@ -10721,9 +10727,7 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
const GlobalVariable *EmuTlsVar =
GV->getParent()->getNamedGlobal(EmuTlsVarName);
assert(EmuTlsVar && "Cannot find EmuTlsVar ");
- Entry.Node = DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT);
- Entry.Ty = VoidPtrType;
- Args.push_back(Entry);
+ Args.emplace_back(DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT), VoidPtrType);
SDValue EmuTlsGetAddr = DAG.getExternalSymbol("__emutls_get_address", PtrVT);
diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
index 41e956c..938f2d7 100644
--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -967,8 +967,14 @@ bool ShrinkWrapImpl::run(MachineFunction &MF) {
<< "\nRestore: " << printMBBReference(*Restore) << '\n');
MachineFrameInfo &MFI = MF.getFrameInfo();
- MFI.setSavePoint(Save);
- MFI.setRestorePoint(Restore);
+ SmallVector<MachineBasicBlock *, 4> SavePoints;
+ SmallVector<MachineBasicBlock *, 4> RestorePoints;
+ if (Save) {
+ SavePoints.push_back(Save);
+ RestorePoints.push_back(Restore);
+ }
+ MFI.setSavePoints(SavePoints);
+ MFI.setRestorePoints(RestorePoints);
++NumCandidates;
return Changed;
}
diff --git a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
index decffdc..ff4b568 100644
--- a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
+++ b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
@@ -179,7 +179,7 @@ void SwiftErrorValueTracking::propagateVRegs() {
// Check whether we have a single vreg def from all predecessors.
// Otherwise we need a phi.
SmallVector<std::pair<MachineBasicBlock *, Register>, 4> VRegs;
- SmallSet<const MachineBasicBlock *, 8> Visited;
+ SmallPtrSet<const MachineBasicBlock *, 8> Visited;
for (auto *Pred : MBB->predecessors()) {
if (!Visited.insert(Pred).second)
continue;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index bf4c9f9..350948a 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1738,13 +1738,13 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType,
AttributeList attr,
SmallVectorImpl<ISD::OutputArg> &Outs,
const TargetLowering &TLI, const DataLayout &DL) {
- SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(TLI, DL, ReturnType, ValueVTs);
- unsigned NumValues = ValueVTs.size();
+ SmallVector<Type *, 4> Types;
+ ComputeValueTypes(DL, ReturnType, Types);
+ unsigned NumValues = Types.size();
if (NumValues == 0) return;
- for (unsigned j = 0, f = NumValues; j != f; ++j) {
- EVT VT = ValueVTs[j];
+ for (Type *Ty : Types) {
+ EVT VT = TLI.getValueType(DL, Ty);
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
if (attr.hasRetAttr(Attribute::SExt))
@@ -1772,7 +1772,7 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType,
Flags.setZExt();
for (unsigned i = 0; i < NumParts; ++i)
- Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, 0, 0));
+ Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, Ty, 0, 0));
}
}
@@ -1893,6 +1893,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
case SIToFP: return ISD::SINT_TO_FP;
case FPTrunc: return ISD::FP_ROUND;
case FPExt: return ISD::FP_EXTEND;
+ case PtrToAddr: return ISD::BITCAST;
case PtrToInt: return ISD::BITCAST;
case IntToPtr: return ISD::BITCAST;
case BitCast: return ISD::BITCAST;
@@ -1923,6 +1924,8 @@ int TargetLoweringBase::IntrinsicIDToISD(Intrinsic::ID ID) const {
return ISD::FEXP;
case Intrinsic::exp2:
return ISD::FEXP2;
+ case Intrinsic::log:
+ return ISD::FLOG;
default:
return ISD::DELETED_NODE;
}
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index 99ba893..972bd8f 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -99,7 +99,7 @@ unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) {
// Set preferred alignment if we are still able to realign the stack
auto &ST = MF->getSubtarget();
Align CurrentAlign = ST.getFrameLowering()->getStackAlign();
- if (Alignment > CurrentAlign && !ST.getRegisterInfo()->canRealignStack(*MF)) {
+ if (Alignment > CurrentAlign && !TRI->canRealignStack(*MF)) {
Alignment = CurrentAlign;
}
int SS = MF->getFrameInfo().CreateSpillStackObject(Size, Alignment);
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
index 80b4185..0df9137 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
@@ -275,7 +275,8 @@ void LVBinaryReader::mapVirtualAddress(const object::COFFObjectFile &COFFObj) {
}
Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple,
- StringRef TheFeatures) {
+ StringRef TheFeatures,
+ StringRef TheCPU) {
std::string TargetLookupError;
const Target *TheTarget =
TargetRegistry::lookupTarget(TheTriple, TargetLookupError);
@@ -298,9 +299,8 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple,
MAI.reset(AsmInfo);
// Target subtargets.
- StringRef CPU;
MCSubtargetInfo *SubtargetInfo(
- TheTarget->createMCSubtargetInfo(TheTriple, CPU, TheFeatures));
+ TheTarget->createMCSubtargetInfo(TheTriple, TheCPU, TheFeatures));
if (!SubtargetInfo)
return createStringError(errc::invalid_argument,
"no subtarget info for target " + TheTriple);
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp
index e589551..2ff7081 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp
@@ -1190,7 +1190,12 @@ Error LVCodeViewReader::loadTargetInfo(const ObjectFile &Obj) {
FeaturesValue = SubtargetFeatures();
}
FeaturesValue = *Features;
- return loadGenericTargetInfo(TT.str(), FeaturesValue.getString());
+
+ StringRef CPU;
+ if (auto OptCPU = Obj.tryGetCPUName())
+ CPU = *OptCPU;
+
+ return loadGenericTargetInfo(TT.str(), FeaturesValue.getString(), CPU);
}
Error LVCodeViewReader::loadTargetInfo(const PDBFile &Pdb) {
@@ -1200,8 +1205,9 @@ Error LVCodeViewReader::loadTargetInfo(const PDBFile &Pdb) {
TT.setOS(Triple::Win32);
StringRef TheFeature = "";
+ StringRef TheCPU = "";
- return loadGenericTargetInfo(TT.str(), TheFeature);
+ return loadGenericTargetInfo(TT.str(), TheFeature, TheCPU);
}
std::string LVCodeViewReader::getRegisterName(LVSmall Opcode,
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
index 696e2bc..62134df 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
@@ -956,10 +956,7 @@ LVElement *LVDWARFReader::getElementForOffset(LVOffset Offset,
Error LVDWARFReader::loadTargetInfo(const ObjectFile &Obj) {
// Detect the architecture from the object file. We usually don't need OS
// info to lookup a target and create register info.
- Triple TT;
- TT.setArch(Triple::ArchType(Obj.getArch()));
- TT.setVendor(Triple::UnknownVendor);
- TT.setOS(Triple::UnknownOS);
+ Triple TT = Obj.makeTriple();
// Features to be passed to target/subtarget
Expected<SubtargetFeatures> Features = Obj.getFeatures();
@@ -969,7 +966,12 @@ Error LVDWARFReader::loadTargetInfo(const ObjectFile &Obj) {
FeaturesValue = SubtargetFeatures();
}
FeaturesValue = *Features;
- return loadGenericTargetInfo(TT.str(), FeaturesValue.getString());
+
+ StringRef CPU;
+ if (auto OptCPU = Obj.tryGetCPUName())
+ CPU = *OptCPU;
+
+ return loadGenericTargetInfo(TT.str(), FeaturesValue.getString(), CPU);
}
void LVDWARFReader::mapRangeAddress(const ObjectFile &Obj) {
diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
index 1bafed7..ba27aa87 100644
--- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
@@ -64,7 +64,7 @@ public:
LLVM_DEBUG({
dbgs() << " Preserving debug section " << Sec.getName() << "\n";
});
- SmallSet<Block *, 8> PreservedBlocks;
+ SmallPtrSet<Block *, 8> PreservedBlocks;
for (auto *Sym : Sec.symbols()) {
bool NewPreservedBlock =
PreservedBlocks.insert(&Sym->getBlock()).second;
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp
index 8e4937d..91a3115 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp
@@ -169,7 +169,7 @@ Error EPCIndirectStubsManager::createStubs(const StubInitsMap &StubInits) {
std::vector<tpctypes::UInt64Write> PtrUpdates;
for (auto &SI : StubInits)
PtrUpdates.push_back({(*AvailableStubInfos)[ASIdx++].PointerAddress,
- static_cast<uint64_t>(SI.second.first.getValue())});
+ SI.second.first.getValue()});
return MemAccess.writeUInt64s(PtrUpdates);
}
default:
diff --git a/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp b/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp
index 19c000e..d460cf6 100644
--- a/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp
@@ -14,40 +14,39 @@
namespace llvm {
namespace orc {
-ThreadSafeModule cloneToContext(const ThreadSafeModule &TSM,
- ThreadSafeContext TSCtx,
- GVPredicate ShouldCloneDef,
- GVModifier UpdateClonedDefSource) {
- assert(TSM && "Can not clone null module");
-
- if (!ShouldCloneDef)
- ShouldCloneDef = [](const GlobalValue &) { return true; };
-
- // First copy the source module into a buffer.
+static std::pair<std::string, SmallVector<char, 1>>
+serializeModule(const Module &M, GVPredicate ShouldCloneDef,
+ GVModifier UpdateClonedDefSource) {
std::string ModuleName;
SmallVector<char, 1> ClonedModuleBuffer;
- TSM.withModuleDo([&](Module &M) {
- ModuleName = M.getModuleIdentifier();
- std::set<GlobalValue *> ClonedDefsInSrc;
- ValueToValueMapTy VMap;
- auto Tmp = CloneModule(M, VMap, [&](const GlobalValue *GV) {
- if (ShouldCloneDef(*GV)) {
- ClonedDefsInSrc.insert(const_cast<GlobalValue *>(GV));
- return true;
- }
- return false;
- });
-
- if (UpdateClonedDefSource)
- for (auto *GV : ClonedDefsInSrc)
- UpdateClonedDefSource(*GV);
-
- BitcodeWriter BCWriter(ClonedModuleBuffer);
- BCWriter.writeModule(*Tmp);
- BCWriter.writeSymtab();
- BCWriter.writeStrtab();
+
+ ModuleName = M.getModuleIdentifier();
+ std::set<GlobalValue *> ClonedDefsInSrc;
+ ValueToValueMapTy VMap;
+ auto Tmp = CloneModule(M, VMap, [&](const GlobalValue *GV) {
+ if (ShouldCloneDef(*GV)) {
+ ClonedDefsInSrc.insert(const_cast<GlobalValue *>(GV));
+ return true;
+ }
+ return false;
});
+ if (UpdateClonedDefSource)
+ for (auto *GV : ClonedDefsInSrc)
+ UpdateClonedDefSource(*GV);
+
+ BitcodeWriter BCWriter(ClonedModuleBuffer);
+ BCWriter.writeModule(*Tmp);
+ BCWriter.writeSymtab();
+ BCWriter.writeStrtab();
+
+ return {std::move(ModuleName), std::move(ClonedModuleBuffer)};
+}
+
+ThreadSafeModule
+deserializeModule(std::string ModuleName,
+ const SmallVector<char, 1> &ClonedModuleBuffer,
+ ThreadSafeContext TSCtx) {
MemoryBufferRef ClonedModuleBufferRef(
StringRef(ClonedModuleBuffer.data(), ClonedModuleBuffer.size()),
"cloned module buffer");
@@ -63,6 +62,40 @@ ThreadSafeModule cloneToContext(const ThreadSafeModule &TSM,
return ThreadSafeModule(std::move(M), std::move(TSCtx));
}
+ThreadSafeModule
+cloneExternalModuleToContext(const Module &M, ThreadSafeContext TSCtx,
+ GVPredicate ShouldCloneDef,
+ GVModifier UpdateClonedDefSource) {
+
+ if (!ShouldCloneDef)
+ ShouldCloneDef = [](const GlobalValue &) { return true; };
+
+ auto [ModuleName, ClonedModuleBuffer] = serializeModule(
+ M, std::move(ShouldCloneDef), std::move(UpdateClonedDefSource));
+
+ return deserializeModule(std::move(ModuleName), ClonedModuleBuffer,
+ std::move(TSCtx));
+}
+
+ThreadSafeModule cloneToContext(const ThreadSafeModule &TSM,
+ ThreadSafeContext TSCtx,
+ GVPredicate ShouldCloneDef,
+ GVModifier UpdateClonedDefSource) {
+ assert(TSM && "Can not clone null module");
+
+ if (!ShouldCloneDef)
+ ShouldCloneDef = [](const GlobalValue &) { return true; };
+
+ // First copy the source module into a buffer.
+ auto [ModuleName, ClonedModuleBuffer] = TSM.withModuleDo([&](Module &M) {
+ return serializeModule(M, std::move(ShouldCloneDef),
+ std::move(UpdateClonedDefSource));
+ });
+
+ return deserializeModule(std::move(ModuleName), ClonedModuleBuffer,
+ std::move(TSCtx));
+}
+
ThreadSafeModule cloneToNewContext(const ThreadSafeModule &TSM,
GVPredicate ShouldCloneDef,
GVModifier UpdateClonedDefSource) {
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 08d6c78..d626803 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -654,11 +654,10 @@ bool RuntimeDyldELF::resolveLoongArch64ShortBranch(
if (Loc == GlobalSymbolTable.end())
return false;
const auto &SymInfo = Loc->second;
- Address =
- uint64_t(Sections[SymInfo.getSectionID()].getLoadAddressWithOffset(
- SymInfo.getOffset()));
+ Address = Sections[SymInfo.getSectionID()].getLoadAddressWithOffset(
+ SymInfo.getOffset());
} else {
- Address = uint64_t(Sections[Value.SectionID].getLoadAddress());
+ Address = Sections[Value.SectionID].getLoadAddress();
}
uint64_t Offset = RelI->getOffset();
uint64_t SourceAddress = Sections[SectionID].getLoadAddressWithOffset(Offset);
diff --git a/llvm/lib/Frontend/HLSL/HLSLBinding.cpp b/llvm/lib/Frontend/HLSL/HLSLBinding.cpp
index d581311..4539146 100644
--- a/llvm/lib/Frontend/HLSL/HLSLBinding.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLBinding.cpp
@@ -76,7 +76,7 @@ BindingInfo BindingInfoBuilder::calculateBindingInfo(
// remove duplicates
Binding *NewEnd = llvm::unique(Bindings);
if (NewEnd != Bindings.end())
- Bindings.erase(NewEnd);
+ Bindings.erase(NewEnd, Bindings.end());
BindingInfo Info;
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
index 574883e..92c62b8 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Frontend/HLSL/HLSLRootSignature.h"
+#include "llvm/Support/DXILABI.h"
#include "llvm/Support/ScopedPrinter.h"
namespace llvm {
@@ -92,10 +93,9 @@ static raw_ostream &operator<<(raw_ostream &OS,
return OS;
}
-static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) {
- OS << enumToStringRef(dxil::ResourceClass(llvm::to_underlying(Type)),
- dxbc::getResourceClasses());
-
+static raw_ostream &operator<<(raw_ostream &OS,
+ const dxil::ResourceClass &Type) {
+ OS << dxil::getResourceClassName(Type);
return OS;
}
@@ -153,8 +153,7 @@ raw_ostream &operator<<(raw_ostream &OS, const DescriptorTableClause &Clause) {
}
raw_ostream &operator<<(raw_ostream &OS, const RootDescriptor &Descriptor) {
- ClauseType Type = ClauseType(llvm::to_underlying(Descriptor.Type));
- OS << "Root" << Type << "(" << Descriptor.Reg
+ OS << "Root" << Descriptor.Type << "(" << Descriptor.Reg
<< ", space = " << Descriptor.Space
<< ", visibility = " << Descriptor.Visibility
<< ", flags = " << Descriptor.Flags << ")";
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
index 1cda308..dece8f1 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
@@ -15,6 +15,7 @@
#include "llvm/Frontend/HLSL/RootSignatureValidations.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/Support/DXILABI.h"
#include "llvm/Support/ScopedPrinter.h"
using namespace llvm;
@@ -119,9 +120,7 @@ MDNode *MetadataBuilder::BuildRootConstants(const RootConstants &Constants) {
MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) {
IRBuilder<> Builder(Ctx);
- StringRef ResName =
- enumToStringRef(dxil::ResourceClass(to_underlying(Descriptor.Type)),
- dxbc::getResourceClasses());
+ StringRef ResName = dxil::getResourceClassName(Descriptor.Type);
assert(!ResName.empty() && "Provided an invalid Resource Class");
SmallString<7> Name({"Root", ResName});
Metadata *Operands[] = {
@@ -161,9 +160,7 @@ MDNode *MetadataBuilder::BuildDescriptorTable(const DescriptorTable &Table) {
MDNode *MetadataBuilder::BuildDescriptorTableClause(
const DescriptorTableClause &Clause) {
IRBuilder<> Builder(Ctx);
- StringRef ResName =
- enumToStringRef(dxil::ResourceClass(to_underlying(Clause.Type)),
- dxbc::getResourceClasses());
+ StringRef ResName = dxil::getResourceClassName(Clause.Type);
assert(!ResName.empty() && "Provided an invalid Resource Class");
Metadata *Operands[] = {
MDString::get(Ctx, ResName),
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
index 9d84aa8..72308a3d 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
@@ -29,7 +29,7 @@ bool verifyRegisterValue(uint32_t RegisterValue) {
// This Range is reserved, therefore invalid, according to the spec
// https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#all-the-values-should-be-legal
bool verifyRegisterSpace(uint32_t RegisterSpace) {
- return !(RegisterSpace >= 0xFFFFFFF0 && RegisterSpace <= 0xFFFFFFFF);
+ return !(RegisterSpace >= 0xFFFFFFF0);
}
bool verifyRootDescriptorFlag(uint32_t Version, uint32_t FlagsVal) {
diff --git a/llvm/lib/Frontend/OpenMP/OMP.cpp b/llvm/lib/Frontend/OpenMP/OMP.cpp
index 555e2a6..9e625b8 100644
--- a/llvm/lib/Frontend/OpenMP/OMP.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMP.cpp
@@ -190,7 +190,7 @@ bool isCombinedConstruct(Directive D) {
}
ArrayRef<unsigned> getOpenMPVersions() {
- static unsigned Versions[]{31, 40, 45, 50, 51, 52, 60};
+ static unsigned Versions[]{31, 40, 45, 50, 51, 52, 60, 61};
return Versions;
}
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index ea027e4..e9147a4 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -151,6 +151,18 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
}
#endif
+/// This is a wrapper over IRBuilderBase::restoreIP that, if the insert point
+/// is at the end of the block, also sets the current debug location from the
+/// last instruction in the specified basic block.
+static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder,
+ llvm::IRBuilderBase::InsertPoint IP) {
+ Builder.restoreIP(IP);
+ llvm::BasicBlock *BB = Builder.GetInsertBlock();
+ llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
+ if (!BB->empty() && I == BB->end())
+ Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
+}
+
static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
if (T.isAMDGPU()) {
StringRef Features =
@@ -5918,7 +5930,7 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
}
- SmallSet<BasicBlock *, 8> Reachable;
+ SmallPtrSet<BasicBlock *, 8> Reachable;
// Get the basic blocks from the loop in which memref instructions
// can be found.
@@ -7235,7 +7247,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
if (!AfterIP)
return AfterIP.takeError();
- Builder.restoreIP(*AfterIP);
+ restoreIPandDebugLoc(Builder, *AfterIP);
if (IfCond)
return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
@@ -8993,7 +9005,7 @@ Error OpenMPIRBuilder::emitOffloadingArrays(
ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
Info.RTArgs.SizesArray = Builder.CreateAlloca(
SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
- Builder.restoreIP(CodeGenIP);
+ restoreIPandDebugLoc(Builder, CodeGenIP);
} else {
auto *SizesArrayInit = ConstantArray::get(
ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
@@ -9012,7 +9024,7 @@ Error OpenMPIRBuilder::emitOffloadingArrays(
AllocaInst *Buffer = Builder.CreateAlloca(
SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
Buffer->setAlignment(OffloadSizeAlign);
- Builder.restoreIP(CodeGenIP);
+ restoreIPandDebugLoc(Builder, CodeGenIP);
Builder.CreateMemCpy(
Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
SizesArrayGbl, OffloadSizeAlign,
@@ -9022,7 +9034,7 @@ Error OpenMPIRBuilder::emitOffloadingArrays(
Info.RTArgs.SizesArray = Buffer;
}
- Builder.restoreIP(CodeGenIP);
+ restoreIPandDebugLoc(Builder, CodeGenIP);
}
// The map types are always constant so we don't need to generate code to
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 7159107..e200f36 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1311,14 +1311,15 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
}
break;
case 'l':
- if (Name.starts_with("lifetime.start") ||
- Name.starts_with("lifetime.end")) {
- // Unless remangling is required, do not upgrade the function declaration,
- // but do upgrade the calls.
- if (auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F))
- NewFn = *Result;
- else
- NewFn = F;
+ if ((Name.starts_with("lifetime.start") ||
+ Name.starts_with("lifetime.end")) &&
+ F->arg_size() == 2) {
+ Intrinsic::ID IID = Name.starts_with("lifetime.start")
+ ? Intrinsic::lifetime_start
+ : Intrinsic::lifetime_end;
+ rename(F);
+ NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID,
+ F->getArg(0)->getType());
return true;
}
break;
@@ -5133,21 +5134,20 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end: {
- Value *Size = CI->getArgOperand(0);
- Value *Ptr = CI->getArgOperand(1);
- if (isa<AllocaInst>(Ptr)) {
+ if (CI->arg_size() != 2) {
DefaultCase();
return;
}
+ Value *Ptr = CI->getArgOperand(1);
// Try to strip pointer casts, such that the lifetime works on an alloca.
Ptr = Ptr->stripPointerCasts();
if (isa<AllocaInst>(Ptr)) {
// Don't use NewFn, as we might have looked through an addrspacecast.
if (NewFn->getIntrinsicID() == Intrinsic::lifetime_start)
- NewCall = Builder.CreateLifetimeStart(Ptr, cast<ConstantInt>(Size));
+ NewCall = Builder.CreateLifetimeStart(Ptr);
else
- NewCall = Builder.CreateLifetimeEnd(Ptr, cast<ConstantInt>(Size));
+ NewCall = Builder.CreateLifetimeEnd(Ptr);
break;
}
@@ -5391,7 +5391,7 @@ void llvm::UpgradeNVVMAnnotations(Module &M) {
return;
SmallVector<MDNode *, 8> NewNodes;
- SmallSet<const MDNode *, 8> SeenNodes;
+ SmallPtrSet<const MDNode *, 8> SeenNodes;
for (MDNode *MD : NamedMD->operands()) {
if (!SeenNodes.insert(MD).second)
continue;
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index d4ad21e..6b202ba 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -254,6 +254,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
return FoldBitCast(V, DestTy);
case Instruction::AddrSpaceCast:
case Instruction::IntToPtr:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
return nullptr;
}
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index e09c139..b454c9a 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -829,6 +829,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp,
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::IntToPtr:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::AddrSpaceCast:
// Conservatively return getFull set.
@@ -871,7 +872,8 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const {
return ConstantRange(Lower.sext(DstTySize), Upper.sext(DstTySize));
}
-ConstantRange ConstantRange::truncate(uint32_t DstTySize) const {
+ConstantRange ConstantRange::truncate(uint32_t DstTySize,
+ unsigned NoWrapKind) const {
assert(getBitWidth() > DstTySize && "Not a value truncation");
if (isEmptySet())
return getEmpty(DstTySize);
@@ -885,22 +887,36 @@ ConstantRange ConstantRange::truncate(uint32_t DstTySize) const {
// We use the non-wrapped set code to analyze the [Lower, MaxValue) part, and
// then we do the union with [MaxValue, Upper)
if (isUpperWrapped()) {
- // If Upper is greater than or equal to MaxValue(DstTy), it covers the whole
- // truncated range.
- if (Upper.getActiveBits() > DstTySize || Upper.countr_one() == DstTySize)
+ // If Upper is greater than MaxValue(DstTy), it covers the whole truncated
+ // range.
+ if (Upper.getActiveBits() > DstTySize)
return getFull(DstTySize);
- Union = ConstantRange(APInt::getMaxValue(DstTySize),Upper.trunc(DstTySize));
- UpperDiv.setAllBits();
-
- // Union covers the MaxValue case, so return if the remaining range is just
- // MaxValue(DstTy).
- if (LowerDiv == UpperDiv)
- return Union;
+ // For nuw the two parts are: [0, Upper) \/ [Lower, MaxValue(DstTy)]
+ if (NoWrapKind & TruncInst::NoUnsignedWrap) {
+ Union = ConstantRange(APInt::getZero(DstTySize), Upper.trunc(DstTySize));
+ UpperDiv = APInt::getOneBitSet(getBitWidth(), DstTySize);
+ } else {
+ // If Upper is equal to MaxValue(DstTy), it covers the whole truncated
+ // range.
+ if (Upper.countr_one() == DstTySize)
+ return getFull(DstTySize);
+ Union =
+ ConstantRange(APInt::getMaxValue(DstTySize), Upper.trunc(DstTySize));
+ UpperDiv.setAllBits();
+ // Union covers the MaxValue case, so return if the remaining range is
+ // just MaxValue(DstTy).
+ if (LowerDiv == UpperDiv)
+ return Union;
+ }
}
// Chop off the most significant bits that are past the destination bitwidth.
if (LowerDiv.getActiveBits() > DstTySize) {
+ // For trunc nuw if LowerDiv is greater than MaxValue(DstTy), the range is
+ // outside the whole truncated range.
+ if (NoWrapKind & TruncInst::NoUnsignedWrap)
+ return Union;
+ // Mask to just the significant bits and subtract from LowerDiv/UpperDiv.
APInt Adjust = LowerDiv & APInt::getBitsSetFrom(getBitWidth(), DstTySize);
LowerDiv -= Adjust;
@@ -912,6 +928,10 @@ ConstantRange ConstantRange::truncate(uint32_t DstTySize) const {
return ConstantRange(LowerDiv.trunc(DstTySize),
UpperDiv.trunc(DstTySize)).unionWith(Union);
+ if (!LowerDiv.isZero() && NoWrapKind & TruncInst::NoUnsignedWrap)
+ return ConstantRange(LowerDiv.trunc(DstTySize), APInt::getZero(DstTySize))
+ .unionWith(Union);
+
// The truncated value wraps around. Check if we can do better than fullset.
if (UpperDivWidth == DstTySize + 1) {
// Clear the MSB so that UpperDiv wraps around.
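A hedged sketch of exercising the widened truncate interface, assuming the header declares the new NoWrapKind parameter with a default of 0 so existing callers are unaffected.

#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instructions.h"

// Plain truncation of [1000, 2000) to i8 wraps and yields a loose range, but
// under nuw every non-poison value must already fit in 8 bits, so the logic
// above can return a much tighter (here even empty) result.
llvm::ConstantRange truncNUWToI8(const llvm::ConstantRange &CR1000To2000) {
  return CR1000To2000.truncate(/*DstTySize=*/8,
                               llvm::TruncInst::NoUnsignedWrap);
}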
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index a3c725b..c7e3113a 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -1567,6 +1567,7 @@ Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty,
case Instruction::SIToFP:
case Instruction::FPToUI:
case Instruction::FPToSI:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
@@ -2223,6 +2224,8 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty,
llvm_unreachable("Invalid cast opcode");
case Instruction::Trunc:
return getTrunc(C, Ty, OnlyIfReduced);
+ case Instruction::PtrToAddr:
+ return getPtrToAddr(C, Ty, OnlyIfReduced);
case Instruction::PtrToInt:
return getPtrToInt(C, Ty, OnlyIfReduced);
case Instruction::IntToPtr:
@@ -2280,6 +2283,20 @@ Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) {
return getFoldedCast(Instruction::Trunc, C, Ty, OnlyIfReduced);
}
+Constant *ConstantExpr::getPtrToAddr(Constant *C, Type *DstTy,
+ bool OnlyIfReduced) {
+ assert(C->getType()->isPtrOrPtrVectorTy() &&
+ "PtrToAddr source must be pointer or pointer vector");
+ assert(DstTy->isIntOrIntVectorTy() &&
+ "PtrToAddr destination must be integer or integer vector");
+ assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy));
+ if (isa<VectorType>(C->getType()))
+ assert(cast<VectorType>(C->getType())->getElementCount() ==
+ cast<VectorType>(DstTy)->getElementCount() &&
+ "Invalid cast between a different number of vector elements");
+ return getFoldedCast(Instruction::PtrToAddr, C, DstTy, OnlyIfReduced);
+}
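A short sketch of building the new constant expression, assuming OnlyIfReduced defaults to false as it does for getPtrToInt; DL.getAddressType is the same helper the Verifier change later in this patch uses to pick the result width.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"

// Fold a global's address to an integer of the pointer's address width.
llvm::Constant *addressOfGlobal(llvm::GlobalVariable *GV,
                                const llvm::DataLayout &DL) {
  llvm::Type *AddrTy = DL.getAddressType(GV->getType());
  return llvm::ConstantExpr::getPtrToAddr(GV, AddrTy);
}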
+
Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy,
bool OnlyIfReduced) {
assert(C->getType()->isPtrOrPtrVectorTy() &&
@@ -2435,6 +2452,7 @@ bool ConstantExpr::isDesirableCastOp(unsigned Opcode) {
case Instruction::FPToSI:
return false;
case Instruction::Trunc:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
@@ -2457,6 +2475,7 @@ bool ConstantExpr::isSupportedCastOp(unsigned Opcode) {
case Instruction::FPToSI:
return false;
case Instruction::Trunc:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
@@ -3401,6 +3420,7 @@ Instruction *ConstantExpr::getAsInstruction() const {
switch (getOpcode()) {
case Instruction::Trunc:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index f7ef4aa..8b5965b 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -2186,6 +2186,11 @@ void LLVMGlobalSetMetadata(LLVMValueRef Global, unsigned Kind,
unwrap<GlobalObject>(Global)->setMetadata(Kind, unwrap<MDNode>(MD));
}
+void LLVMGlobalAddMetadata(LLVMValueRef Global, unsigned Kind,
+ LLVMMetadataRef MD) {
+ unwrap<GlobalObject>(Global)->addMetadata(Kind, *unwrap<MDNode>(MD));
+}
+
void LLVMGlobalEraseMetadata(LLVMValueRef Global, unsigned Kind) {
unwrap<GlobalObject>(Global)->eraseMetadata(Kind);
}
@@ -2194,6 +2199,11 @@ void LLVMGlobalClearMetadata(LLVMValueRef Global) {
unwrap<GlobalObject>(Global)->clearMetadata();
}
+void LLVMGlobalAddDebugInfo(LLVMValueRef Global, LLVMMetadataRef GVE) {
+ unwrap<GlobalVariable>(Global)->addDebugInfo(
+ unwrap<DIGlobalVariableExpression>(GVE));
+}
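A brief sketch of the new C API entry point from a client's perspective; the metadata kind name is arbitrary and the node is assumed to be one the caller already created.

#include "llvm-c/Core.h"

// Append (rather than replace) a metadata attachment on a global object.
void addGlobalAnnotation(LLVMValueRef Global, LLVMMetadataRef Node) {
  unsigned Kind = LLVMGetMDKindID("annotation", /*SLen=*/10);
  LLVMGlobalAddMetadata(Global, Kind, Node);
}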
+
/*--.. Operations on global variables ......................................--*/
LLVMValueRef LLVMAddGlobal(LLVMModuleRef M, LLVMTypeRef Ty, const char *Name) {
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index ab8ecee..8e523bc 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -1896,29 +1896,8 @@ AssignmentInstRange at::getAssignmentInsts(DIAssignID *ID) {
return make_range(MapIt->second.begin(), MapIt->second.end());
}
-AssignmentMarkerRange at::getAssignmentMarkers(DIAssignID *ID) {
- assert(ID && "Expected non-null ID");
- LLVMContext &Ctx = ID->getContext();
-
- auto *IDAsValue = MetadataAsValue::getIfExists(Ctx, ID);
-
- // The ID is only used wrapped in MetadataAsValue(ID), so lets check that
- // one of those already exists first.
- if (!IDAsValue)
- return make_range(Value::user_iterator(), Value::user_iterator());
-
- return make_range(IDAsValue->user_begin(), IDAsValue->user_end());
-}
-
void at::deleteAssignmentMarkers(const Instruction *Inst) {
- auto Range = getAssignmentMarkers(Inst);
- SmallVector<DbgVariableRecord *> DVRAssigns = getDVRAssignmentMarkers(Inst);
- if (Range.empty() && DVRAssigns.empty())
- return;
- SmallVector<DbgAssignIntrinsic *> ToDelete(Range.begin(), Range.end());
- for (auto *DAI : ToDelete)
- DAI->eraseFromParent();
- for (auto *DVR : DVRAssigns)
+ for (auto *DVR : getDVRAssignmentMarkers(Inst))
DVR->eraseFromParent();
}
@@ -1936,31 +1915,21 @@ void at::RAUW(DIAssignID *Old, DIAssignID *New) {
}
void at::deleteAll(Function *F) {
- SmallVector<DbgAssignIntrinsic *, 12> ToDelete;
- SmallVector<DbgVariableRecord *, 12> DPToDelete;
for (BasicBlock &BB : *F) {
for (Instruction &I : BB) {
- for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+ for (DbgVariableRecord &DVR :
+ make_early_inc_range(filterDbgVars(I.getDbgRecordRange())))
if (DVR.isDbgAssign())
- DPToDelete.push_back(&DVR);
- if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I))
- ToDelete.push_back(DAI);
- else
- I.setMetadata(LLVMContext::MD_DIAssignID, nullptr);
+ DVR.eraseFromParent();
+
+ I.setMetadata(LLVMContext::MD_DIAssignID, nullptr);
}
}
- for (auto *DAI : ToDelete)
- DAI->eraseFromParent();
- for (auto *DVR : DPToDelete)
- DVR->eraseFromParent();
}
-/// FIXME: Remove this wrapper function and call
-/// DIExpression::calculateFragmentIntersect directly.
-template <typename T>
-bool calculateFragmentIntersectImpl(
+bool at::calculateFragmentIntersect(
const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits,
- uint64_t SliceSizeInBits, const T *AssignRecord,
+ uint64_t SliceSizeInBits, const DbgVariableRecord *AssignRecord,
std::optional<DIExpression::FragmentInfo> &Result) {
// No overlap if this DbgRecord describes a killed location.
if (AssignRecord->isKillAddress())
@@ -1989,26 +1958,6 @@ bool calculateFragmentIntersectImpl(
BitExtractOffsetInBits, VarFrag, Result, OffsetFromLocationInBits);
}
-/// FIXME: Remove this wrapper function and call
-/// DIExpression::calculateFragmentIntersect directly.
-bool at::calculateFragmentIntersect(
- const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits,
- uint64_t SliceSizeInBits, const DbgAssignIntrinsic *DbgAssign,
- std::optional<DIExpression::FragmentInfo> &Result) {
- return calculateFragmentIntersectImpl(DL, Dest, SliceOffsetInBits,
- SliceSizeInBits, DbgAssign, Result);
-}
-
-/// FIXME: Remove this wrapper function and call
-/// DIExpression::calculateFragmentIntersect directly.
-bool at::calculateFragmentIntersect(
- const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits,
- uint64_t SliceSizeInBits, const DbgVariableRecord *DVRAssign,
- std::optional<DIExpression::FragmentInfo> &Result) {
- return calculateFragmentIntersectImpl(DL, Dest, SliceOffsetInBits,
- SliceSizeInBits, DVRAssign, Result);
-}
-
/// Update inlined instructions' DIAssignID metadata. We need to do this
/// otherwise a function inlined more than once into the same function
/// will cause DIAssignID to be shared by many instructions.
@@ -2029,8 +1978,6 @@ void at::remapAssignID(DenseMap<DIAssignID *, DIAssignID *> &Map,
}
if (auto *ID = I.getMetadata(LLVMContext::MD_DIAssignID))
I.setMetadata(LLVMContext::MD_DIAssignID, GetNewID(ID));
- else if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I))
- DAI->setAssignId(GetNewID(DAI->getAssignID()));
}
/// Collect constant properties (base, size, offset) of \p StoreDest.
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index f1d4549..96065ed 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -57,15 +57,9 @@ DebugVariable::DebugVariable(const DbgVariableRecord *DVR)
DILocation::DILocation(LLVMContext &C, StorageType Storage, unsigned Line,
unsigned Column, uint64_t AtomGroup, uint8_t AtomRank,
ArrayRef<Metadata *> MDs, bool ImplicitCode)
- : MDNode(C, DILocationKind, Storage, MDs)
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
- ,
- AtomGroup(AtomGroup), AtomRank(AtomRank)
-#endif
-{
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
+ : MDNode(C, DILocationKind, Storage, MDs), AtomGroup(AtomGroup),
+ AtomRank(AtomRank) {
assert(AtomRank <= 7 && "AtomRank number should fit in 3 bits");
-#endif
if (AtomGroup)
C.updateDILocationAtomGroupWaterline(AtomGroup + 1);
diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp
index 7b799c7..11d33e2 100644
--- a/llvm/lib/IR/Globals.cpp
+++ b/llvm/lib/IR/Globals.cpp
@@ -404,6 +404,7 @@ findBaseObject(const Constant *C, DenseSet<const GlobalAlias *> &Aliases,
return findBaseObject(CE->getOperand(0), Aliases, Op);
}
case Instruction::IntToPtr:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::BitCast:
case Instruction::GetElementPtr:
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 49c6dc7..614c3a9 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -411,28 +411,16 @@ CallInst *IRBuilderBase::CreateFPMinimumReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_fminimum, Src);
}
-CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
+CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr) {
assert(isa<PointerType>(Ptr->getType()) &&
"lifetime.start only applies to pointers.");
- if (!Size)
- Size = getInt64(-1);
- else
- assert(Size->getType() == getInt64Ty() &&
- "lifetime.start requires the size to be an i64");
- Value *Ops[] = { Size, Ptr };
- return CreateIntrinsic(Intrinsic::lifetime_start, {Ptr->getType()}, Ops);
+ return CreateIntrinsic(Intrinsic::lifetime_start, {Ptr->getType()}, {Ptr});
}
-CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) {
+CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr) {
assert(isa<PointerType>(Ptr->getType()) &&
"lifetime.end only applies to pointers.");
- if (!Size)
- Size = getInt64(-1);
- else
- assert(Size->getType() == getInt64Ty() &&
- "lifetime.end requires the size to be an i64");
- Value *Ops[] = { Size, Ptr };
- return CreateIntrinsic(Intrinsic::lifetime_end, {Ptr->getType()}, Ops);
+ return CreateIntrinsic(Intrinsic::lifetime_end, {Ptr->getType()}, {Ptr});
}
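A minimal sketch of the updated builder API: the size operand is gone, so callers pass only the pointer (names below are illustrative).

#include "llvm/IR/IRBuilder.h"

// Bracket an alloca's live range with the size-less lifetime markers.
void emitScopedSlot(llvm::IRBuilder<> &B, llvm::Type *Ty) {
  llvm::AllocaInst *Slot = B.CreateAlloca(Ty, /*ArraySize=*/nullptr, "slot");
  B.CreateLifetimeStart(Slot); // llvm.lifetime.start(ptr %slot)
  // ... code that uses %slot ...
  B.CreateLifetimeEnd(Slot);   // llvm.lifetime.end(ptr %slot)
}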
CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) {
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index b7cd12a..5e87b5f 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -26,9 +26,18 @@
#include "llvm/IR/Operator.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
using namespace llvm;
+// FIXME: Flag used for an ablation performance test, Issue #147390. Placing it
+// here because referencing IR should be feasible from anywhere. Will be
+// removed after the ablation test.
+cl::opt<bool> ProfcheckDisableMetadataFixes(
+ "profcheck-disable-metadata-fixes", cl::Hidden, cl::init(false),
+ cl::desc(
+ "Disable metadata propagation fixes discovered through Issue #147390"));
+
InsertPosition::InsertPosition(Instruction *InsertBefore)
: InsertAt(InsertBefore ? InsertBefore->getIterator()
: InstListType::iterator()) {}
@@ -543,14 +552,19 @@ void Instruction::dropUBImplyingAttrsAndUnknownMetadata(
CB->removeRetAttrs(UBImplyingAttributes);
}
-void Instruction::dropUBImplyingAttrsAndMetadata() {
+void Instruction::dropUBImplyingAttrsAndMetadata(ArrayRef<unsigned> Keep) {
// !annotation metadata does not impact semantics.
// !range, !nonnull and !align produce poison, so they are safe to speculate.
// !noundef and various AA metadata must be dropped, as it generally produces
// immediate undefined behavior.
- unsigned KnownIDs[] = {LLVMContext::MD_annotation, LLVMContext::MD_range,
- LLVMContext::MD_nonnull, LLVMContext::MD_align};
- dropUBImplyingAttrsAndUnknownMetadata(KnownIDs);
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_annotation, LLVMContext::MD_range,
+ LLVMContext::MD_nonnull, LLVMContext::MD_align};
+ SmallVector<unsigned> KeepIDs;
+ KeepIDs.reserve(Keep.size() + std::size(KnownIDs));
+ append_range(KeepIDs, KnownIDs);
+ append_range(KeepIDs, Keep);
+ dropUBImplyingAttrsAndUnknownMetadata(KeepIDs);
}
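One plausible use of the new Keep list, sketched under the assumption that a pass wants profile metadata to survive speculation; it is not taken from the patch.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"

// Drop UB-implying attributes/metadata before hoisting, but keep !prof so
// branch-weight information is preserved alongside the defaults above.
void prepareForSpeculation(llvm::Instruction &I) {
  I.dropUBImplyingAttrsAndMetadata({llvm::LLVMContext::MD_prof});
}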
bool Instruction::hasUBImplyingAttrs() const {
@@ -817,6 +831,7 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
case UIToFP: return "uitofp";
case SIToFP: return "sitofp";
case IntToPtr: return "inttoptr";
+ case PtrToAddr: return "ptrtoaddr";
case PtrToInt: return "ptrtoint";
case BitCast: return "bitcast";
case AddrSpaceCast: return "addrspacecast";
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index b896382..a1751c0 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2798,6 +2798,7 @@ bool CastInst::isNoopCast(Instruction::CastOps Opcode,
return false;
case Instruction::BitCast:
return true; // BitCast never modifies bits.
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
return DL.getIntPtrType(SrcTy)->getScalarSizeInBits() ==
DestTy->getScalarSizeInBits();
@@ -2855,26 +2856,29 @@ unsigned CastInst::isEliminableCastPair(
// same reason.
const unsigned numCastOps =
Instruction::CastOpsEnd - Instruction::CastOpsBegin;
+ // clang-format off
static const uint8_t CastResults[numCastOps][numCastOps] = {
- // T F F U S F F P I B A -+
- // R Z S P P I I T P 2 N T S |
- // U E E 2 2 2 2 R E I T C C +- secondOp
- // N X X U S F F N X N 2 V V |
- // C T T I I P P C T T P T T -+
- { 1, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // Trunc -+
- { 8, 1, 9,99,99, 2,17,99,99,99, 2, 3, 0}, // ZExt |
- { 8, 0, 1,99,99, 0, 2,99,99,99, 0, 3, 0}, // SExt |
- { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToUI |
- { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToSI |
- { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // UIToFP +- firstOp
- { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // SIToFP |
- { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // FPTrunc |
- { 99,99,99, 2, 2,99,99, 8, 2,99,99, 4, 0}, // FPExt |
- { 1, 0, 0,99,99, 0, 0,99,99,99, 7, 3, 0}, // PtrToInt |
- { 99,99,99,99,99,99,99,99,99,11,99,15, 0}, // IntToPtr |
- { 5, 5, 5, 0, 0, 5, 5, 0, 0,16, 5, 1,14}, // BitCast |
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+
+ // T F F U S F F P P I B A -+
+ // R Z S P P I I T P 2 2 N T S |
+ // U E E 2 2 2 2 R E I A T C C +- secondOp
+ // N X X U S F F N X N D 2 V V |
+ // C T T I I P P C T T R P T T -+
+ { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // Trunc -+
+ { 8, 1, 9,99,99, 2,17,99,99,99,99, 2, 3, 0}, // ZExt |
+ { 8, 0, 1,99,99, 0, 2,99,99,99,99, 0, 3, 0}, // SExt |
+ { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToUI |
+ { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToSI |
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // UIToFP +- firstOp
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // SIToFP |
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // FPTrunc |
+ { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt |
+ { 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt |
+ { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr |
+ { 99,99,99,99,99,99,99,99,99,11,99,99,15, 0}, // IntToPtr |
+ { 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast |
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+
};
+ // clang-format on
// TODO: This logic could be encoded into the table above and handled in the
// switch below.
@@ -3046,6 +3050,7 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
case SIToFP: return new SIToFPInst (S, Ty, Name, InsertBefore);
case FPToUI: return new FPToUIInst (S, Ty, Name, InsertBefore);
case FPToSI: return new FPToSIInst (S, Ty, Name, InsertBefore);
+ case PtrToAddr: return new PtrToAddrInst (S, Ty, Name, InsertBefore);
case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore);
case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore);
case BitCast:
@@ -3347,6 +3352,7 @@ CastInst::castIsValid(Instruction::CastOps op, Type *SrcTy, Type *DstTy) {
case Instruction::FPToSI:
return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy() &&
SrcEC == DstEC;
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
if (SrcEC != DstEC)
return false;
@@ -3460,6 +3466,12 @@ PtrToIntInst::PtrToIntInst(Value *S, Type *Ty, const Twine &Name,
assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
}
+PtrToAddrInst::PtrToAddrInst(Value *S, Type *Ty, const Twine &Name,
+ InsertPosition InsertBefore)
+ : CastInst(Ty, PtrToAddr, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToAddr");
+}
+
IntToPtrInst::IntToPtrInst(Value *S, Type *Ty, const Twine &Name,
InsertPosition InsertBefore)
: CastInst(Ty, IntToPtr, S, Name, InsertBefore) {
@@ -4427,6 +4439,10 @@ PtrToIntInst *PtrToIntInst::cloneImpl() const {
return new PtrToIntInst(getOperand(0), getType());
}
+PtrToAddrInst *PtrToAddrInst::cloneImpl() const {
+ return new PtrToAddrInst(getOperand(0), getType());
+}
+
IntToPtrInst *IntToPtrInst::cloneImpl() const {
return new IntToPtrInst(getOperand(0), getType());
}
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index aa2a60e..e03f993 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -312,10 +312,8 @@ template <> struct MDNodeKeyImpl<MDTuple> : MDNodeOpsKey {
template <> struct MDNodeKeyImpl<DILocation> {
Metadata *Scope;
Metadata *InlinedAt;
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
uint64_t AtomGroup : 61;
uint64_t AtomRank : 3;
-#endif
unsigned Line;
uint16_t Column;
bool ImplicitCode;
@@ -323,36 +321,24 @@ template <> struct MDNodeKeyImpl<DILocation> {
MDNodeKeyImpl(unsigned Line, uint16_t Column, Metadata *Scope,
Metadata *InlinedAt, bool ImplicitCode, uint64_t AtomGroup,
uint8_t AtomRank)
- : Scope(Scope), InlinedAt(InlinedAt),
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
- AtomGroup(AtomGroup), AtomRank(AtomRank),
-#endif
- Line(Line), Column(Column), ImplicitCode(ImplicitCode) {
- }
+ : Scope(Scope), InlinedAt(InlinedAt), AtomGroup(AtomGroup),
+ AtomRank(AtomRank), Line(Line), Column(Column),
+ ImplicitCode(ImplicitCode) {}
MDNodeKeyImpl(const DILocation *L)
: Scope(L->getRawScope()), InlinedAt(L->getRawInlinedAt()),
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
AtomGroup(L->getAtomGroup()), AtomRank(L->getAtomRank()),
-#endif
Line(L->getLine()), Column(L->getColumn()),
- ImplicitCode(L->isImplicitCode()) {
- }
+ ImplicitCode(L->isImplicitCode()) {}
bool isKeyOf(const DILocation *RHS) const {
return Line == RHS->getLine() && Column == RHS->getColumn() &&
Scope == RHS->getRawScope() && InlinedAt == RHS->getRawInlinedAt() &&
- ImplicitCode == RHS->isImplicitCode()
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
- && AtomGroup == RHS->getAtomGroup() &&
- AtomRank == RHS->getAtomRank();
-#else
- ;
-#endif
+ ImplicitCode == RHS->isImplicitCode() &&
+ AtomGroup == RHS->getAtomGroup() && AtomRank == RHS->getAtomRank();
}
unsigned getHashValue() const {
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
// Hashing AtomGroup and AtomRank substantially impacts performance whether
// Key Instructions is enabled or not. We can't detect whether it's enabled
// here cheaply; avoiding hashing zero values is a good approximation. This
@@ -363,7 +349,6 @@ template <> struct MDNodeKeyImpl<DILocation> {
if (AtomGroup || AtomRank)
return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode,
AtomGroup, (uint8_t)AtomRank);
-#endif
return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode);
}
};
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index b1b5f67..d24263f 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -270,6 +270,18 @@ void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights,
I.setMetadata(LLVMContext::MD_prof, BranchWeights);
}
+SmallVector<uint32_t> downscaleWeights(ArrayRef<uint64_t> Weights,
+ std::optional<uint64_t> KnownMaxCount) {
+ uint64_t MaxCount = KnownMaxCount.has_value() ? KnownMaxCount.value()
+ : *llvm::max_element(Weights);
+ assert(MaxCount > 0 && "Bad max count");
+ uint64_t Scale = calculateCountScale(MaxCount);
+ SmallVector<uint32_t> DownscaledWeights;
+ for (const auto &ECI : Weights)
+ DownscaledWeights.push_back(scaleBranchCount(ECI, Scale));
+ return DownscaledWeights;
+}
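A small sketch of the new helper, assuming KnownMaxCount has no default argument and is therefore passed explicitly.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/ProfDataUtils.h"
#include <optional>

// Scale raw 64-bit profile counts down to the 32-bit weights !prof expects.
llvm::SmallVector<uint32_t> toBranchWeights(llvm::ArrayRef<uint64_t> Counts) {
  return llvm::downscaleWeights(Counts, /*KnownMaxCount=*/std::nullopt);
}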
+
void scaleProfData(Instruction &I, uint64_t S, uint64_t T) {
assert(T != 0 && "Caller should guarantee");
auto *ProfileData = I.getMetadata(LLVMContext::MD_prof);
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index a8e6c79..3c324f2 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -9,6 +9,8 @@
#include "llvm/IR/RuntimeLibcalls.h"
#include "llvm/ADT/StringTable.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/xxhash.h"
+#include "llvm/TargetParser/ARMTargetParser.h"
#define DEBUG_TYPE "runtime-libcalls-info"
@@ -17,51 +19,11 @@ using namespace RTLIB;
#define GET_INIT_RUNTIME_LIBCALL_NAMES
#define GET_SET_TARGET_RUNTIME_LIBCALL_SETS
+#define DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME
#include "llvm/IR/RuntimeLibcalls.inc"
#undef GET_INIT_RUNTIME_LIBCALL_NAMES
#undef GET_SET_TARGET_RUNTIME_LIBCALL_SETS
-
-static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT,
- FloatABI::ABIType FloatABIType,
- EABI EABIVersion) {
- static const RTLIB::LibcallImpl AAPCS_Libcalls[] = {
- RTLIB::__aeabi_dadd, RTLIB::__aeabi_ddiv,
- RTLIB::__aeabi_dmul, RTLIB::__aeabi_dsub,
- RTLIB::__aeabi_dcmpeq__oeq, RTLIB::__aeabi_dcmpeq__une,
- RTLIB::__aeabi_dcmplt, RTLIB::__aeabi_dcmple,
- RTLIB::__aeabi_dcmpge, RTLIB::__aeabi_dcmpgt,
- RTLIB::__aeabi_dcmpun, RTLIB::__aeabi_fadd,
- RTLIB::__aeabi_fdiv, RTLIB::__aeabi_fmul,
- RTLIB::__aeabi_fsub, RTLIB::__aeabi_fcmpeq__oeq,
- RTLIB::__aeabi_fcmpeq__une, RTLIB::__aeabi_fcmplt,
- RTLIB::__aeabi_fcmple, RTLIB::__aeabi_fcmpge,
- RTLIB::__aeabi_fcmpgt, RTLIB::__aeabi_fcmpun,
- RTLIB::__aeabi_d2iz, RTLIB::__aeabi_d2uiz,
- RTLIB::__aeabi_d2lz, RTLIB::__aeabi_d2ulz,
- RTLIB::__aeabi_f2iz, RTLIB::__aeabi_f2uiz,
- RTLIB::__aeabi_f2lz, RTLIB::__aeabi_f2ulz,
- RTLIB::__aeabi_d2f, RTLIB::__aeabi_d2h,
- RTLIB::__aeabi_f2d, RTLIB::__aeabi_i2d,
- RTLIB::__aeabi_ui2d, RTLIB::__aeabi_l2d,
- RTLIB::__aeabi_ul2d, RTLIB::__aeabi_i2f,
- RTLIB::__aeabi_ui2f, RTLIB::__aeabi_l2f,
- RTLIB::__aeabi_ul2f, RTLIB::__aeabi_lmul,
- RTLIB::__aeabi_llsl, RTLIB::__aeabi_llsr,
- RTLIB::__aeabi_lasr, RTLIB::__aeabi_idiv,
- RTLIB::__aeabi_idivmod, RTLIB::__aeabi_uidivmod,
- RTLIB::__aeabi_ldivmod, RTLIB::__aeabi_uidiv,
- RTLIB::__aeabi_uldivmod, RTLIB::__aeabi_f2h,
- RTLIB::__aeabi_d2h, RTLIB::__aeabi_h2f,
- RTLIB::__aeabi_memcpy, RTLIB::__aeabi_memmove,
- RTLIB::__aeabi_memset, RTLIB::__aeabi_memcpy4,
- RTLIB::__aeabi_memcpy8, RTLIB::__aeabi_memmove4,
- RTLIB::__aeabi_memmove8, RTLIB::__aeabi_memset4,
- RTLIB::__aeabi_memset8, RTLIB::__aeabi_memclr,
- RTLIB::__aeabi_memclr4, RTLIB::__aeabi_memclr8};
-
- for (RTLIB::LibcallImpl Impl : AAPCS_Libcalls)
- Info.setLibcallImplCallingConv(Impl, CallingConv::ARM_AAPCS);
-}
+#undef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME
/// Set default libcall names. If a target wants to opt-out of a libcall it
/// should be placed here.
@@ -69,65 +31,51 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
ExceptionHandling ExceptionModel,
FloatABI::ABIType FloatABI,
EABI EABIVersion, StringRef ABIName) {
- setTargetRuntimeLibcallSets(TT, FloatABI);
-
- if (ExceptionModel == ExceptionHandling::SjLj)
- setLibcallImpl(RTLIB::UNWIND_RESUME, RTLIB::_Unwind_SjLj_Resume);
+ setTargetRuntimeLibcallSets(TT, ExceptionModel, FloatABI, EABIVersion,
+ ABIName);
if (TT.isARM() || TT.isThumb()) {
- setARMLibcallNames(*this, TT, FloatABI, EABIVersion);
- return;
- }
+ // The half <-> float conversion functions are always soft-float on
+ // non-watchos platforms, but are needed for some targets which use a
+ // hard-float calling convention by default.
+ if (!TT.isWatchABI()) {
+ if (isAAPCS_ABI(TT, ABIName)) {
+ setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_AAPCS);
+ setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_AAPCS);
+ setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_AAPCS);
+ } else {
+ setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_APCS);
+ setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_APCS);
+ setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_APCS);
+ }
+ }
- if (TT.getArch() == Triple::ArchType::msp430) {
- setLibcallImplCallingConv(RTLIB::__mspabi_mpyll,
- CallingConv::MSP430_BUILTIN);
+ return;
}
}
-RTLIB::LibcallImpl
-RuntimeLibcallsInfo::getSupportedLibcallImpl(StringRef FuncName) const {
- const ArrayRef<uint16_t> RuntimeLibcallNameOffsets(
- RuntimeLibcallNameOffsetTable);
-
- iterator_range<ArrayRef<uint16_t>::const_iterator> Range =
- getRecognizedLibcallImpls(FuncName);
-
- for (auto I = Range.begin(); I != Range.end(); ++I) {
- RTLIB::LibcallImpl Impl =
- static_cast<RTLIB::LibcallImpl>(I - RuntimeLibcallNameOffsets.begin());
-
- // FIXME: This should not depend on looking up ImplToLibcall, only the list
- // of libcalls for the module.
- RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]];
- if (Recognized != RTLIB::Unsupported)
- return Recognized;
+LLVM_ATTRIBUTE_ALWAYS_INLINE
+iota_range<RTLIB::LibcallImpl>
+RuntimeLibcallsInfo::libcallImplNameHit(uint16_t NameOffsetEntry,
+ uint16_t StrOffset) {
+ int NumAliases = 1;
+ for (uint16_t Entry : ArrayRef(RuntimeLibcallNameOffsetTable)
+ .drop_front(NameOffsetEntry + 1)) {
+ if (Entry != StrOffset)
+ break;
+ ++NumAliases;
}
- return RTLIB::Unsupported;
+ RTLIB::LibcallImpl ImplStart = static_cast<RTLIB::LibcallImpl>(
+ &RuntimeLibcallNameOffsetTable[NameOffsetEntry] -
+ &RuntimeLibcallNameOffsetTable[0]);
+ return enum_seq(ImplStart,
+ static_cast<RTLIB::LibcallImpl>(ImplStart + NumAliases));
}
-iterator_range<ArrayRef<uint16_t>::const_iterator>
-RuntimeLibcallsInfo::getRecognizedLibcallImpls(StringRef FuncName) {
- StringTable::Iterator It = lower_bound(RuntimeLibcallImplNameTable, FuncName);
- if (It == RuntimeLibcallImplNameTable.end() || *It != FuncName)
- return iterator_range(ArrayRef<uint16_t>());
-
- uint16_t IndexVal = It.offset().value();
- const ArrayRef<uint16_t> TableRef(RuntimeLibcallNameOffsetTable);
-
- ArrayRef<uint16_t>::const_iterator E = TableRef.end();
- ArrayRef<uint16_t>::const_iterator EntriesBegin =
- std::lower_bound(TableRef.begin(), E, IndexVal);
- ArrayRef<uint16_t>::const_iterator EntriesEnd = EntriesBegin;
-
- while (EntriesEnd != E && *EntriesEnd == IndexVal)
- ++EntriesEnd;
-
- assert(EntriesBegin != E &&
- "libcall found in name table but not offset table");
-
- return make_range(EntriesBegin, EntriesEnd);
+bool RuntimeLibcallsInfo::isAAPCS_ABI(const Triple &TT, StringRef ABIName) {
+ const ARM::ARMABI TargetABI = ARM::computeTargetABI(TT, ABIName);
+ return TargetABI == ARM::ARM_ABI_AAPCS || TargetABI == ARM::ARM_ABI_AAPCS16;
}
bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) {
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index 129ca4a..5928c89 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -747,34 +747,28 @@ const Value *Value::stripAndAccumulateConstantOffsets(
// means when we construct GEPOffset, we need to use the size
// of GEP's pointer type rather than the size of the original
// pointer type.
- unsigned CurBitWidth = DL.getIndexTypeSizeInBits(V->getType());
- if (CurBitWidth == BitWidth) {
- if (!GEP->accumulateConstantOffset(DL, Offset, ExternalAnalysis))
- return V;
- } else {
- APInt GEPOffset(CurBitWidth, 0);
- if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis))
- return V;
+ APInt GEPOffset(DL.getIndexTypeSizeInBits(V->getType()), 0);
+ if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis))
+ return V;
- // Stop traversal if the pointer offset wouldn't fit in the bit-width
- // provided by the Offset argument. This can happen due to AddrSpaceCast
- // stripping.
- if (GEPOffset.getSignificantBits() > BitWidth)
- return V;
+ // Stop traversal if the pointer offset wouldn't fit in the bit-width
+ // provided by the Offset argument. This can happen due to AddrSpaceCast
+ // stripping.
+ if (GEPOffset.getSignificantBits() > BitWidth)
+ return V;
- // External Analysis can return a result higher/lower than the value
- // represents. We need to detect overflow/underflow.
- APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth);
- if (!ExternalAnalysis) {
- Offset += GEPOffsetST;
- } else {
- bool Overflow = false;
- APInt OldOffset = Offset;
- Offset = Offset.sadd_ov(GEPOffsetST, Overflow);
- if (Overflow) {
- Offset = OldOffset;
- return V;
- }
+ // External Analysis can return a result higher/lower than the value
+ // represents. We need to detect overflow/underflow.
+ APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth);
+ if (!ExternalAnalysis) {
+ Offset += GEPOffsetST;
+ } else {
+ bool Overflow = false;
+ APInt OldOffset = Offset;
+ Offset = Offset.sadd_ov(GEPOffsetST, Overflow);
+ if (Overflow) {
+ Offset = OldOffset;
+ return V;
}
}
V = GEP->getPointerOperand();
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index ca3f148..9d9b51d 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -566,6 +566,8 @@ private:
void visitUIToFPInst(UIToFPInst &I);
void visitSIToFPInst(SIToFPInst &I);
void visitIntToPtrInst(IntToPtrInst &I);
+ void checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V);
+ void visitPtrToAddrInst(PtrToAddrInst &I);
void visitPtrToIntInst(PtrToIntInst &I);
void visitBitCastInst(BitCastInst &I);
void visitAddrSpaceCastInst(AddrSpaceCastInst &I);
@@ -834,6 +836,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
&GV);
Check(GV.getInitializer()->getType()->isSized(),
"Global variable initializer must be sized", &GV);
+ visitConstantExprsRecursively(GV.getInitializer());
// If the global has common linkage, it must have a zero initializer and
// cannot be constant.
if (GV.hasCommonLinkage()) {
@@ -2610,6 +2613,8 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) {
Check(CastInst::castIsValid(Instruction::BitCast, CE->getOperand(0),
CE->getType()),
"Invalid bitcast", CE);
+ else if (CE->getOpcode() == Instruction::PtrToAddr)
+ checkPtrToAddr(CE->getOperand(0)->getType(), CE->getType(), *CE);
}
void Verifier::visitConstantPtrAuth(const ConstantPtrAuth *CPA) {
@@ -3532,6 +3537,28 @@ void Verifier::visitFPToSIInst(FPToSIInst &I) {
visitInstruction(I);
}
+void Verifier::checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V) {
+ Check(SrcTy->isPtrOrPtrVectorTy(), "PtrToAddr source must be pointer", V);
+ Check(DestTy->isIntOrIntVectorTy(), "PtrToAddr result must be integral", V);
+ Check(SrcTy->isVectorTy() == DestTy->isVectorTy(), "PtrToAddr type mismatch",
+ V);
+
+ if (SrcTy->isVectorTy()) {
+ auto *VSrc = cast<VectorType>(SrcTy);
+ auto *VDest = cast<VectorType>(DestTy);
+ Check(VSrc->getElementCount() == VDest->getElementCount(),
+ "PtrToAddr vector length mismatch", V);
+ }
+
+ Type *AddrTy = DL.getAddressType(SrcTy);
+ Check(AddrTy == DestTy, "PtrToAddr result must be address width", V);
+}
+
+void Verifier::visitPtrToAddrInst(PtrToAddrInst &I) {
+ checkPtrToAddr(I.getOperand(0)->getType(), I.getType(), I);
+ visitInstruction(I);
+}
+
void Verifier::visitPtrToIntInst(PtrToIntInst &I) {
// Get the source and destination types
Type *SrcTy = I.getOperand(0)->getType();
@@ -3547,7 +3574,7 @@ void Verifier::visitPtrToIntInst(PtrToIntInst &I) {
auto *VSrc = cast<VectorType>(SrcTy);
auto *VDest = cast<VectorType>(DestTy);
Check(VSrc->getElementCount() == VDest->getElementCount(),
- "PtrToInt Vector width mismatch", &I);
+ "PtrToInt Vector length mismatch", &I);
}
visitInstruction(I);
@@ -3567,7 +3594,7 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) {
auto *VSrc = cast<VectorType>(SrcTy);
auto *VDest = cast<VectorType>(DestTy);
Check(VSrc->getElementCount() == VDest->getElementCount(),
- "IntToPtr Vector width mismatch", &I);
+ "IntToPtr Vector length mismatch", &I);
}
visitInstruction(I);
}
@@ -4609,7 +4636,7 @@ void Verifier::visitEHPadPredecessors(Instruction &I) {
}
// The edge may exit from zero or more nested pads.
- SmallSet<Value *, 8> Seen;
+ SmallPtrSet<Value *, 8> Seen;
for (;; FromPad = getParentPad(FromPad)) {
Check(FromPad != ToPad,
"EH pad cannot handle exceptions raised within it", FromPad, TI);
@@ -4737,7 +4764,7 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) {
User *FirstUser = nullptr;
Value *FirstUnwindPad = nullptr;
SmallVector<FuncletPadInst *, 8> Worklist({&FPI});
- SmallSet<FuncletPadInst *, 8> Seen;
+ SmallPtrSet<FuncletPadInst *, 8> Seen;
while (!Worklist.empty()) {
FuncletPadInst *CurrentPad = Worklist.pop_back_val();
@@ -6612,6 +6639,36 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"Value for inactive lanes must be a VGPR function argument", &Call);
break;
}
+ case Intrinsic::amdgcn_call_whole_wave: {
+ auto F = dyn_cast<Function>(Call.getArgOperand(0));
+ Check(F, "Indirect whole wave calls are not allowed", &Call);
+
+ CallingConv::ID CC = F->getCallingConv();
+ Check(CC == CallingConv::AMDGPU_Gfx_WholeWave,
+ "Callee must have the amdgpu_gfx_whole_wave calling convention",
+ &Call);
+
+ Check(!F->isVarArg(), "Variadic whole wave calls are not allowed", &Call);
+
+ Check(Call.arg_size() == F->arg_size(),
+ "Call argument count must match callee argument count", &Call);
+
+ // The first argument of the call is the callee, and the first argument of
+ // the callee is the active mask. The rest of the arguments must match.
+ Check(F->arg_begin()->getType()->isIntegerTy(1),
+ "Callee must have i1 as its first argument", &Call);
+ for (auto [CallArg, FuncArg] :
+ drop_begin(zip_equal(Call.args(), F->args()))) {
+ Check(CallArg->getType() == FuncArg.getType(),
+ "Argument types must match", &Call);
+
+ // Check that inreg attributes match between call site and function
+ Check(Call.paramHasAttr(FuncArg.getArgNo(), Attribute::InReg) ==
+ FuncArg.hasInRegAttr(),
+ "Argument inreg attributes must match", &Call);
+ }
+ break;
+ }
case Intrinsic::amdgcn_s_prefetch_data: {
Check(
AMDGPU::isFlatGlobalAddrSpace(
@@ -6770,7 +6827,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
}
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end: {
- Value *Ptr = Call.getArgOperand(1);
+ Value *Ptr = Call.getArgOperand(0);
Check(isa<AllocaInst>(Ptr) || isa<PoisonValue>(Ptr),
"llvm.lifetime.start/end can only be used on alloca or poison",
&Call);
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 0323b4d..35d24c1 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1422,7 +1422,7 @@ SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) {
for (RTLIB::LibcallImpl Impl : LibcallImpls) {
if (Impl != RTLIB::Unsupported)
- LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl));
+ LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl).data());
}
return LibcallSymbols;
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index a466ce5..d6c15de 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -1133,8 +1133,11 @@ void IRLinker::linkNamedMDNodes() {
NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName());
// Add Src elements into Dest node.
- for (const MDNode *Op : NMD.operands())
- DestNMD->addOperand(Mapper.mapMDNode(*Op));
+ for (const MDNode *Op : NMD.operands()) {
+ MDNode *MD = Mapper.mapMDNode(*Op);
+ if (!is_contained(DestNMD->operands(), MD))
+ DestNMD->addOperand(MD);
+ }
}
}
diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt
index 18a85b3..1e1d0a6 100644
--- a/llvm/lib/MC/CMakeLists.txt
+++ b/llvm/lib/MC/CMakeLists.txt
@@ -45,6 +45,7 @@ add_llvm_component_library(LLVMMC
MCSection.cpp
MCSectionMachO.cpp
MCStreamer.cpp
+ MCSFrame.cpp
MCSPIRVStreamer.cpp
MCSubtargetInfo.cpp
MCSymbol.cpp
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index 8f3814a..759d3e0 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -541,12 +541,12 @@ void ELFWriter::computeSymbolTable(const RevGroupMapTy &RevGroupMap) {
if (Symbol.isAbsolute()) {
MSD.SectionIndex = ELF::SHN_ABS;
} else if (Symbol.isCommon()) {
- if (Symbol.isTargetCommon()) {
- MSD.SectionIndex = Symbol.getIndex();
- } else {
+ auto Shndx = Symbol.getIndex();
+ if (!Shndx) {
assert(!Local);
- MSD.SectionIndex = ELF::SHN_COMMON;
+ Shndx = ELF::SHN_COMMON;
}
+ MSD.SectionIndex = Shndx;
} else if (Symbol.isUndefined()) {
if (Symbol.isSignature() && !Symbol.isUsedInReloc()) {
MSD.SectionIndex = RevGroupMap.lookup(&Symbol);
diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp
index 3b629cd..d68f4af 100644
--- a/llvm/lib/MC/GOFFObjectWriter.cpp
+++ b/llvm/lib/MC/GOFFObjectWriter.cpp
@@ -17,7 +17,6 @@
#include "llvm/MC/MCSectionGOFF.h"
#include "llvm/MC/MCSymbolGOFF.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ConvertEBCDIC.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
diff --git a/llvm/lib/MC/MCAsmInfoGOFF.cpp b/llvm/lib/MC/MCAsmInfoGOFF.cpp
index 0a5d1927..092736b 100644
--- a/llvm/lib/MC/MCAsmInfoGOFF.cpp
+++ b/llvm/lib/MC/MCAsmInfoGOFF.cpp
@@ -62,6 +62,8 @@ static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode,
OS << ',';
OS << "RMODE(";
switch (Rmode) {
+ case GOFF::ESD_RMODE_None:
+ llvm_unreachable("Unexpected ESD_RMODE_None");
case GOFF::ESD_RMODE_24:
OS << "24";
break;
@@ -71,8 +73,6 @@ static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode,
case GOFF::ESD_RMODE_64:
OS << "64";
break;
- case GOFF::ESD_RMODE_None:
- break;
}
OS << ')';
}
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 9a5e070..89e541a 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -28,7 +28,6 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index 5e364e9..1d211a1 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -42,7 +42,6 @@
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/SectionKind.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index 275e76e..2881d7c 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -29,7 +29,6 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include <cassert>
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index 6cbdf74..21da79b 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -68,6 +68,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
OS << "\n Fixup @" << F.getOffset() << " Value:";
F.getValue()->print(OS, nullptr);
OS << " Kind:" << F.getKind();
+ if (F.isLinkerRelaxable())
+ OS << " LinkerRelaxable";
}
};
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index 4ac73ab..d505ac6 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -10,6 +10,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/SFrame.h"
#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -23,7 +24,6 @@
#include "llvm/MC/MCSectionSPIRV.h"
#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCSectionXCOFF.h"
-#include "llvm/Support/Casting.h"
#include "llvm/TargetParser/Triple.h"
using namespace llvm;
@@ -380,6 +380,19 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
unsigned EHSectionType = T.getArch() == Triple::x86_64
? ELF::SHT_X86_64_UNWIND
: ELF::SHT_PROGBITS;
+ switch (T.getArch()) {
+ case Triple::x86_64:
+ SFrameABIArch = sframe::ABI::AMD64EndianLittle;
+ break;
+ case Triple::aarch64:
+ SFrameABIArch = sframe::ABI::AArch64EndianLittle;
+ break;
+ case Triple::aarch64_be:
+ SFrameABIArch = sframe::ABI::AArch64EndianBig;
+ break;
+ default:
+ break;
+ }
// Solaris requires different flags for .eh_frame to seemingly every other
// platform.
@@ -537,6 +550,9 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
EHFrameSection =
Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags);
+ SFrameSection =
+ Ctx->getELFSection(".sframe", ELF::SHT_GNU_SFRAME, ELF::SHF_ALLOC);
+
CallGraphSection = Ctx->getELFSection(".callgraph", ELF::SHT_PROGBITS, 0);
StackSizesSection = Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, 0);
@@ -1064,6 +1080,7 @@ void MCObjectFileInfo::initMCObjectFileInfo(MCContext &MCCtx, bool PIC,
CompactUnwindDwarfEHFrameOnly = 0;
EHFrameSection = nullptr; // Created on demand.
+ SFrameSection = nullptr; // Created on demand.
CompactUnwindSection = nullptr; // Used only by selected targets.
DwarfAccelNamesSection = nullptr; // Used only by selected targets.
DwarfAccelObjCSection = nullptr; // Used only by selected targets.
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index d0c6144..59265bc 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -17,6 +17,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSFrame.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
@@ -30,7 +31,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context,
: MCStreamer(Context),
Assembler(std::make_unique<MCAssembler>(
Context, std::move(TAB), std::move(Emitter), std::move(OW))),
- EmitEHFrame(true), EmitDebugFrame(false) {
+ EmitEHFrame(true), EmitDebugFrame(false), EmitSFrame(false) {
assert(Assembler->getBackendPtr() && Assembler->getEmitterPtr());
IsObj = true;
setAllowAutoPadding(Assembler->getBackend().allowAutoPadding());
@@ -186,6 +187,10 @@ void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) {
if (EmitDebugFrame)
MCDwarfFrameEmitter::Emit(*this, MAB, false);
+
+ if (EmitSFrame || (getContext().getTargetOptions() &&
+ getContext().getTargetOptions()->EmitSFrameUnwind))
+ MCSFrameEmitter::emit(*this);
}
void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) {
@@ -461,11 +466,23 @@ void MCObjectStreamer::emitInstToFragment(const MCInst &Inst,
getAssembler().getEmitter().encodeInstruction(Inst, Data, Fixups, STI);
F->Kind = MCFragment::FT_Relaxable;
- F->STI = &STI;
- F->HasInstructions = true;
+ F->setHasInstructions(STI);
+
F->setVarContents(Data);
- F->setVarFixups(Fixups);
F->setInst(Inst);
+
+ bool MarkedLinkerRelaxable = false;
+ for (auto &Fixup : Fixups) {
+ if (!Fixup.isLinkerRelaxable() || MarkedLinkerRelaxable)
+ continue;
+ MarkedLinkerRelaxable = true;
+ auto *Sec = F->getParent();
+ if (!Sec->isLinkerRelaxable())
+ Sec->setFirstLinkerRelaxable(F->getLayoutOrder());
+ F->setLinkerRelaxable();
+ }
+ F->setVarFixups(Fixups);
+
newFragment();
}
diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp
index 229b0b8..1bb617b 100644
--- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp
+++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp
@@ -18,7 +18,6 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolCOFF.h"
#include "llvm/MC/SectionKind.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/SMLoc.h"
#include <cstdint>
#include <utility>
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index 6782c4b..513f3b3 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -22,7 +22,6 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/SectionKind.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
#include <cstdint>
diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp
index 6c2d241..ddfe1e1 100644
--- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp
@@ -26,7 +26,6 @@
#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolWasm.h"
-#include "llvm/Support/Casting.h"
#include <optional>
using namespace llvm;
diff --git a/llvm/lib/MC/MCSFrame.cpp b/llvm/lib/MC/MCSFrame.cpp
new file mode 100644
index 0000000..447f22e
--- /dev/null
+++ b/llvm/lib/MC/MCSFrame.cpp
@@ -0,0 +1,98 @@
+//===- lib/MC/MCSFrame.cpp - MCSFrame implementation ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSFrame.h"
+#include "llvm/BinaryFormat/SFrame.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/EndianStream.h"
+
+using namespace llvm;
+using namespace sframe;
+
+namespace {
+
+// Emitting these field by field, instead of constructing the actual
+// structures, lets the streamer do target endianness fixups for free.
+
+class SFrameEmitterImpl {
+ MCObjectStreamer &Streamer;
+ ABI SFrameABI;
+ MCSymbol *FDESubSectionStart;
+ MCSymbol *FRESubSectionStart;
+ MCSymbol *FRESubSectionEnd;
+
+public:
+ SFrameEmitterImpl(MCObjectStreamer &Streamer) : Streamer(Streamer) {
+ assert(Streamer.getContext()
+ .getObjectFileInfo()
+ ->getSFrameABIArch()
+ .has_value());
+ SFrameABI = *Streamer.getContext().getObjectFileInfo()->getSFrameABIArch();
+ FDESubSectionStart = Streamer.getContext().createTempSymbol();
+ FRESubSectionStart = Streamer.getContext().createTempSymbol();
+ FRESubSectionEnd = Streamer.getContext().createTempSymbol();
+ }
+
+ void emitPreamble() {
+ Streamer.emitInt16(Magic);
+ Streamer.emitInt8(static_cast<uint8_t>(Version::V2));
+ Streamer.emitInt8(0);
+ }
+
+ void emitHeader() {
+ emitPreamble();
+ // sfh_abi_arch
+ Streamer.emitInt8(static_cast<uint8_t>(SFrameABI));
+ // sfh_cfa_fixed_fp_offset
+ Streamer.emitInt8(0);
+ // sfh_cfa_fixed_ra_offset
+ Streamer.emitInt8(0);
+ // sfh_auxhdr_len
+ Streamer.emitInt8(0);
+ // sfh_num_fdes
+ Streamer.emitInt32(0);
+ // sfh_num_fres
+ Streamer.emitInt32(0);
+ // sfh_fre_len
+ Streamer.emitAbsoluteSymbolDiff(FRESubSectionEnd, FRESubSectionStart,
+ sizeof(int32_t));
+ // sfh_fdeoff. With no sfh_auxhdr, the FDEs immediately follow this header.
+ Streamer.emitInt32(0);
+ // sfh_freoff
+ Streamer.emitAbsoluteSymbolDiff(FRESubSectionStart, FDESubSectionStart,
+ sizeof(uint32_t));
+ }
+
+ void emitFDEs() { Streamer.emitLabel(FDESubSectionStart); }
+
+ void emitFREs() {
+ Streamer.emitLabel(FRESubSectionStart);
+ Streamer.emitLabel(FRESubSectionEnd);
+ }
+};
+
+} // end anonymous namespace
+
+void MCSFrameEmitter::emit(MCObjectStreamer &Streamer) {
+ MCContext &Context = Streamer.getContext();
+ SFrameEmitterImpl Emitter(Streamer);
+
+ MCSection *Section = Context.getObjectFileInfo()->getSFrameSection();
+ // Not strictly necessary, but gas always aligns to 8, so match that.
+ Section->ensureMinAlignment(Align(8));
+ Streamer.switchSection(Section);
+ MCSymbol *SectionStart = Context.createTempSymbol();
+ Streamer.emitLabel(SectionStart);
+ Emitter.emitHeader();
+ Emitter.emitFDEs();
+ Emitter.emitFREs();
+}
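The header written above is fixed-size: a 4-byte preamble followed by 24 bytes of header fields, with the two symbol differences resolved at layout time. A minimal sketch of the corresponding byte layout, with illustrative field names (the streamer, not a struct, handles target endianness in the patch itself):

#include <cstdint>

// Illustrative layout of the bytes emitted by emitHeader() above (28 bytes on
// typical ABIs). Field names approximate the SFrame spec; the values in
// parentheses reflect what this initial patch emits.
struct SFrameHeaderSketch {
  // Preamble.
  uint16_t Magic;          // emitInt16(Magic)
  uint8_t Version;         // Version::V2
  uint8_t Flags;           // 0
  // Header body.
  uint8_t ABIArch;         // sfh_abi_arch
  int8_t CFAFixedFPOffset; // sfh_cfa_fixed_fp_offset (0)
  int8_t CFAFixedRAOffset; // sfh_cfa_fixed_ra_offset (0)
  uint8_t AuxHdrLen;       // sfh_auxhdr_len (0)
  uint32_t NumFDEs;        // sfh_num_fdes (0)
  uint32_t NumFREs;        // sfh_num_fres (0)
  uint32_t FRELen;         // sfh_fre_len (FRE end - FRE start)
  uint32_t FDEOff;         // sfh_fdeoff (0: FDEs follow the header)
  uint32_t FREOff;         // sfh_freoff (FRE start - FDE start)
};
static_assert(sizeof(SFrameHeaderSketch) == 28, "matches the emitted size");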
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 9ed6fd1..a668e79 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -39,6 +39,8 @@ LLVM_DUMP_METHOD void MCSection::dump(
raw_ostream &OS = errs();
OS << "MCSection Name:" << getName();
+ if (isLinkerRelaxable())
+ OS << " FirstLinkerRelaxable:" << firstLinkerRelaxable();
for (auto &F : *this) {
OS << '\n';
F.dump();
diff --git a/llvm/lib/MC/MCSymbol.cpp b/llvm/lib/MC/MCSymbol.cpp
index 8192896..b868738 100644
--- a/llvm/lib/MC/MCSymbol.cpp
+++ b/llvm/lib/MC/MCSymbol.cpp
@@ -20,6 +20,10 @@
using namespace llvm;
+// There are numerous MCSymbol objects, so keeping sizeof(MCSymbol) small is
+// crucial for minimizing peak memory usage.
+static_assert(sizeof(MCSymbol) <= 24, "Keep the base symbol small");
+
// Only the address of this fragment is ever actually used.
static MCFragment SentinelFragment;
@@ -44,13 +48,12 @@ void *MCSymbol::operator new(size_t s, const MCSymbolTableEntry *Name,
}
void MCSymbol::setVariableValue(const MCExpr *Value) {
- assert(Value && "Invalid variable value!");
- assert((SymbolContents == SymContentsUnset ||
- SymbolContents == SymContentsVariable) &&
- "Cannot give common/offset symbol a variable value");
+ assert(Value && "Invalid equated expression");
+ assert((kind == Kind::Regular || kind == Kind::Equated) &&
+ "Cannot equate a common symbol");
this->Value = Value;
- SymbolContents = SymContentsVariable;
- setUndefined();
+ kind = Kind::Equated;
+ Fragment = nullptr;
}
void MCSymbol::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp
index 9c8b224..070b3d9 100644
--- a/llvm/lib/MC/MCWasmStreamer.cpp
+++ b/llvm/lib/MC/MCWasmStreamer.cpp
@@ -22,7 +22,6 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
namespace llvm {
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index a45936b..2e632de 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -30,7 +30,6 @@
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWinCOFFObjectWriter.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index a0e3dba..684e05a 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -21,7 +21,6 @@
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/MC/MCXCOFFObjectWriter.h"
#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/Casting.h"
using namespace llvm;
diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp
index 13917ba..fce6b2a 100644
--- a/llvm/lib/MC/XCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/XCOFFObjectWriter.cpp
@@ -20,7 +20,6 @@
#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCXCOFFObjectWriter.h"
#include "llvm/MC/StringTableBuilder.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
diff --git a/llvm/lib/MCA/Instruction.cpp b/llvm/lib/MCA/Instruction.cpp
index d4adfce..7966708 100644
--- a/llvm/lib/MCA/Instruction.cpp
+++ b/llvm/lib/MCA/Instruction.cpp
@@ -128,6 +128,13 @@ void WriteState::dump() const {
}
#endif
+#ifndef NDEBUG
+void ReadState::dump() const {
+ dbgs() << "{ OpIdx=" << RD->OpIndex << ", RegID " << getRegisterID()
+ << ", Cycles Left=" << CyclesLeft << " }";
+}
+#endif
+
const CriticalDependency &Instruction::computeCriticalRegDep() {
if (CriticalRegDep.Cycles)
return CriticalRegDep;
diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp
index 0f19495..0043f02 100644
--- a/llvm/lib/Object/IRSymtab.cpp
+++ b/llvm/lib/Object/IRSymtab.cpp
@@ -46,7 +46,7 @@ static cl::opt<bool> DisableBitcodeVersionUpgrade(
"disable-bitcode-version-upgrade", cl::Hidden,
cl::desc("Disable automatic bitcode upgrade for version mismatch"));
-static const char *PreservedSymbols[] = {
+static constexpr StringLiteral PreservedSymbols[] = {
// These are global variables, so put them here instead of in
// RuntimeLibcalls.td.
// TODO: Are there similar such variables?
@@ -54,6 +54,10 @@ static const char *PreservedSymbols[] = {
"__stack_chk_guard",
};
+static bool isPreservedGlobalVarName(StringRef Name) {
+ return PreservedSymbols[0] == Name || PreservedSymbols[1] == Name;
+}
+
namespace {
const char *getExpectedProducerName() {
@@ -81,12 +85,16 @@ struct Builder {
// The StringTableBuilder does not create a copy of any strings added to it,
// so this provides somewhere to store any strings that we create.
Builder(SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder,
- BumpPtrAllocator &Alloc)
- : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc) {}
+ BumpPtrAllocator &Alloc, const Triple &TT)
+ : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc), TT(TT),
+ Libcalls(TT) {}
DenseMap<const Comdat *, int> ComdatMap;
Mangler Mang;
- Triple TT;
+ const Triple &TT;
+
+ // FIXME: This shouldn't be here.
+ RTLIB::RuntimeLibcallsInfo Libcalls;
std::vector<storage::Comdat> Comdats;
std::vector<storage::Module> Mods;
@@ -98,6 +106,10 @@ struct Builder {
std::vector<storage::Str> DependentLibraries;
+ bool isPreservedLibFuncName(StringRef Name) {
+ return Libcalls.getSupportedLibcallImpl(Name) != RTLIB::Unsupported;
+ }
+
void setStr(storage::Str &S, StringRef Value) {
S.Offset = StrtabBuilder.add(Value);
S.Size = Value.size();
@@ -213,19 +225,6 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) {
return P.first->second;
}
-static StringSet<> buildPreservedSymbolsSet(const Triple &TT) {
- StringSet<> PreservedSymbolSet;
- PreservedSymbolSet.insert(std::begin(PreservedSymbols),
- std::end(PreservedSymbols));
- // FIXME: Do we need to pass in ABI fields from TargetOptions?
- RTLIB::RuntimeLibcallsInfo Libcalls(TT);
- for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) {
- if (Impl != RTLIB::Unsupported)
- PreservedSymbolSet.insert(Libcalls.getLibcallImplName(Impl));
- }
- return PreservedSymbolSet;
-}
-
Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
const SmallPtrSet<GlobalValue *, 4> &Used,
ModuleSymbolTable::Symbol Msym) {
@@ -279,13 +278,11 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
return Error::success();
}
- setStr(Sym.IRName, GV->getName());
-
- static const StringSet<> PreservedSymbolsSet =
- buildPreservedSymbolsSet(GV->getParent()->getTargetTriple());
- bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName());
+ StringRef GVName = GV->getName();
+ setStr(Sym.IRName, GVName);
- if (Used.count(GV) || IsPreservedSymbol)
+ if (Used.count(GV) || isPreservedLibFuncName(GVName) ||
+ isPreservedGlobalVarName(GVName))
Sym.Flags |= 1 << storage::Symbol::FB_used;
if (GV->isThreadLocal())
Sym.Flags |= 1 << storage::Symbol::FB_tls;
@@ -352,7 +349,6 @@ Error Builder::build(ArrayRef<Module *> IRMods) {
setStr(Hdr.Producer, kExpectedProducerName);
setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple().str());
setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName());
- TT = IRMods[0]->getTargetTriple();
for (auto *M : IRMods)
if (Error Err = addModule(M))
@@ -378,7 +374,8 @@ Error Builder::build(ArrayRef<Module *> IRMods) {
Error irsymtab::build(ArrayRef<Module *> Mods, SmallVector<char, 0> &Symtab,
StringTableBuilder &StrtabBuilder,
BumpPtrAllocator &Alloc) {
- return Builder(Symtab, StrtabBuilder, Alloc).build(Mods);
+ const Triple &TT = Mods[0]->getTargetTriple();
+ return Builder(Symtab, StrtabBuilder, Alloc, TT).build(Mods);
}
// Upgrade a vector of bitcode modules created by an old version of LLVM by
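The two new helpers replace the lazily built StringSet of preserved names with direct queries. A minimal sketch of the combined predicate, using only calls that appear in this patch (the include path is an assumption, not taken from the patch):

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/RuntimeLibcalls.h" // assumed location of RuntimeLibcallsInfo

// Sketch: a name is preserved if it is one of the hard-coded globals or a
// libcall implementation supported for this target.
static bool isPreservedNameSketch(const llvm::RTLIB::RuntimeLibcallsInfo &Libcalls,
                                  llvm::StringRef Name) {
  if (Name == "__stack_chk_guard")
    return true;
  return Libcalls.getSupportedLibcallImpl(Name) != llvm::RTLIB::Unsupported;
}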
diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp
index 5863490..759b579 100644
--- a/llvm/lib/Object/SFrameParser.cpp
+++ b/llvm/lib/Object/SFrameParser.cpp
@@ -32,14 +32,25 @@ getDataSlice(ArrayRef<uint8_t> Data, uint64_t Offset, uint64_t Size) {
}
template <typename T>
-static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data,
- uint64_t Offset) {
+static Expected<ArrayRef<T>>
+getDataSliceAsArrayOf(ArrayRef<uint8_t> Data, uint64_t Offset, uint64_t Count) {
static_assert(std::is_trivial_v<T>);
- Expected<ArrayRef<uint8_t>> Slice = getDataSlice(Data, Offset, sizeof(T));
+ Expected<ArrayRef<uint8_t>> Slice =
+ getDataSlice(Data, Offset, sizeof(T) * Count);
if (!Slice)
return Slice.takeError();
- return *reinterpret_cast<const T *>(Slice->data());
+ return ArrayRef(reinterpret_cast<const T *>(Slice->data()), Count);
+}
+
+template <typename T>
+static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data,
+ uint64_t Offset) {
+ Expected<ArrayRef<T>> Array = getDataSliceAsArrayOf<T>(Data, Offset, 1);
+ if (!Array)
+ return Array.takeError();
+
+ return Array->front();
}
template <endianness E>
@@ -87,17 +98,134 @@ uint64_t SFrameParser<E>::getAbsoluteStartAddress(
uint64_t Result = SectionAddress + FDE->StartAddress;
if ((getPreamble().Flags.value() & sframe::Flags::FDEFuncStartPCRel) ==
- sframe::Flags::FDEFuncStartPCRel) {
- uintptr_t DataPtr = reinterpret_cast<uintptr_t>(Data.data());
- uintptr_t FDEPtr = reinterpret_cast<uintptr_t>(&*FDE);
+ sframe::Flags::FDEFuncStartPCRel)
+ Result += offsetOf(FDE);
+
+ return Result;
+}
- assert(DataPtr <= FDEPtr && FDEPtr < DataPtr + Data.size() &&
- "Iterator does not belong to this object!");
+template <endianness E>
+uint64_t SFrameParser<E>::offsetOf(typename FDERange::iterator FDE) const {
+ uintptr_t DataPtr = reinterpret_cast<uintptr_t>(Data.data());
+ uintptr_t FDEPtr = reinterpret_cast<uintptr_t>(&*FDE);
+
+ assert(DataPtr <= FDEPtr && FDEPtr < DataPtr + Data.size() &&
+ "Iterator does not belong to this object!");
+ return FDEPtr - DataPtr;
+}
- Result += FDEPtr - DataPtr;
+template <typename EndianT>
+static Error readArray(ArrayRef<uint8_t> Data, uint64_t Count, uint64_t &Offset,
+ SmallVectorImpl<int32_t> &Vec) {
+ Expected<ArrayRef<EndianT>> RawArray =
+ getDataSliceAsArrayOf<EndianT>(Data, Offset, Count);
+ if (!RawArray)
+ return RawArray.takeError();
+ Offset += Count * sizeof(EndianT);
+ Vec.resize(Count);
+ llvm::copy(*RawArray, Vec.begin());
+ return Error::success();
+}
+
+template <typename T, endianness E>
+static Error readFRE(ArrayRef<uint8_t> Data, uint64_t &Offset,
+ typename SFrameParser<E>::FrameRowEntry &FRE) {
+ Expected<sframe::FrameRowEntry<T, E>> RawFRE =
+ getDataSliceAs<sframe::FrameRowEntry<T, E>>(Data, Offset);
+ if (!RawFRE)
+ return RawFRE.takeError();
+
+ Offset += sizeof(*RawFRE);
+ FRE.StartAddress = RawFRE->StartAddress;
+ FRE.Info.Info = RawFRE->Info.Info;
+
+ switch (FRE.Info.getOffsetSize()) {
+ case sframe::FREOffset::B1:
+ return readArray<sframe::detail::packed<int8_t, E>>(
+ Data, FRE.Info.getOffsetCount(), Offset, FRE.Offsets);
+ case sframe::FREOffset::B2:
+ return readArray<sframe::detail::packed<int16_t, E>>(
+ Data, FRE.Info.getOffsetCount(), Offset, FRE.Offsets);
+ case sframe::FREOffset::B4:
+ return readArray<sframe::detail::packed<int32_t, E>>(
+ Data, FRE.Info.getOffsetCount(), Offset, FRE.Offsets);
}
+ return createError(formatv("unsupported FRE offset size {0} at offset {1:x+}",
+ static_cast<unsigned>(FRE.Info.getOffsetSize()),
+ Offset));
+}
- return Result;
+template <endianness E> Error SFrameParser<E>::FallibleFREIterator::inc() {
+ if (++Idx == Size)
+ return Error::success();
+
+ switch (FREType) {
+ case sframe::FREType::Addr1:
+ return readFRE<uint8_t, E>(Data, Offset, FRE);
+ case sframe::FREType::Addr2:
+ return readFRE<uint16_t, E>(Data, Offset, FRE);
+ case sframe::FREType::Addr4:
+ return readFRE<uint32_t, E>(Data, Offset, FRE);
+ }
+ return createError(formatv("unsupported FRE type {0} at offset {1:x+}",
+ static_cast<unsigned>(FREType), Offset));
+}
+
+template <endianness E>
+iterator_range<typename SFrameParser<E>::fre_iterator>
+SFrameParser<E>::fres(const sframe::FuncDescEntry<E> &FDE, Error &Err) const {
+ uint64_t Offset = getFREBase() + FDE.StartFREOff;
+ fre_iterator BeforeBegin = make_fallible_itr(
+ FallibleFREIterator(Data, FDE.Info.getFREType(), -1, FDE.NumFREs, Offset),
+ Err);
+ fre_iterator End = make_fallible_end(
+ FallibleFREIterator(Data, FDE.Info.getFREType(), FDE.NumFREs, FDE.NumFREs,
+ /*Offset=*/0));
+ return {++BeforeBegin, End};
+}
+
+static std::optional<int32_t> getOffset(ArrayRef<int32_t> Offsets, size_t Idx) {
+ if (Offsets.size() > Idx)
+ return Offsets[Idx];
+ return std::nullopt;
+}
+
+// The interpretation of offsets is ABI-specific. The implementation of this and
+// the following functions may need to be adjusted when adding support for a new
+// ABI.
+template <endianness E>
+std::optional<int32_t>
+SFrameParser<E>::getCFAOffset(const FrameRowEntry &FRE) const {
+ return getOffset(FRE.Offsets, 0);
+}
+
+template <endianness E>
+std::optional<int32_t>
+SFrameParser<E>::getRAOffset(const FrameRowEntry &FRE) const {
+ if (usesFixedRAOffset())
+ return Header.CFAFixedRAOffset;
+ return getOffset(FRE.Offsets, 1);
+}
+
+template <endianness E>
+std::optional<int32_t>
+SFrameParser<E>::getFPOffset(const FrameRowEntry &FRE) const {
+ if (usesFixedFPOffset())
+ return Header.CFAFixedFPOffset;
+ return getOffset(FRE.Offsets, usesFixedRAOffset() ? 1 : 2);
+}
+
+template <endianness E>
+ArrayRef<int32_t>
+SFrameParser<E>::getExtraOffsets(const FrameRowEntry &FRE) const {
+ size_t UsedOffsets = 1; // CFA
+ if (!usesFixedRAOffset())
+ ++UsedOffsets;
+ if (!usesFixedFPOffset())
+ ++UsedOffsets;
+ if (FRE.Offsets.size() > UsedOffsets)
+ return ArrayRef<int32_t>(FRE.Offsets).drop_front(UsedOffsets);
+ return {};
}
template class LLVM_EXPORT_TEMPLATE llvm::object::SFrameParser<endianness::big>;
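The CFA/RA/FP accessors above index into a per-row offsets array whose layout depends on the header's fixed-RA/FP settings. A standalone sketch of the same indexing over a plain vector (names and types are illustrative, not the parser's API):

#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

// Offsets[0] is always the CFA offset; the RA and FP offsets follow unless the
// header supplies fixed values for them, in which case they are skipped.
struct FRELayoutSketch {
  // Header-level configuration (hypothetical values).
  bool UsesFixedRA = false, UsesFixedFP = false;
  int8_t CFAFixedRAOffset = 0, CFAFixedFPOffset = 0;
  // Per-row offsets, as decoded from one frame row entry.
  std::vector<int32_t> Offsets;

  std::optional<int32_t> at(size_t Idx) const {
    return Idx < Offsets.size() ? std::optional<int32_t>(Offsets[Idx])
                                : std::nullopt;
  }
  std::optional<int32_t> cfa() const { return at(0); }
  std::optional<int32_t> ra() const {
    return UsesFixedRA ? std::optional<int32_t>(CFAFixedRAOffset) : at(1);
  }
  std::optional<int32_t> fp() const {
    return UsesFixedFP ? std::optional<int32_t>(CFAFixedFPOffset)
                       : at(UsesFixedRA ? 1 : 2);
  }
};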
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index f810368..b7edeea 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -124,7 +124,6 @@
#include "llvm/CodeGen/MachineCopyPropagation.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineLICM.h"
#include "llvm/CodeGen/MachineLateInstrsCleanup.h"
#include "llvm/CodeGen/MachinePassManager.h"
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index cdf4412..fc2577e 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -519,7 +519,7 @@ Error InstrProfSymtab::create(SectionRef &Section) {
return Error::success();
}
-StringRef InstrProfSymtab::getFuncName(uint64_t Pointer, size_t Size) {
+StringRef InstrProfSymtab::getFuncName(uint64_t Pointer, size_t Size) const {
if (Pointer < Address)
return StringRef();
auto Offset = Pointer - Address;
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 5425729..7885e12 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -684,13 +684,13 @@ Error InstrProfSymtab::addFuncWithName(Function &F, StringRef PGOFuncName,
return Error::success();
}
-uint64_t InstrProfSymtab::getVTableHashFromAddress(uint64_t Address) {
+uint64_t InstrProfSymtab::getVTableHashFromAddress(uint64_t Address) const {
// Given a runtime address, look up the hash value in the interval map, and
// fallback to value 0 if a hash value is not found.
return VTableAddrMap.lookup(Address, 0);
}
-uint64_t InstrProfSymtab::getFunctionHashFromAddress(uint64_t Address) {
+uint64_t InstrProfSymtab::getFunctionHashFromAddress(uint64_t Address) const {
finalizeSymtab();
auto It = partition_point(AddrToMD5Map, [=](std::pair<uint64_t, uint64_t> A) {
return A.first < Address;
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 7ca26aa..a347351 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -13,7 +13,6 @@
#include "llvm/ProfileData/InstrProfWriter.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/ProfileSummary.h"
#include "llvm/ProfileData/DataAccessProf.h"
@@ -331,61 +330,34 @@ void InstrProfWriter::addDataAccessProfData(
DataAccessProfileData = std::move(DataAccessProfDataIn);
}
-void InstrProfWriter::addTemporalProfileTrace(TemporalProfTraceTy Trace) {
- assert(Trace.FunctionNameRefs.size() <= MaxTemporalProfTraceLength);
- assert(!Trace.FunctionNameRefs.empty());
- if (TemporalProfTraceStreamSize < TemporalProfTraceReservoirSize) {
- // Simply append the trace if we have not yet hit our reservoir size limit.
- TemporalProfTraces.push_back(std::move(Trace));
- } else {
- // Otherwise, replace a random trace in the stream.
- std::uniform_int_distribution<uint64_t> Distribution(
- 0, TemporalProfTraceStreamSize);
- uint64_t RandomIndex = Distribution(RNG);
- if (RandomIndex < TemporalProfTraces.size())
- TemporalProfTraces[RandomIndex] = std::move(Trace);
- }
- ++TemporalProfTraceStreamSize;
-}
-
void InstrProfWriter::addTemporalProfileTraces(
SmallVectorImpl<TemporalProfTraceTy> &SrcTraces, uint64_t SrcStreamSize) {
+ if (TemporalProfTraces.size() > TemporalProfTraceReservoirSize)
+ TemporalProfTraces.truncate(TemporalProfTraceReservoirSize);
for (auto &Trace : SrcTraces)
if (Trace.FunctionNameRefs.size() > MaxTemporalProfTraceLength)
Trace.FunctionNameRefs.resize(MaxTemporalProfTraceLength);
llvm::erase_if(SrcTraces, [](auto &T) { return T.FunctionNameRefs.empty(); });
- // Assume that the source has the same reservoir size as the destination to
- // avoid needing to record it in the indexed profile format.
- bool IsDestSampled =
- (TemporalProfTraceStreamSize > TemporalProfTraceReservoirSize);
- bool IsSrcSampled = (SrcStreamSize > TemporalProfTraceReservoirSize);
- if (!IsDestSampled && IsSrcSampled) {
- // If one of the traces are sampled, ensure that it belongs to Dest.
- std::swap(TemporalProfTraces, SrcTraces);
- std::swap(TemporalProfTraceStreamSize, SrcStreamSize);
- std::swap(IsDestSampled, IsSrcSampled);
- }
- if (!IsSrcSampled) {
- // If the source stream is not sampled, we add each source trace normally.
- for (auto &Trace : SrcTraces)
- addTemporalProfileTrace(std::move(Trace));
+ // If there are no source traces, it is probably because
+ // --temporal-profile-max-trace-length=0 was set to deliberately remove all
+ // traces. In that case, we do not want to increase the stream size.
+ if (SrcTraces.empty())
return;
- }
- // Otherwise, we find the traces that would have been removed if we added
- // the whole source stream.
- SmallSetVector<uint64_t, 8> IndicesToReplace;
- for (uint64_t I = 0; I < SrcStreamSize; I++) {
- std::uniform_int_distribution<uint64_t> Distribution(
- 0, TemporalProfTraceStreamSize);
+ // Add traces until our reservoir is full or we run out of source traces.
+ auto SrcTraceIt = SrcTraces.begin();
+ while (TemporalProfTraces.size() < TemporalProfTraceReservoirSize &&
+ SrcTraceIt < SrcTraces.end())
+ TemporalProfTraces.push_back(*SrcTraceIt++);
+ // Once the reservoir is full, sample the remaining source traces into it.
+ llvm::shuffle(SrcTraceIt, SrcTraces.end(), RNG);
+ for (uint64_t I = TemporalProfTraces.size();
+ I < SrcStreamSize && SrcTraceIt < SrcTraces.end(); I++) {
+ std::uniform_int_distribution<uint64_t> Distribution(0, I);
uint64_t RandomIndex = Distribution(RNG);
if (RandomIndex < TemporalProfTraces.size())
- IndicesToReplace.insert(RandomIndex);
- ++TemporalProfTraceStreamSize;
+ TemporalProfTraces[RandomIndex] = *SrcTraceIt++;
}
- // Then we insert a random sample of the source traces.
- llvm::shuffle(SrcTraces.begin(), SrcTraces.end(), RNG);
- for (const auto &[Index, Trace] : llvm::zip(IndicesToReplace, SrcTraces))
- TemporalProfTraces[Index] = std::move(Trace);
+ TemporalProfTraceStreamSize += SrcStreamSize;
}
void InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW,
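The loop above is a streaming reservoir merge: free slots are filled first, then each remaining element of the (virtual) source stream replaces a random slot with probability |reservoir| / (I + 1). A self-contained sketch of the same scheme over plain ints, assuming the source has already been cleaned of empty traces:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

// Capacity plays the role of TemporalProfTraceReservoirSize.
static void mergeIntoReservoir(std::vector<int> &Reservoir, size_t Capacity,
                               std::vector<int> Src, uint64_t SrcStreamSize,
                               std::mt19937_64 &RNG) {
  if (Reservoir.size() > Capacity)
    Reservoir.resize(Capacity);
  if (Src.empty())
    return;
  // Fill free slots first.
  auto It = Src.begin();
  while (Reservoir.size() < Capacity && It != Src.end())
    Reservoir.push_back(*It++);
  // Then sample: stream element I replaces a random slot with probability
  // |Reservoir| / (I + 1), mirroring the loop in the patch.
  std::shuffle(It, Src.end(), RNG);
  for (uint64_t I = Reservoir.size(); I < SrcStreamSize && It != Src.end();
       ++I) {
    std::uniform_int_distribution<uint64_t> Dist(0, I);
    uint64_t J = Dist(RNG);
    if (J < Reservoir.size())
      Reservoir[J] = *It++;
  }
}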
diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp
index fe34037..70ac68a 100644
--- a/llvm/lib/SandboxIR/Context.cpp
+++ b/llvm/lib/SandboxIR/Context.cpp
@@ -256,6 +256,7 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
case llvm::Instruction::FPToUI:
case llvm::Instruction::FPToSI:
case llvm::Instruction::FPExt:
+ case llvm::Instruction::PtrToAddr:
case llvm::Instruction::PtrToInt:
case llvm::Instruction::IntToPtr:
case llvm::Instruction::SIToFP:
diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp
index 956047c..1a81d18 100644
--- a/llvm/lib/SandboxIR/Instruction.cpp
+++ b/llvm/lib/SandboxIR/Instruction.cpp
@@ -1007,6 +1007,9 @@ static llvm::Instruction::CastOps getLLVMCastOp(Instruction::Opcode Opc) {
return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPToSI);
case Instruction::Opcode::FPExt:
return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPExt);
+ case Instruction::Opcode::PtrToAddr:
+ return static_cast<llvm::Instruction::CastOps>(
+ llvm::Instruction::PtrToAddr);
case Instruction::Opcode::PtrToInt:
return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::PtrToInt);
case Instruction::Opcode::IntToPtr:
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 3d688a1..d2a417f 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -5519,13 +5519,129 @@ APFloat::opStatus DoubleAPFloat::next(bool nextDown) {
return opOK;
}
+APFloat::opStatus DoubleAPFloat::convertToSignExtendedInteger(
+ MutableArrayRef<integerPart> Input, unsigned int Width, bool IsSigned,
+ roundingMode RM, bool *IsExact) const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+
+ // If Hi is not a finite nonzero value, or Lo is zero, the value is entirely
+ // represented by Hi. Delegate to the simpler single-APFloat conversion.
+ if (!getFirst().isFiniteNonZero() || getSecond().isZero())
+ return getFirst().convertToInteger(Input, Width, IsSigned, RM, IsExact);
+
+ // First, round the full double-double value to an integral value. This
+ // simplifies the rest of the function, as we no longer need to consider
+ // fractional parts.
+ *IsExact = false;
+ DoubleAPFloat Integral = *this;
+ const opStatus RoundStatus = Integral.roundToIntegral(RM);
+ if (RoundStatus == opInvalidOp)
+ return opInvalidOp;
+ const APFloat &IntegralHi = Integral.getFirst();
+ const APFloat &IntegralLo = Integral.getSecond();
+
+ // If rounding results in either component being zero, the sum is trivial.
+ // Delegate to the simpler single-APFloat conversion.
+ bool HiIsExact;
+ if (IntegralHi.isZero() || IntegralLo.isZero()) {
+ const opStatus HiStatus =
+ IntegralHi.convertToInteger(Input, Width, IsSigned, RM, &HiIsExact);
+ // The conversion from an integer-valued float to an APInt may fail if the
+ // result would be out of range. Regardless, taking this path is only
+ // possible if rounding occurred during the initial `roundToIntegral`.
+ return HiStatus == opOK ? opInexact : HiStatus;
+ }
+
+ // A negative number cannot be represented by an unsigned integer.
+ // Since a double-double is canonical, if Hi is negative, the sum is negative.
+ if (!IsSigned && IntegralHi.isNegative())
+ return opInvalidOp;
+
+ // Handle the special boundary case where |Hi| is exactly the power of two
+ // that marks the edge of the integer's range (e.g., 2^63 for int64_t). In
+ // this situation, Hi itself won't fit, but the sum Hi + Lo might.
+ // `PositiveOverflowWidth` is the bit number for this boundary (N-1 for
+ // signed, N for unsigned).
+ bool LoIsExact;
+ const int HiExactLog2 = IntegralHi.getExactLog2Abs();
+ const unsigned PositiveOverflowWidth = IsSigned ? Width - 1 : Width;
+ if (HiExactLog2 >= 0 &&
+ static_cast<unsigned>(HiExactLog2) == PositiveOverflowWidth) {
+ // If Hi and Lo have the same sign, |Hi + Lo| > |Hi|, so the sum is
+ // guaranteed to overflow. E.g., for uint128_t, (2^128, 1) overflows.
+ if (IntegralHi.isNegative() == IntegralLo.isNegative())
+ return opInvalidOp;
+
+ // If the signs differ, the sum will fit. We can compute the result using
+ // properties of two's complement arithmetic without a wide intermediate
+ // integer. E.g., for uint128_t, (2^128, -1) should be 2^128 - 1.
+ const opStatus LoStatus = IntegralLo.convertToInteger(
+ Input, Width, /*IsSigned=*/true, RM, &LoIsExact);
+ if (LoStatus == opInvalidOp)
+ return opInvalidOp;
+
+ // Adjust the bit pattern of Lo to account for Hi's value:
+ // - For unsigned (Hi=2^Width): `2^Width + Lo` in `Width`-bit
+ // arithmetic is equivalent to just `Lo`. The conversion of `Lo` above
+ // already produced the correct final bit pattern.
+ // - For signed (Hi=2^(Width-1)): The sum `2^(Width-1) + Lo` (where Lo<0)
+ // can be computed by taking the two's complement pattern for `Lo` and
+ // clearing the sign bit.
+ if (IsSigned && !IntegralHi.isNegative())
+ APInt::tcClearBit(Input.data(), PositiveOverflowWidth);
+ *IsExact = RoundStatus == opOK;
+ return RoundStatus;
+ }
+
+ // Convert Hi into an integer. This may not fit but that is OK: we know that
+ // Hi + Lo would not fit either in this situation.
+ const opStatus HiStatus = IntegralHi.convertToInteger(
+ Input, Width, IsSigned, rmTowardZero, &HiIsExact);
+ if (HiStatus == opInvalidOp)
+ return HiStatus;
+
+ // Convert Lo into a temporary integer of the same width.
+ APSInt LoResult{Width, /*isUnsigned=*/!IsSigned};
+ const opStatus LoStatus =
+ IntegralLo.convertToInteger(LoResult, rmTowardZero, &LoIsExact);
+ if (LoStatus == opInvalidOp)
+ return LoStatus;
+
+ // Add Lo to Hi. This addition is guaranteed not to overflow because of the
+ // double-double canonicalization rule (`|Lo| <= ulp(Hi)/2`). The only case
+ // where the sum could cross the integer type's boundary is when Hi is a
+ // power of two, which is handled by the special case block above.
+ APInt::tcAdd(Input.data(), LoResult.getRawData(), /*carry=*/0, Input.size());
+
+ *IsExact = RoundStatus == opOK;
+ return RoundStatus;
+}
+
APFloat::opStatus
DoubleAPFloat::convertToInteger(MutableArrayRef<integerPart> Input,
unsigned int Width, bool IsSigned,
roundingMode RM, bool *IsExact) const {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- return APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt())
- .convertToInteger(Input, Width, IsSigned, RM, IsExact);
+ opStatus FS =
+ convertToSignExtendedInteger(Input, Width, IsSigned, RM, IsExact);
+
+ if (FS == opInvalidOp) {
+ const unsigned DstPartsCount = partCountForBits(Width);
+ assert(DstPartsCount <= Input.size() && "Integer too big");
+
+ unsigned Bits;
+ if (getCategory() == fcNaN)
+ Bits = 0;
+ else if (isNegative())
+ Bits = IsSigned;
+ else
+ Bits = Width - IsSigned;
+
+ tcSetLeastSignificantBits(Input.data(), DstPartsCount, Bits);
+ if (isNegative() && IsSigned)
+ APInt::tcShiftLeft(Input.data(), DstPartsCount, Width - 1);
+ }
+
+ return FS;
}
APFloat::opStatus DoubleAPFloat::convertFromAPInt(const APInt &Input,
@@ -5626,14 +5742,30 @@ bool DoubleAPFloat::getExactInverse(APFloat *inv) const {
return Ret;
}
-int DoubleAPFloat::getExactLog2() const {
- // TODO: Implement me
- return INT_MIN;
-}
-
int DoubleAPFloat::getExactLog2Abs() const {
- // TODO: Implement me
- return INT_MIN;
+ // In order for Hi + Lo to be a power of two, the following must be true:
+ // 1. Hi must be a power of two.
+ // 2. Lo must be zero.
+ if (getSecond().isNonZero())
+ return INT_MIN;
+ return getFirst().getExactLog2Abs();
+}
+
+int ilogb(const DoubleAPFloat &Arg) {
+ const APFloat &Hi = Arg.getFirst();
+ const APFloat &Lo = Arg.getSecond();
+ int IlogbResult = ilogb(Hi);
+ // Zero and non-finite values can delegate to ilogb(Hi).
+ if (Arg.getCategory() != fcNormal)
+ return IlogbResult;
+ // If Lo can't change the binade, we can delegate to ilogb(Hi).
+ if (Lo.isZero() || Hi.isNegative() == Lo.isNegative())
+ return IlogbResult;
+ if (Hi.getExactLog2Abs() == INT_MIN)
+ return IlogbResult;
+ // Numbers of the form 2^a - 2^b or -2^a + 2^b are almost powers of two but
+ // get nudged out of the binade by the low component.
+ return IlogbResult - 1;
}
DoubleAPFloat scalbn(const DoubleAPFloat &Arg, int Exp,
@@ -5646,10 +5778,101 @@ DoubleAPFloat scalbn(const DoubleAPFloat &Arg, int Exp,
DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp,
APFloat::roundingMode RM) {
assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- APFloat First = frexp(Arg.Floats[0], Exp, RM);
- APFloat Second = Arg.Floats[1];
- if (Arg.getCategory() == APFloat::fcNormal)
- Second = scalbn(Second, -Exp, RM);
+
+ // Get the unbiased exponent e of the number, where |Arg| = m * 2^e for m in
+ // [1.0, 2.0).
+ Exp = ilogb(Arg);
+
+ // For NaNs, quiet any signaling NaN and return the result, as per standard
+ // practice.
+ if (Exp == APFloat::IEK_NaN) {
+ DoubleAPFloat Quiet{Arg};
+ Quiet.getFirst().makeQuiet();
+ return Quiet;
+ }
+
+ // For infinity, return it unchanged. The exponent remains IEK_Inf.
+ if (Exp == APFloat::IEK_Inf)
+ return Arg;
+
+ // For zero, the fraction is zero and the standard requires the exponent be 0.
+ if (Exp == APFloat::IEK_Zero) {
+ Exp = 0;
+ return Arg;
+ }
+
+ const APFloat &Hi = Arg.getFirst();
+ const APFloat &Lo = Arg.getSecond();
+
+ // frexp requires the fraction's absolute value to be in [0.5, 1.0).
+ // ilogb provides an exponent for an absolute value in [1.0, 2.0).
+ // Increment the exponent to ensure the fraction is in the correct range.
+ ++Exp;
+
+ const bool SignsDisagree = Hi.isNegative() != Lo.isNegative();
+ APFloat Second = Lo;
+ if (Arg.getCategory() == APFloat::fcNormal && Lo.isFiniteNonZero()) {
+ roundingMode LoRoundingMode;
+ // The interpretation of rmTowardZero depends on the sign of the combined
+ // Arg rather than the sign of the component.
+ if (RM == rmTowardZero)
+ LoRoundingMode = Arg.isNegative() ? rmTowardPositive : rmTowardNegative;
+ // For rmNearestTiesToAway, we face a similar problem. If signs disagree,
+ // Lo is a correction *toward* zero relative to Hi. Rounding Lo
+ // "away from zero" based on its own sign would move the value in the
+ // wrong direction. As a safe proxy, we use rmNearestTiesToEven, which is
+ // direction-agnostic. We only need to bother with this if Lo is scaled
+ // down.
+ else if (RM == rmNearestTiesToAway && SignsDisagree && Exp > 0)
+ LoRoundingMode = rmNearestTiesToEven;
+ else
+ LoRoundingMode = RM;
+ Second = scalbn(Lo, -Exp, LoRoundingMode);
+ // The rmNearestTiesToEven proxy is correct most of the time, but it
+ // differs from rmNearestTiesToAway when the scaled value of Lo is an
+ // exact midpoint.
+ // NOTE: This is morally equivalent to roundTiesTowardZero.
+ if (RM == rmNearestTiesToAway && LoRoundingMode == rmNearestTiesToEven) {
+ // Re-scale the result back to check if rounding occurred.
+ const APFloat RecomposedLo = scalbn(Second, Exp, rmNearestTiesToEven);
+ if (RecomposedLo != Lo) {
+ // RoundingError tells us which direction we rounded:
+ // - RoundingError > 0: we rounded up.
+ // - RoundingError < 0: we rounded down.
+ const APFloat RoundingError = RecomposedLo - Lo;
+ // Determine if scalbn(Lo, -Exp) landed exactly on a midpoint.
+ // We do this by checking if the absolute rounding error is exactly
+ // half a ULP of the result.
+ const APFloat UlpOfSecond = harrisonUlp(Second);
+ const APFloat ScaledUlpOfSecond =
+ scalbn(UlpOfSecond, Exp - 1, rmNearestTiesToEven);
+ const bool IsMidpoint = abs(RoundingError) == ScaledUlpOfSecond;
+ const bool RoundedLoAway =
+ Second.isNegative() == RoundingError.isNegative();
+ // The signs of Hi and Lo disagree and we rounded Lo away: we must
+ // decrease the magnitude of Second to increase the magnitude of
+ // First + Second.
+ if (IsMidpoint && RoundedLoAway)
+ Second.next(/*nextDown=*/!Second.isNegative());
+ }
+ }
+ // Handle a tricky edge case where Arg is slightly less than a power of two
+ // (e.g., Arg = 2^k - epsilon). In this situation:
+ // 1. Hi is 2^k, and Lo is a small negative value -epsilon.
+ // 2. ilogb(Arg) correctly returns k-1.
+ // 3. Our initial Exp becomes (k-1) + 1 = k.
+ // 4. Scaling Hi (2^k) by 2^-k would yield a magnitude of 1.0 and
+ // scaling Lo by 2^-k would yield zero. This would make the result 1.0
+ // which is an invalid fraction, as the required interval is [0.5, 1.0).
+ // We detect this specific case by checking if Hi is a power of two and if
+ // the scaled Lo underflowed to zero. The fix: Increment Exp to k+1. This
+ // adjusts the scale factor, causing Hi to be scaled to 0.5, which is a
+ // valid fraction.
+ if (Second.isZero() && SignsDisagree && Hi.getExactLog2Abs() != INT_MIN)
+ ++Exp;
+ }
+
+ APFloat First = scalbn(Hi, -Exp, RM);
return DoubleAPFloat(semPPCDoubleDouble, std::move(First), std::move(Second));
}
@@ -5749,10 +5972,6 @@ void APFloat::Profile(FoldingSetNodeID &NID) const {
NID.Add(bitcastToAPInt());
}
-/* Same as convertToInteger(integerPart*, ...), except the result is returned in
- an APSInt, whose initial bit-width and signed-ness are used to determine the
- precision of the conversion.
- */
APFloat::opStatus APFloat::convertToInteger(APSInt &result,
roundingMode rounding_mode,
bool *isExact) const {
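The boundary case above (|Hi| equal to 2^Width for unsigned, or 2^(Width-1) for signed, with an opposite-signed Lo) relies on ordinary two's-complement identities. A small standalone check of that arithmetic, using Width = 64 and hypothetical component values:

#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  // Unsigned: Hi = 2^64, Lo = -1, so Hi + Lo = 2^64 - 1. Converting Lo with
  // signed semantics already produces that 64-bit pattern.
  int64_t Lo = -1;
  assert(static_cast<uint64_t>(Lo) == std::numeric_limits<uint64_t>::max());
  // Signed: Hi = 2^63, Lo = -2, so Hi + Lo = 2^63 - 2. Take Lo's pattern and
  // clear the sign bit.
  int64_t Lo2 = -2;
  uint64_t Pattern = static_cast<uint64_t>(Lo2) & ~(1ULL << 63);
  assert(Pattern == (1ULL << 63) - 2);
  return 0;
}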
diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index 954af7f..0c0e1d0 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -1377,7 +1377,7 @@ static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
// the true value, and a "borrow" to the left should be remembered.
int64_t borrow = 0;
for (unsigned i = 0; i < n; ++i) {
- uint64_t p = uint64_t(qp) * uint64_t(v[i]);
+ uint64_t p = qp * uint64_t(v[i]);
int64_t subres = int64_t(u[j+i]) - borrow - Lo_32(p);
u[j+i] = Lo_32(subres);
borrow = Hi_32(p) - Hi_32(subres);
@@ -3136,6 +3136,22 @@ APInt APIntOps::mulhu(const APInt &C1, const APInt &C2) {
return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
}
+APInt APIntOps::mulsExtended(const APInt &C1, const APInt &C2) {
+ assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths");
+ unsigned FullWidth = C1.getBitWidth() * 2;
+ APInt C1Ext = C1.sext(FullWidth);
+ APInt C2Ext = C2.sext(FullWidth);
+ return C1Ext * C2Ext;
+}
+
+APInt APIntOps::muluExtended(const APInt &C1, const APInt &C2) {
+ assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths");
+ unsigned FullWidth = C1.getBitWidth() * 2;
+ APInt C1Ext = C1.zext(FullWidth);
+ APInt C2Ext = C2.zext(FullWidth);
+ return C1Ext * C2Ext;
+}
+
APInt APIntOps::pow(const APInt &X, int64_t N) {
assert(N >= 0 && "negative exponents not supported.");
APInt Acc = APInt(X.getBitWidth(), 1);
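A minimal usage sketch of the new helpers (values chosen arbitrarily); mulhu, defined earlier in this file, is just the high half of the muluExtended product:

#include "llvm/ADT/APInt.h"
#include <cassert>

void extendedMulExample() {
  using namespace llvm;
  APInt A(8, 200), B(8, 100);
  // Full 16-bit product: 200 * 100 = 20000.
  APInt Full = APIntOps::muluExtended(A, B);
  assert(Full.getBitWidth() == 16 && Full.getZExtValue() == 20000);
  // mulhu is the high half of that product: 20000 >> 8 = 78.
  assert(APIntOps::mulhu(A, B).getZExtValue() == 78);
  // Signed variant: 0xC8 is -56 as an 8-bit value, and -56 * 100 = -5600.
  assert(APIntOps::mulsExtended(A, B).getSExtValue() == -5600);
}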
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 10b6101..b7578dd 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -182,6 +182,7 @@ add_llvm_component_library(LLVMSupport
DivisionByConstantInfo.cpp
DAGDeltaAlgorithm.cpp
DJB.cpp
+ DXILABI.cpp
DynamicAPInt.cpp
ELFAttributes.cpp
ELFAttrParserCompact.cpp
diff --git a/llvm/lib/Support/DXILABI.cpp b/llvm/lib/Support/DXILABI.cpp
new file mode 100644
index 0000000..082e320
--- /dev/null
+++ b/llvm/lib/Support/DXILABI.cpp
@@ -0,0 +1,33 @@
+//===-- DXILABI.cpp - ABI Sensitive Values for DXIL -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains definitions of various constants and enums that are
+// required to remain stable as per the DXIL format's requirements.
+//
+// Documentation for DXIL can be found in
+// https://github.com/Microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DXILABI.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+StringRef dxil::getResourceClassName(dxil::ResourceClass RC) {
+ switch (RC) {
+ case dxil::ResourceClass::SRV:
+ return "SRV";
+ case dxil::ResourceClass::UAV:
+ return "UAV";
+ case dxil::ResourceClass::CBuffer:
+ return "CBV";
+ case dxil::ResourceClass::Sampler:
+ return "Sampler";
+ }
+ llvm_unreachable("Invalid ResourceClass enum value");
+}
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index 94a04ab..bd08365 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -888,11 +888,19 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS,
Res.Zero |= (~BottomKnown).getLoBits(ResultBitsKnown);
Res.One = BottomKnown.getLoBits(ResultBitsKnown);
- // If we're self-multiplying then bit[1] is guaranteed to be zero.
- if (NoUndefSelfMultiply && BitWidth > 1) {
- assert(Res.One[1] == 0 &&
- "Self-multiplication failed Quadratic Reciprocity!");
- Res.Zero.setBit(1);
+ if (NoUndefSelfMultiply) {
+ // If X has at least TZ trailing zeroes, then bit (2 * TZ + 1) must be zero.
+ unsigned TwoTZP1 = 2 * TrailZero0 + 1;
+ if (TwoTZP1 < BitWidth)
+ Res.Zero.setBit(TwoTZP1);
+
+ // If X has exactly TZ trailing zeros, then bit (2 * TZ + 2) must also be
+ // zero.
+ if (TrailZero0 < BitWidth && LHS.One[TrailZero0]) {
+ unsigned TwoTZP2 = TwoTZP1 + 1;
+ if (TwoTZP2 < BitWidth)
+ Res.Zero.setBit(TwoTZP2);
+ }
}
return Res;
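The two new known-zero bits follow from writing X = 2^TZ * Y: any square is 0 or 1 mod 4, so bit 2*TZ+1 of X*X is always clear, and an odd square is 1 mod 8, so bit 2*TZ+2 is clear as well when bit TZ of X is known to be one. A brute-force check of the underlying congruences:

#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t Y = 0; Y < (1u << 20); ++Y) {
    uint64_t Sq = Y * Y;
    assert(Sq % 4 == 0 || Sq % 4 == 1); // bit 1 of any square is clear
    if (Y % 2 == 1)
      assert(Sq % 8 == 1); // odd squares: bit 2 is clear as well
  }
  return 0;
}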
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index 601f11f..1c4645a 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -501,8 +501,14 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
std::unique_ptr<MB> Result(
new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile<MB>(
RequiresNullTerminator, FD, MapSize, Offset, EC));
- if (!EC)
- return std::move(Result);
+ if (!EC) {
+ // On at least Linux, and possibly on other systems, mmap may return pages
+ // from the page cache that are not properly filled with trailing zeroes,
+ // if some prior user of the page wrote non-zero bytes. Detect this and
+ // don't use mmap in that case.
+ if (!RequiresNullTerminator || *Result->getBufferEnd() == '\0')
+ return std::move(Result);
+ }
}
#ifdef __MVS__
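The extra check only matters for callers that asked for a null-terminated buffer; with it in place, the invariant below holds whether the file ends up mmap'ed or copied onto the heap. A minimal sketch (the path is hypothetical; RequiresNullTerminator defaults to true for getFile):

#include "llvm/Support/MemoryBuffer.h"
#include <cassert>
#include <memory>

void nullTerminatedRead() {
  using namespace llvm;
  ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
      MemoryBuffer::getFile("/tmp/example.txt");
  if (!BufOrErr)
    return; // nothing to demonstrate if the file is missing
  // The byte one past the buffer end is guaranteed to be '\0'.
  assert(*(*BufOrErr)->getBufferEnd() == '\0');
}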
diff --git a/llvm/lib/Support/SmallPtrSet.cpp b/llvm/lib/Support/SmallPtrSet.cpp
index 83143a7..a602165 100644
--- a/llvm/lib/Support/SmallPtrSet.cpp
+++ b/llvm/lib/Support/SmallPtrSet.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemAlloc.h"
#include <algorithm>
@@ -28,7 +29,7 @@ void SmallPtrSetImplBase::shrink_and_clear() {
// Reduce the number of buckets.
unsigned Size = size();
CurArraySize = Size > 16 ? 1 << (Log2_32_Ceil(Size) + 1) : 32;
- NumNonEmpty = NumTombstones = 0;
+ NumEntries = NumTombstones = 0;
// Install the new array. Clear all the buckets to empty.
CurArray = (const void**)safe_malloc(sizeof(void*) * CurArraySize);
@@ -41,7 +42,8 @@ SmallPtrSetImplBase::insert_imp_big(const void *Ptr) {
if (LLVM_UNLIKELY(size() * 4 >= CurArraySize * 3)) {
// If more than 3/4 of the array is full, grow.
Grow(CurArraySize < 64 ? 128 : CurArraySize * 2);
- } else if (LLVM_UNLIKELY(CurArraySize - NumNonEmpty < CurArraySize / 8)) {
+ } else if (LLVM_UNLIKELY(CurArraySize - NumEntries - NumTombstones <
+ CurArraySize / 8)) {
// If fewer than 1/8 of the buckets are empty (meaning that many are filled
// with tombstones), rehash.
Grow(CurArraySize);
@@ -55,8 +57,7 @@ SmallPtrSetImplBase::insert_imp_big(const void *Ptr) {
// Otherwise, insert it!
if (*Bucket == getTombstoneMarker())
--NumTombstones;
- else
- ++NumNonEmpty; // Track density.
+ ++NumEntries;
*Bucket = Ptr;
incrementEpoch();
return std::make_pair(Bucket, true);
@@ -110,8 +111,7 @@ const void *const *SmallPtrSetImplBase::FindBucketFor(const void *Ptr) const {
/// Grow - Allocate a larger backing store for the buckets and move it over.
///
void SmallPtrSetImplBase::Grow(unsigned NewSize) {
- const void **OldBuckets = CurArray;
- const void **OldEnd = EndPointer();
+ auto OldBuckets = buckets();
bool WasSmall = isSmall();
// Install the new array. Clear all the buckets to empty.
@@ -123,16 +123,14 @@ void SmallPtrSetImplBase::Grow(unsigned NewSize) {
memset(CurArray, -1, NewSize*sizeof(void*));
// Copy over all valid entries.
- for (const void **BucketPtr = OldBuckets; BucketPtr != OldEnd; ++BucketPtr) {
+ for (const void *&Bucket : OldBuckets) {
// Copy over the element if it is valid.
- const void *Elt = *BucketPtr;
- if (Elt != getTombstoneMarker() && Elt != getEmptyMarker())
- *const_cast<void**>(FindBucketFor(Elt)) = const_cast<void*>(Elt);
+ if (Bucket != getTombstoneMarker() && Bucket != getEmptyMarker())
+ *const_cast<void **>(FindBucketFor(Bucket)) = const_cast<void *>(Bucket);
}
if (!WasSmall)
- free(OldBuckets);
- NumNonEmpty -= NumTombstones;
+ free(OldBuckets.begin());
NumTombstones = 0;
IsSmall = false;
}
@@ -193,9 +191,9 @@ void SmallPtrSetImplBase::copyHelper(const SmallPtrSetImplBase &RHS) {
CurArraySize = RHS.CurArraySize;
// Copy over the contents from the other set
- std::copy(RHS.CurArray, RHS.EndPointer(), CurArray);
+ llvm::copy(RHS.buckets(), CurArray);
- NumNonEmpty = RHS.NumNonEmpty;
+ NumEntries = RHS.NumEntries;
NumTombstones = RHS.NumTombstones;
}
@@ -217,7 +215,7 @@ void SmallPtrSetImplBase::moveHelper(const void **SmallStorage,
if (RHS.isSmall()) {
// Copy a small RHS rather than moving.
CurArray = SmallStorage;
- std::copy(RHS.CurArray, RHS.CurArray + RHS.NumNonEmpty, CurArray);
+ llvm::copy(RHS.small_buckets(), CurArray);
} else {
CurArray = RHS.CurArray;
RHS.CurArray = RHSSmallStorage;
@@ -225,13 +223,13 @@ void SmallPtrSetImplBase::moveHelper(const void **SmallStorage,
// Copy the rest of the trivial members.
CurArraySize = RHS.CurArraySize;
- NumNonEmpty = RHS.NumNonEmpty;
+ NumEntries = RHS.NumEntries;
NumTombstones = RHS.NumTombstones;
IsSmall = RHS.IsSmall;
// Make the RHS small and empty.
RHS.CurArraySize = SmallSize;
- RHS.NumNonEmpty = 0;
+ RHS.NumEntries = 0;
RHS.NumTombstones = 0;
RHS.IsSmall = true;
}
@@ -245,54 +243,42 @@ void SmallPtrSetImplBase::swap(const void **SmallStorage,
if (!this->isSmall() && !RHS.isSmall()) {
std::swap(this->CurArray, RHS.CurArray);
std::swap(this->CurArraySize, RHS.CurArraySize);
- std::swap(this->NumNonEmpty, RHS.NumNonEmpty);
+ std::swap(this->NumEntries, RHS.NumEntries);
std::swap(this->NumTombstones, RHS.NumTombstones);
return;
}
// FIXME: From here on we assume that both sets have the same small size.
- // If only RHS is small, copy the small elements into LHS and move the pointer
- // from LHS to RHS.
- if (!this->isSmall() && RHS.isSmall()) {
- std::copy(RHS.CurArray, RHS.CurArray + RHS.NumNonEmpty, SmallStorage);
- std::swap(RHS.CurArraySize, this->CurArraySize);
- std::swap(this->NumNonEmpty, RHS.NumNonEmpty);
+ // Both are small; just swap the small elements.
+ if (this->isSmall() && RHS.isSmall()) {
+ unsigned MinEntries = std::min(this->NumEntries, RHS.NumEntries);
+ std::swap_ranges(this->CurArray, this->CurArray + MinEntries, RHS.CurArray);
+ if (this->NumEntries > MinEntries) {
+ std::copy(this->CurArray + MinEntries, this->CurArray + this->NumEntries,
+ RHS.CurArray + MinEntries);
+ } else {
+ std::copy(RHS.CurArray + MinEntries, RHS.CurArray + RHS.NumEntries,
+ this->CurArray + MinEntries);
+ }
+ assert(this->CurArraySize == RHS.CurArraySize);
+ std::swap(this->NumEntries, RHS.NumEntries);
std::swap(this->NumTombstones, RHS.NumTombstones);
- RHS.CurArray = this->CurArray;
- RHS.IsSmall = false;
- this->CurArray = SmallStorage;
- this->IsSmall = true;
return;
}
- // If only LHS is small, copy the small elements into RHS and move the pointer
- // from RHS to LHS.
- if (this->isSmall() && !RHS.isSmall()) {
- std::copy(this->CurArray, this->CurArray + this->NumNonEmpty,
- RHSSmallStorage);
- std::swap(RHS.CurArraySize, this->CurArraySize);
- std::swap(RHS.NumNonEmpty, this->NumNonEmpty);
- std::swap(RHS.NumTombstones, this->NumTombstones);
- this->CurArray = RHS.CurArray;
- this->IsSmall = false;
- RHS.CurArray = RHSSmallStorage;
- RHS.IsSmall = true;
- return;
- }
-
- // Both a small, just swap the small elements.
- assert(this->isSmall() && RHS.isSmall());
- unsigned MinNonEmpty = std::min(this->NumNonEmpty, RHS.NumNonEmpty);
- std::swap_ranges(this->CurArray, this->CurArray + MinNonEmpty, RHS.CurArray);
- if (this->NumNonEmpty > MinNonEmpty) {
- std::copy(this->CurArray + MinNonEmpty, this->CurArray + this->NumNonEmpty,
- RHS.CurArray + MinNonEmpty);
- } else {
- std::copy(RHS.CurArray + MinNonEmpty, RHS.CurArray + RHS.NumNonEmpty,
- this->CurArray + MinNonEmpty);
- }
- assert(this->CurArraySize == RHS.CurArraySize);
- std::swap(this->NumNonEmpty, RHS.NumNonEmpty);
- std::swap(this->NumTombstones, RHS.NumTombstones);
+ // If only one side is small, copy its elements into the other side's inline
+ // storage and hand the heap-allocated array over to the formerly small side.
+ SmallPtrSetImplBase &SmallSide = this->isSmall() ? *this : RHS;
+ SmallPtrSetImplBase &LargeSide = this->isSmall() ? RHS : *this;
+ const void **LargeSideInlineStorage =
+ this->isSmall() ? RHSSmallStorage : SmallStorage;
+ llvm::copy(SmallSide.small_buckets(), LargeSideInlineStorage);
+ std::swap(LargeSide.CurArraySize, SmallSide.CurArraySize);
+ std::swap(LargeSide.NumEntries, SmallSide.NumEntries);
+ std::swap(LargeSide.NumTombstones, SmallSide.NumTombstones);
+ SmallSide.CurArray = LargeSide.CurArray;
+ SmallSide.IsSmall = false;
+ LargeSide.CurArray = LargeSideInlineStorage;
+ LargeSide.IsSmall = true;
}
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index cc02cae..31fb1e8 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -876,6 +876,12 @@ void mapped_file_region::unmapImpl() {
::munmap(Mapping, Size);
}
+std::error_code mapped_file_region::sync() const {
+ if (int Res = ::msync(Mapping, Size, MS_SYNC))
+ return std::error_code(Res, std::generic_category());
+ return std::error_code();
+}
+
void mapped_file_region::dontNeedImpl() {
assert(Mode == mapped_file_region::readonly);
if (!Mapping)
diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc
index fdf9d54..9001c19 100644
--- a/llvm/lib/Support/Windows/Path.inc
+++ b/llvm/lib/Support/Windows/Path.inc
@@ -1006,6 +1006,14 @@ void mapped_file_region::unmapImpl() {
void mapped_file_region::dontNeedImpl() {}
+std::error_code mapped_file_region::sync() const {
+ if (!::FlushViewOfFile(Mapping, Size))
+ return mapWindowsError(GetLastError());
+ if (!::FlushFileBuffers(FileHandle))
+ return mapWindowsError(GetLastError());
+ return std::error_code();
+}
+
int mapped_file_region::alignment() {
SYSTEM_INFO SysInfo;
::GetSystemInfo(&SysInfo);
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 3f318e2..67622a9 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -3064,11 +3064,11 @@ const Init *Record::getValueInit(StringRef FieldName) const {
}
StringRef Record::getValueAsString(StringRef FieldName) const {
- std::optional<StringRef> S = getValueAsOptionalString(FieldName);
- if (!S)
- PrintFatalError(getLoc(), "Record `" + getName() +
- "' does not have a field named `" + FieldName + "'!\n");
- return *S;
+ const Init *I = getValueInit(FieldName);
+ if (const auto *SI = dyn_cast<StringInit>(I))
+ return SI->getValue();
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
+ "' exists but does not have a string value");
}
std::optional<StringRef>
@@ -3088,24 +3088,16 @@ Record::getValueAsOptionalString(StringRef FieldName) const {
}
const BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const {
- const RecordVal *R = getValue(FieldName);
- if (!R || !R->getValue())
- PrintFatalError(getLoc(), "Record `" + getName() +
- "' does not have a field named `" + FieldName + "'!\n");
-
- if (const auto *BI = dyn_cast<BitsInit>(R->getValue()))
+ const Init *I = getValueInit(FieldName);
+ if (const auto *BI = dyn_cast<BitsInit>(I))
return BI;
PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
"' exists but does not have a bits value");
}
const ListInit *Record::getValueAsListInit(StringRef FieldName) const {
- const RecordVal *R = getValue(FieldName);
- if (!R || !R->getValue())
- PrintFatalError(getLoc(), "Record `" + getName() +
- "' does not have a field named `" + FieldName + "'!\n");
-
- if (const auto *LI = dyn_cast<ListInit>(R->getValue()))
+ const Init *I = getValueInit(FieldName);
+ if (const auto *LI = dyn_cast<ListInit>(I))
return LI;
PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
"' exists but does not have a list value");
@@ -3127,17 +3119,13 @@ Record::getValueAsListOfDefs(StringRef FieldName) const {
}
int64_t Record::getValueAsInt(StringRef FieldName) const {
- const RecordVal *R = getValue(FieldName);
- if (!R || !R->getValue())
- PrintFatalError(getLoc(), "Record `" + getName() +
- "' does not have a field named `" + FieldName + "'!\n");
-
- if (const auto *II = dyn_cast<IntInit>(R->getValue()))
+ const Init *I = getValueInit(FieldName);
+ if (const auto *II = dyn_cast<IntInit>(I))
return II->getValue();
- PrintFatalError(getLoc(), Twine("Record `") + getName() + "', field `" +
- FieldName +
- "' exists but does not have an int value: " +
- R->getValue()->getAsString());
+ PrintFatalError(
+ getLoc(),
+ Twine("Record `") + getName() + "', field `" + FieldName +
+ "' exists but does not have an int value: " + I->getAsString());
}
std::vector<int64_t>
@@ -3173,67 +3161,47 @@ Record::getValueAsListOfStrings(StringRef FieldName) const {
}
const Record *Record::getValueAsDef(StringRef FieldName) const {
- const RecordVal *R = getValue(FieldName);
- if (!R || !R->getValue())
- PrintFatalError(getLoc(), "Record `" + getName() +
- "' does not have a field named `" + FieldName + "'!\n");
-
- if (const auto *DI = dyn_cast<DefInit>(R->getValue()))
+ const Init *I = getValueInit(FieldName);
+ if (const auto *DI = dyn_cast<DefInit>(I))
return DI->getDef();
PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
FieldName + "' does not have a def initializer!");
}
const Record *Record::getValueAsOptionalDef(StringRef FieldName) const {
- const RecordVal *R = getValue(FieldName);
- if (!R || !R->getValue())
- PrintFatalError(getLoc(), "Record `" + getName() +
- "' does not have a field named `" + FieldName + "'!\n");
-
- if (const auto *DI = dyn_cast<DefInit>(R->getValue()))
+ const Init *I = getValueInit(FieldName);
+ if (const auto *DI = dyn_cast<DefInit>(I))
return DI->getDef();
- if (isa<UnsetInit>(R->getValue()))
+ if (isa<UnsetInit>(I))
return nullptr;
PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
FieldName + "' does not have either a def initializer or '?'!");
}
bool Record::getValueAsBit(StringRef FieldName) const {
- const RecordVal *R = getValue(FieldName);
- if (!R || !R->getValue())
- PrintFatalError(getLoc(), "Record `" + getName() +
- "' does not have a field named `" + FieldName + "'!\n");
-
- if (const auto *BI = dyn_cast<BitInit>(R->getValue()))
+ const Init *I = getValueInit(FieldName);
+ if (const auto *BI = dyn_cast<BitInit>(I))
return BI->getValue();
PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
FieldName + "' does not have a bit initializer!");
}
bool Record::getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const {
- const RecordVal *R = getValue(FieldName);
- if (!R || !R->getValue())
- PrintFatalError(getLoc(), "Record `" + getName() +
- "' does not have a field named `" + FieldName.str() + "'!\n");
-
- if (isa<UnsetInit>(R->getValue())) {
+ const Init *I = getValueInit(FieldName);
+ if (isa<UnsetInit>(I)) {
Unset = true;
return false;
}
Unset = false;
- if (const auto *BI = dyn_cast<BitInit>(R->getValue()))
+ if (const auto *BI = dyn_cast<BitInit>(I))
return BI->getValue();
PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
FieldName + "' does not have a bit initializer!");
}
const DagInit *Record::getValueAsDag(StringRef FieldName) const {
- const RecordVal *R = getValue(FieldName);
- if (!R || !R->getValue())
- PrintFatalError(getLoc(), "Record `" + getName() +
- "' does not have a field named `" + FieldName + "'!\n");
-
- if (const auto *DI = dyn_cast<DagInit>(R->getValue()))
+ const Init *I = getValueInit(FieldName);
+ if (const auto *DI = dyn_cast<DagInit>(I))
return DI;
PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
FieldName + "' does not have a dag initializer!");
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 5496ebd..8d0ff41 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -60,6 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
FunctionPass *createSMEABIPass();
FunctionPass *createSMEPeepholeOptPass();
+FunctionPass *createMachineSMEABIPass();
ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
@@ -111,6 +112,7 @@ void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
void initializeLDTLSCleanupPass(PassRegistry&);
void initializeSMEABIPass(PassRegistry &);
void initializeSMEPeepholeOptPass(PassRegistry &);
+void initializeMachineSMEABIPass(PassRegistry &);
void initializeSVEIntrinsicOptsPass(PassRegistry &);
void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &);
} // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index ad8368e..1169f26 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -316,6 +316,12 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType(
ThunkArgTranslation::PointerIndirection};
};
+ if (T->isHalfTy()) {
+ // Prefix with `llvm` since MSVC doesn't specify `_Float16`
+ Out << "__llvm_h__";
+ return direct(T);
+ }
+
if (T->isFloatTy()) {
Out << "f";
return direct(T);
@@ -327,8 +333,8 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType(
}
if (T->isFloatingPointTy()) {
- report_fatal_error(
- "Only 32 and 64 bit floating points are supported for ARM64EC thunks");
+ report_fatal_error("Only 16, 32, and 64 bit floating points are supported "
+ "for ARM64EC thunks");
}
auto &DL = M->getDataLayout();
@@ -342,8 +348,16 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType(
uint64_t ElementCnt = T->getArrayNumElements();
uint64_t ElementSizePerBytes = DL.getTypeSizeInBits(ElementTy) / 8;
uint64_t TotalSizeBytes = ElementCnt * ElementSizePerBytes;
- if (ElementTy->isFloatTy() || ElementTy->isDoubleTy()) {
- Out << (ElementTy->isFloatTy() ? "F" : "D") << TotalSizeBytes;
+ if (ElementTy->isHalfTy() || ElementTy->isFloatTy() ||
+ ElementTy->isDoubleTy()) {
+ if (ElementTy->isHalfTy())
+ // Prefix with `llvm` since MSVC doesn't specify `_Float16`
+ Out << "__llvm_H__";
+ else if (ElementTy->isFloatTy())
+ Out << "F";
+ else if (ElementTy->isDoubleTy())
+ Out << "D";
+ Out << TotalSizeBytes;
if (Alignment.value() >= 16 && !Ret)
Out << "a" << Alignment.value();
if (TotalSizeBytes <= 8) {
@@ -355,8 +369,9 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType(
return pointerIndirection(T);
}
} else if (T->isFloatingPointTy()) {
- report_fatal_error("Only 32 and 64 bit floating points are supported for "
- "ARM64EC thunks");
+ report_fatal_error(
+ "Only 16, 32, and 64 bit floating points are supported "
+ "for ARM64EC thunks");
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index 787a1a8..cc46159 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -75,8 +75,10 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
auto &It = PendingMembers[0];
CCAssignFn *AssignFn =
TLI->CCAssignFnForCall(State.getCallingConv(), /*IsVarArg=*/false);
+ // FIXME: Get the correct original type.
+ Type *OrigTy = EVT(It.getValVT()).getTypeForEVT(State.getContext());
if (AssignFn(It.getValNo(), It.getValVT(), It.getValVT(), CCValAssign::Full,
- ArgFlags, State))
+ ArgFlags, OrigTy, State))
llvm_unreachable("Call operand has unhandled type");
// Return the flags to how they were before.
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/llvm/lib/Target/AArch64/AArch64CallingConvention.h
index 63185a9..7105fa6 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.h
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.h
@@ -18,52 +18,63 @@
namespace llvm {
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags,
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
CCState &State);
} // namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 201bfe0..57dcd68 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -92,8 +92,9 @@ private:
bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
- MachineBasicBlock *expandRestoreZA(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI);
+ MachineBasicBlock *
+ expandCommitOrRestoreZASave(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
MachineBasicBlock *expandCondSMToggle(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
};
@@ -528,6 +529,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
UseRev = true;
}
break;
+ case AArch64::Destructive2xRegImmUnpred:
+ // EXT_ZZI_CONSTRUCTIVE Zd, Zs, Imm
+ // ==> MOVPRFX Zd, Zs; EXT_ZZI Zd, Zd, Zs, Imm
+ std::tie(DOPIdx, SrcIdx, Src2Idx) = std::make_tuple(1, 1, 2);
+ break;
default:
llvm_unreachable("Unsupported Destructive Operand type");
}
@@ -548,6 +554,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
break;
case AArch64::DestructiveUnaryPassthru:
case AArch64::DestructiveBinaryImm:
+ case AArch64::Destructive2xRegImmUnpred:
DOPRegIsUnique = true;
break;
case AArch64::DestructiveTernaryCommWithRev:
@@ -674,6 +681,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
.add(MI.getOperand(SrcIdx))
.add(MI.getOperand(Src2Idx));
break;
+ case AArch64::Destructive2xRegImmUnpred:
+ DOP.addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState)
+ .add(MI.getOperand(SrcIdx))
+ .add(MI.getOperand(Src2Idx));
+ break;
}
if (PRFX) {
@@ -979,10 +991,15 @@ bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext(
return true;
}
-MachineBasicBlock *
-AArch64ExpandPseudo::expandRestoreZA(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) {
+static constexpr unsigned ZERO_ALL_ZA_MASK = 0b11111111;
+
+MachineBasicBlock *AArch64ExpandPseudo::expandCommitOrRestoreZASave(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
MachineInstr &MI = *MBBI;
+ bool IsRestoreZA = MI.getOpcode() == AArch64::RestoreZAPseudo;
+ assert((MI.getOpcode() == AArch64::RestoreZAPseudo ||
+ MI.getOpcode() == AArch64::CommitZASavePseudo) &&
+ "Expected ZA commit or restore");
assert((std::next(MBBI) != MBB.end() ||
MI.getParent()->successors().begin() !=
MI.getParent()->successors().end()) &&
@@ -990,21 +1007,23 @@ AArch64ExpandPseudo::expandRestoreZA(MachineBasicBlock &MBB,
// Compare TPIDR2_EL0 value against 0.
DebugLoc DL = MI.getDebugLoc();
- MachineInstrBuilder Cbz = BuildMI(MBB, MBBI, DL, TII->get(AArch64::CBZX))
- .add(MI.getOperand(0));
+ MachineInstrBuilder Branch =
+ BuildMI(MBB, MBBI, DL,
+ TII->get(IsRestoreZA ? AArch64::CBZX : AArch64::CBNZX))
+ .add(MI.getOperand(0));
// Split MBB and create two new blocks:
// - MBB now contains all instructions before RestoreZAPseudo.
- // - SMBB contains the RestoreZAPseudo instruction only.
- // - EndBB contains all instructions after RestoreZAPseudo.
+ // - SMBB contains only the CommitZASave/RestoreZA pseudo instruction.
+ // - EndBB contains all instructions after that pseudo.
MachineInstr &PrevMI = *std::prev(MBBI);
MachineBasicBlock *SMBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true);
MachineBasicBlock *EndBB = std::next(MI.getIterator()) == SMBB->end()
? *SMBB->successors().begin()
: SMBB->splitAt(MI, /*UpdateLiveIns*/ true);
- // Add the SMBB label to the TB[N]Z instruction & create a branch to EndBB.
- Cbz.addMBB(SMBB);
+ // Add the SMBB label to the CB[N]Z instruction & create a branch to EndBB.
+ Branch.addMBB(SMBB);
BuildMI(&MBB, DL, TII->get(AArch64::B))
.addMBB(EndBB);
MBB.addSuccessor(EndBB);
@@ -1012,11 +1031,30 @@ AArch64ExpandPseudo::expandRestoreZA(MachineBasicBlock &MBB,
// Replace the pseudo with a call (BL).
MachineInstrBuilder MIB =
BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::BL));
- MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit);
+ // Copy operands (mainly the regmask) from the pseudo.
for (unsigned I = 2; I < MI.getNumOperands(); ++I)
MIB.add(MI.getOperand(I));
- BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
+ if (IsRestoreZA) {
+ // Mark the TPIDR2 block pointer (X0) as an implicit use.
+ MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit);
+ } else /*CommitZA*/ {
+ [[maybe_unused]] auto *TRI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ // Clear TPIDR2_EL0.
+ BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::MSR))
+ .addImm(AArch64SysReg::TPIDR2_EL0)
+ .addReg(AArch64::XZR);
+ bool ZeroZA = MI.getOperand(1).getImm() != 0;
+ if (ZeroZA) {
+ assert(MI.definesRegister(AArch64::ZAB0, TRI) && "should define ZA!");
+ BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::ZERO_M))
+ .addImm(ZERO_ALL_ZA_MASK)
+ .addDef(AArch64::ZAB0, RegState::ImplicitDefine);
+ }
+ }
+
+ BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
MI.eraseFromParent();
return EndBB;
}
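For reference (editor's sketch, not text from the patch): after expansion the pseudo becomes a conditional branch on the TPIDR2_EL0 value in operand 0 (CBZ for the restore case, CBNZ for the commit case) whose taken target is the new SMBB containing the BL to the SME ABI routine carried in the pseudo's operands; for the commit case SMBB additionally writes XZR to TPIDR2_EL0 and, when requested, zeroes all of ZA. The not-taken path in MBB branches straight to EndBB, and SMBB also ends with a branch to EndBB, where the original instruction stream resumes.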
@@ -1236,14 +1274,20 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
.add(MI.getOperand(3));
transferImpOps(MI, I, I);
} else {
+ unsigned RegState =
+ getRenamableRegState(MI.getOperand(1).isRenamable()) |
+ getKillRegState(
+ MI.getOperand(1).isKill() &&
+ MI.getOperand(1).getReg() != MI.getOperand(2).getReg() &&
+ MI.getOperand(1).getReg() != MI.getOperand(3).getReg());
BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8
: AArch64::ORRv16i8))
.addReg(DstReg,
RegState::Define |
getRenamableRegState(MI.getOperand(0).isRenamable()))
- .add(MI.getOperand(1))
- .add(MI.getOperand(1));
+ .addReg(MI.getOperand(1).getReg(), RegState)
+ .addReg(MI.getOperand(1).getReg(), RegState);
auto I2 =
BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
@@ -1629,8 +1673,9 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return expandCALL_BTI(MBB, MBBI);
case AArch64::StoreSwiftAsyncContext:
return expandStoreSwiftAsyncContext(MBB, MBBI);
+ case AArch64::CommitZASavePseudo:
case AArch64::RestoreZAPseudo: {
- auto *NewMBB = expandRestoreZA(MBB, MBBI);
+ auto *NewMBB = expandCommitOrRestoreZASave(MBB, MBBI);
if (NewMBB != &MBB)
NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated.
return true;
@@ -1641,6 +1686,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated.
return true;
}
+ case AArch64::InOutZAUsePseudo:
+ case AArch64::RequiresZASavePseudo:
case AArch64::COALESCER_BARRIER_FPR16:
case AArch64::COALESCER_BARRIER_FPR32:
case AArch64::COALESCER_BARRIER_FPR64:
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 9d74bb5..cf34498 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -267,7 +267,7 @@ private:
private:
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const;
bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs,
- unsigned &NumBytes);
+ SmallVectorImpl<Type *> &OrigTys, unsigned &NumBytes);
bool finishCall(CallLoweringInfo &CLI, unsigned NumBytes);
public:
@@ -3011,11 +3011,13 @@ bool AArch64FastISel::fastLowerArguments() {
bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
SmallVectorImpl<MVT> &OutVTs,
+ SmallVectorImpl<Type *> &OrigTys,
unsigned &NumBytes) {
CallingConv::ID CC = CLI.CallConv;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context);
- CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC));
+ CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, OrigTys,
+ CCAssignFnForCall(CC));
// Get a count of how many bytes are to be pushed on the stack.
NumBytes = CCInfo.getStackSize();
@@ -3194,6 +3196,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Set up the argument vectors.
SmallVector<MVT, 16> OutVTs;
+ SmallVector<Type *, 16> OrigTys;
OutVTs.reserve(CLI.OutVals.size());
for (auto *Val : CLI.OutVals) {
@@ -3207,6 +3210,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
return false;
OutVTs.push_back(VT);
+ OrigTys.push_back(Val->getType());
}
Address Addr;
@@ -3222,7 +3226,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Handle the arguments now that we've gotten them.
unsigned NumBytes;
- if (!processCallArgs(CLI, OutVTs, NumBytes))
+ if (!processCallArgs(CLI, OutVTs, OrigTys, NumBytes))
return false;
const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
@@ -3574,12 +3578,8 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
Args.reserve(II->arg_size());
// Populate the argument list.
- for (auto &Arg : II->args()) {
- ArgListEntry Entry;
- Entry.Val = Arg;
- Entry.Ty = Arg->getType();
- Args.push_back(Entry);
- }
+ for (auto &Arg : II->args())
+ Args.emplace_back(Arg);
CallLoweringInfo CLI;
MCContext &Ctx = MF->getContext();
@@ -4870,12 +4870,8 @@ bool AArch64FastISel::selectFRem(const Instruction *I) {
Args.reserve(I->getNumOperands());
// Populate the argument list.
- for (auto &Arg : I->operands()) {
- ArgListEntry Entry;
- Entry.Val = Arg;
- Entry.Ty = Arg->getType();
- Args.push_back(Entry);
- }
+ for (auto &Arg : I->operands())
+ Args.emplace_back(Arg);
CallLoweringInfo CLI;
MCContext &Ctx = MF->getContext();
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 885f2a9..fddde66 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1475,24 +1475,26 @@ static bool requiresSaveVG(const MachineFunction &MF) {
return true;
}
-bool isVGInstruction(MachineBasicBlock::iterator MBBI) {
+static bool matchLibcall(const TargetLowering &TLI, const MachineOperand &MO,
+ RTLIB::Libcall LC) {
+ return MO.isSymbol() &&
+ StringRef(TLI.getLibcallName(LC)) == MO.getSymbolName();
+}
+
+bool isVGInstruction(MachineBasicBlock::iterator MBBI,
+ const TargetLowering &TLI) {
unsigned Opc = MBBI->getOpcode();
if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
Opc == AArch64::UBFMXri)
return true;
- if (requiresGetVGCall(*MBBI->getMF())) {
- if (Opc == AArch64::ORRXrr)
- return true;
+ if (!requiresGetVGCall(*MBBI->getMF()))
+ return false;
- if (Opc == AArch64::BL) {
- auto Op1 = MBBI->getOperand(0);
- return Op1.isSymbol() &&
- (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg");
- }
- }
+ if (Opc == AArch64::BL)
+ return matchLibcall(TLI, MBBI->getOperand(0), RTLIB::SMEABI_GET_CURRENT_VG);
- return false;
+ return Opc == AArch64::ORRXrr;
}
// Convert callee-save register save/restore instruction to do stack pointer
@@ -1511,9 +1513,11 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// functions, we need to do this for both the streaming and non-streaming
// vector length. Move past these instructions if necessary.
MachineFunction &MF = *MBB.getParent();
- if (requiresSaveVG(MF))
- while (isVGInstruction(MBBI))
+ if (requiresSaveVG(MF)) {
+ auto &TLI = *MF.getSubtarget().getTargetLowering();
+ while (isVGInstruction(MBBI, TLI))
++MBBI;
+ }
switch (MBBI->getOpcode()) {
default:
@@ -2097,11 +2101,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Move past the saves of the callee-saved registers, fixing up the offsets
// and pre-inc if we decided to combine the callee-save and local stack
// pointer bump above.
+ auto &TLI = *MF.getSubtarget().getTargetLowering();
while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
!IsSVECalleeSave(MBBI)) {
if (CombineSPBump &&
// Only fix-up frame-setup load/store instructions.
- (!requiresSaveVG(MF) || !isVGInstruction(MBBI)))
+ (!requiresSaveVG(MF) || !isVGInstruction(MBBI, TLI)))
fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
NeedsWinCFI, &HasWinCFI);
++MBBI;
@@ -3468,6 +3473,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
+ auto &TLI = *MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool NeedsWinCFI = needsWinCFI(MF);
@@ -3581,11 +3587,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
.addReg(AArch64::X0, RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
- const uint32_t *RegMask = TRI->getCallPreservedMask(
- MF,
- CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
+ RTLIB::Libcall LC = RTLIB::SMEABI_GET_CURRENT_VG;
+ const uint32_t *RegMask =
+ TRI->getCallPreservedMask(MF, TLI.getLibcallCallingConv(LC));
BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
- .addExternalSymbol("__arm_get_current_vg")
+ .addExternalSymbol(TLI.getLibcallName(LC))
.addRegMask(RegMask)
.addReg(AArch64::X0, RegState::ImplicitDefine)
.setMIFlag(MachineInstr::FrameSetup);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index ad42f4b..bc786f4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -7617,16 +7617,29 @@ bool AArch64DAGToDAGISel::SelectAnyPredicate(SDValue N) {
bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned MaxSize,
SDValue &Base, SDValue &Offset,
unsigned Scale) {
- // Try to untangle an ADD node into a 'reg + offset'
- if (CurDAG->isBaseWithConstantOffset(N))
- if (auto C = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ auto MatchConstantOffset = [&](SDValue CN) -> SDValue {
+ if (auto *C = dyn_cast<ConstantSDNode>(CN)) {
int64_t ImmOff = C->getSExtValue();
- if ((ImmOff > 0 && ImmOff <= MaxSize && (ImmOff % Scale == 0))) {
- Base = N.getOperand(0);
- Offset = CurDAG->getTargetConstant(ImmOff / Scale, SDLoc(N), MVT::i64);
- return true;
- }
+ if ((ImmOff > 0 && ImmOff <= MaxSize && (ImmOff % Scale == 0)))
+ return CurDAG->getTargetConstant(ImmOff / Scale, SDLoc(N), MVT::i64);
}
+ return SDValue();
+ };
+
+ if (SDValue C = MatchConstantOffset(N)) {
+ Base = CurDAG->getConstant(0, SDLoc(N), MVT::i32);
+ Offset = C;
+ return true;
+ }
+
+ // Try to untangle an ADD node into a 'reg + offset'
+ if (CurDAG->isBaseWithConstantOffset(N)) {
+ if (SDValue C = MatchConstantOffset(N.getOperand(1))) {
+ Base = N.getOperand(0);
+ Offset = C;
+ return true;
+ }
+ }
// By default, just match reg + 0.
Base = N;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3c06c6a..e896370 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17,6 +17,7 @@
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "Utils/AArch64SMEAttributes.h"
@@ -1998,6 +1999,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(Op, MVT::f16, Promote);
}
+const AArch64TargetMachine &AArch64TargetLowering::getTM() const {
+ return static_cast<const AArch64TargetMachine &>(getTargetMachine());
+}
+
void AArch64TargetLowering::addTypeForNEON(MVT VT) {
assert(VT.isVector() && "VT should be a vector type");
@@ -3083,13 +3088,12 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
if (FuncInfo->isSMESaveBufferUsed()) {
+ RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
- .addExternalSymbol("__arm_sme_state_size")
+ .addExternalSymbol(getLibcallName(LC))
.addReg(AArch64::X0, RegState::ImplicitDefine)
- .addRegMask(TRI->getCallPreservedMask(
- *MF, CallingConv::
- AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+ .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
MI.getOperand(0).getReg())
.addReg(AArch64::X0);
@@ -3101,6 +3105,30 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
return BB;
}
+MachineBasicBlock *
+AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ Register ResultReg = MI.getOperand(0).getReg();
+ if (FuncInfo->isPStateSMRegUsed()) {
+ RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
+ .addExternalSymbol(getLibcallName(LC))
+ .addReg(AArch64::X0, RegState::ImplicitDefine)
+ .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg)
+ .addReg(AArch64::X0);
+ } else {
+ assert(MI.getMF()->getRegInfo().use_empty(ResultReg) &&
+ "Expected no users of the entry pstate.sm!");
+ }
+ MI.eraseFromParent();
+ return BB;
+}
+
// Helper function to find the instruction that defined a virtual register.
// If unable to find such instruction, returns nullptr.
static const MachineInstr *stripVRegCopies(const MachineRegisterInfo &MRI,
@@ -3216,6 +3244,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
return EmitAllocateSMESaveBuffer(MI, BB);
case AArch64::GetSMESaveSize:
return EmitGetSMESaveSize(MI, BB);
+ case AArch64::EntryPStateSM:
+ return EmitEntryPStateSM(MI, BB);
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
case TargetOpcode::STATEPOINT:
@@ -3320,7 +3350,8 @@ static bool isZerosVector(const SDNode *N) {
/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC
-static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
+static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC,
+ SDValue RHS = {}) {
switch (CC) {
default:
llvm_unreachable("Unknown condition code!");
@@ -3331,9 +3362,9 @@ static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
case ISD::SETGT:
return AArch64CC::GT;
case ISD::SETGE:
- return AArch64CC::GE;
+ return (RHS && isNullConstant(RHS)) ? AArch64CC::PL : AArch64CC::GE;
case ISD::SETLT:
- return AArch64CC::LT;
+ return (RHS && isNullConstant(RHS)) ? AArch64CC::MI : AArch64CC::LT;
case ISD::SETLE:
return AArch64CC::LE;
case ISD::SETUGT:
@@ -3492,6 +3523,13 @@ bool isLegalCmpImmed(APInt C) {
return isLegalArithImmed(C.abs().getZExtValue());
}
+unsigned numberOfInstrToLoadImm(APInt C) {
+ uint64_t Imm = C.getZExtValue();
+ SmallVector<AArch64_IMM::ImmInsnModel> Insn;
+ AArch64_IMM::expandMOVImm(Imm, 32, Insn);
+ return Insn.size();
+}
+
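As a concrete illustration of the new heuristic (editor-chosen values): x uge 0x12340001 has no legal compare-immediate form and, per expandMOVImm, the constant needs a MOVZ plus a MOVK; rewriting it as the equivalent x ugt 0x12340000 leaves the constant still illegal as a compare immediate, but it now materializes with a single MOVZ (#0x1234, LSL #16), so comparing numberOfInstrToLoadImm for C and C-1 makes the rewrite profitable where the old isLegalCmpImmed-only check would have given up.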
static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG) {
// 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
if (Op->getFlags().hasNoSignedWrap())
@@ -3782,7 +3820,7 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
SDLoc DL(Val);
// Determine OutCC and handle FP special case.
if (isInteger) {
- OutCC = changeIntCCToAArch64CC(CC);
+ OutCC = changeIntCCToAArch64CC(CC, RHS);
} else {
assert(LHS.getValueType().isFloatingPoint());
AArch64CC::CondCode ExtraCC;
@@ -3961,6 +3999,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// CC has already been adjusted.
RHS = DAG.getConstant(0, DL, VT);
} else if (!isLegalCmpImmed(C)) {
+ unsigned NumImmForC = numberOfInstrToLoadImm(C);
// Constant does not fit, try adjusting it by one?
switch (CC) {
default:
@@ -3969,43 +4008,49 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
case ISD::SETGE:
if (!C.isMinSignedValue()) {
APInt CMinusOne = C - 1;
- if (isLegalCmpImmed(CMinusOne)) {
+ if (isLegalCmpImmed(CMinusOne) ||
+ (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
RHS = DAG.getConstant(CMinusOne, DL, VT);
}
}
break;
case ISD::SETULT:
- case ISD::SETUGE:
- if (!C.isZero()) {
- APInt CMinusOne = C - 1;
- if (isLegalCmpImmed(CMinusOne)) {
- CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
- RHS = DAG.getConstant(CMinusOne, DL, VT);
- }
+ case ISD::SETUGE: {
+ // C cannot be 0 here: 0 is always a legal compare immediate, and we only
+ // reach this point when C is not legal.
+ assert(!C.isZero() && "C should not be zero here");
+ APInt CMinusOne = C - 1;
+ if (isLegalCmpImmed(CMinusOne) ||
+ (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
+ CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+ RHS = DAG.getConstant(CMinusOne, DL, VT);
}
break;
+ }
case ISD::SETLE:
case ISD::SETGT:
if (!C.isMaxSignedValue()) {
APInt CPlusOne = C + 1;
- if (isLegalCmpImmed(CPlusOne)) {
+ if (isLegalCmpImmed(CPlusOne) ||
+ (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
RHS = DAG.getConstant(CPlusOne, DL, VT);
}
}
break;
case ISD::SETULE:
- case ISD::SETUGT:
+ case ISD::SETUGT: {
if (!C.isAllOnes()) {
APInt CPlusOne = C + 1;
- if (isLegalCmpImmed(CPlusOne)) {
+ if (isLegalCmpImmed(CPlusOne) ||
+ (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
RHS = DAG.getConstant(CPlusOne, DL, VT);
}
}
break;
}
+ }
}
}
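The +/-1 rewrites above are only sound away from the extremes of the value range, which is what the isMinSignedValue/isMaxSignedValue/isZero/isAllOnes guards enforce. A standalone sanity check of the underlying identities, written as an editor's sketch rather than code from the patch:

#include <cassert>
#include <cstdint>

int main() {
  // Signed identities; the guards in the patch keep C-1 / C+1 from wrapping
  // past INT64_MIN / INT64_MAX.
  for (int64_t x = -130; x <= 130; ++x) {
    for (int64_t c = -129; c <= 129; ++c) {
      assert((x < c) == (x <= c - 1));  // SETLT  -> SETLE  with C-1
      assert((x >= c) == (x > c - 1));  // SETGE  -> SETGT  with C-1
      assert((x <= c) == (x < c + 1));  // SETLE  -> SETLT  with C+1
      assert((x > c) == (x >= c + 1));  // SETGT  -> SETGE  with C+1
    }
  }
  // Unsigned identities; c != 0 mirrors the assert in the SETULT/SETUGE case,
  // and staying well below UINT64_MAX mirrors the isAllOnes guard.
  for (uint64_t x = 0; x <= 300; ++x) {
    for (uint64_t c = 1; c <= 300; ++c) {
      assert((x < c) == (x <= c - 1));  // SETULT -> SETULE with C-1
      assert((x >= c) == (x > c - 1));  // SETUGE -> SETUGT with C-1
      assert((x <= c) == (x < c + 1));  // SETULE -> SETULT with C+1
      assert((x > c) == (x >= c + 1));  // SETUGT -> SETUGE with C+1
    }
  }
  return 0;
}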
@@ -4079,7 +4124,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
if (!Cmp) {
Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
- AArch64CC = changeIntCCToAArch64CC(CC);
+ AArch64CC = changeIntCCToAArch64CC(CC, RHS);
}
AArch64cc = getCondCode(DAG, AArch64CC);
return Cmp;
@@ -5174,13 +5219,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
ArgListTy Args;
- ArgListEntry Entry;
-
- Entry.Node = Arg;
- Entry.Ty = ArgTy;
- Entry.IsSExt = false;
- Entry.IsZExt = false;
- Args.push_back(Entry);
+ Args.emplace_back(Arg, ArgTy);
RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
: RTLIB::SINCOS_STRET_F32;
@@ -5711,15 +5750,15 @@ static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
SDValue Chain, SDLoc DL,
EVT VT) const {
- SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
+ RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
+ SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
Type *RetTy = StructType::get(Int64Ty, Int64Ty);
TargetLowering::CallLoweringInfo CLI(DAG);
ArgListTy Args;
CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
- CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
- RetTy, Callee, std::move(Args));
+ getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
@@ -7886,8 +7925,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
}
- bool Res =
- AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
+ bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
+ Ins[i].OrigTy, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
@@ -8132,19 +8171,26 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
}
assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
+ if (Attrs.hasStreamingCompatibleInterface()) {
+ SDValue EntryPStateSM =
+ DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
+ DAG.getVTList(MVT::i64, MVT::Other), {Chain});
+
+ // Copy the value to a virtual register, and save that in FuncInfo.
+ Register EntryPStateSMReg =
+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
+ EntryPStateSM);
+ FuncInfo->setPStateSMReg(EntryPStateSMReg);
+ }
+
// Insert the SMSTART if this is a locally streaming function and
// make sure it is Glued to the last CopyFromReg value.
if (IsLocallyStreaming) {
- SDValue PStateSM;
- if (Attrs.hasStreamingCompatibleInterface()) {
- PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
- Register Reg = MF.getRegInfo().createVirtualRegister(
- getRegClassFor(PStateSM.getValueType().getSimpleVT()));
- FuncInfo->setPStateSMReg(Reg);
- Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
+ if (Attrs.hasStreamingCompatibleInterface())
Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
- AArch64SME::IfCallerIsNonStreaming, PStateSM);
- } else
+ AArch64SME::IfCallerIsNonStreaming);
+ else
Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
AArch64SME::Always);
@@ -8244,53 +8290,54 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
- // Create a 16 Byte TPIDR2 object. The dynamic buffer
- // will be expanded and stored in the static object later using a pseudonode.
- if (Attrs.hasZAState()) {
- TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
- TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
- SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
- DAG.getConstant(1, DL, MVT::i32));
-
- SDValue Buffer;
- if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
- Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
- DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
- } else {
- SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
- Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
- DAG.getVTList(MVT::i64, MVT::Other),
- {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
- MFI.CreateVariableSizedObject(Align(16), nullptr);
- }
- Chain = DAG.getNode(
- AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
- {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
- } else if (Attrs.hasAgnosticZAInterface()) {
- // Call __arm_sme_state_size().
- SDValue BufferSize =
- DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
- DAG.getVTList(MVT::i64, MVT::Other), Chain);
- Chain = BufferSize.getValue(1);
-
- SDValue Buffer;
- if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
- Buffer =
- DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
- DAG.getVTList(MVT::i64, MVT::Other), {Chain, BufferSize});
- } else {
- // Allocate space dynamically.
- Buffer = DAG.getNode(
- ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
- {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
- MFI.CreateVariableSizedObject(Align(16), nullptr);
+ if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
+ // Old SME ABI lowering (deprecated):
+ // Create a 16 Byte TPIDR2 object. The dynamic buffer
+ // will be expanded and stored in the static object later using a
+ // pseudonode.
+ if (Attrs.hasZAState()) {
+ TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
+ TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
+ SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+ DAG.getConstant(1, DL, MVT::i32));
+ SDValue Buffer;
+ if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
+ Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
+ DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
+ } else {
+ SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
+ Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
+ DAG.getVTList(MVT::i64, MVT::Other),
+ {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
+ }
+ Chain = DAG.getNode(
+ AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
+ {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
+ } else if (Attrs.hasAgnosticZAInterface()) {
+ // Call __arm_sme_state_size().
+ SDValue BufferSize =
+ DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
+ DAG.getVTList(MVT::i64, MVT::Other), Chain);
+ Chain = BufferSize.getValue(1);
+ SDValue Buffer;
+ if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
+ Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
+ DAG.getVTList(MVT::i64, MVT::Other),
+ {Chain, BufferSize});
+ } else {
+ // Allocate space dynamically.
+ Buffer = DAG.getNode(
+ ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
+ {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
+ }
+ // Copy the value to a virtual register, and save that in FuncInfo.
+ Register BufferPtr =
+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ FuncInfo->setSMESaveBufferAddr(BufferPtr);
+ Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
}
-
- // Copy the value to a virtual register, and save that in FuncInfo.
- Register BufferPtr =
- MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
- FuncInfo->setSMESaveBufferAddr(BufferPtr);
- Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
}
if (CallConv == CallingConv::PreserveNone) {
@@ -8307,6 +8354,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
}
}
+ if (getTM().useNewSMEABILowering()) {
+ // Clear new ZT0 state. TODO: Move this to the SME ABI pass.
+ if (Attrs.isNewZT0())
+ Chain = DAG.getNode(
+ ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
+ DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32));
+ }
+
return Chain;
}
@@ -8557,19 +8613,20 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,
// FIXME: CCAssignFnForCall should be called once, for the call and not per
// argument. This logic should exactly mirror LowerFormalArguments.
CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
- bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+ bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
+ Outs[i].OrigTy, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
}
static SMECallAttrs
-getSMECallAttrs(const Function &Caller,
+getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI,
const TargetLowering::CallLoweringInfo &CLI) {
if (CLI.CB)
- return SMECallAttrs(*CLI.CB);
+ return SMECallAttrs(*CLI.CB, &TLI);
if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
- return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol()));
+ return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI));
return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
}
@@ -8591,7 +8648,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// SME Streaming functions are not eligible for TCO as they may require
// the streaming mode or ZA to be restored after returning from the call.
- SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, CLI);
+ SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI);
if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
CallAttrs.requiresPreservingAllZAState() ||
CallAttrs.caller().hasStreamingBody())
@@ -8834,8 +8891,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
bool Enable, SDValue Chain,
SDValue InGlue,
- unsigned Condition,
- SDValue PStateSM) const {
+ unsigned Condition) const {
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
FuncInfo->setHasStreamingModeChanges(true);
@@ -8847,9 +8903,16 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
SmallVector<SDValue> Ops = {Chain, MSROp};
unsigned Opcode;
if (Condition != AArch64SME::Always) {
+ FuncInfo->setPStateSMRegUsed(true);
+ Register PStateReg = FuncInfo->getPStateSMReg();
+ assert(PStateReg.isValid() && "PStateSM Register is invalid");
+ SDValue PStateSM =
+ DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
+ // Use chain and glue from the CopyFromReg.
+ Ops[0] = PStateSM.getValue(1);
+ InGlue = PStateSM.getValue(2);
SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
- assert(PStateSM && "PStateSM should be defined");
Ops.push_back(ConditionOp);
Ops.push_back(PStateSM);
} else {
@@ -8871,22 +8934,19 @@ static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
FuncInfo->setSMESaveBufferUsed();
-
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = PointerType::getUnqual(*DAG.getContext());
- Entry.Node =
- DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64);
- Args.push_back(Entry);
-
- SDValue Callee =
- DAG.getExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore",
- TLI.getPointerTy(DAG.getDataLayout()));
+ Args.emplace_back(
+ DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
+ PointerType::getUnqual(*DAG.getContext()));
+
+ RTLIB::Libcall LC =
+ IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
auto *RetTy = Type::getVoidTy(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
- CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1, RetTy,
- Callee, std::move(Args));
+ TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
return TLI.LowerCallTo(CLI).second;
}
@@ -9014,14 +9074,28 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
CallConv = CallingConv::AArch64_SVE_VectorCall;
}
+ // Determine whether we need any streaming mode changes.
+ SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
+ bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
+ bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface();
+ auto ZAMarkerNode = [&]() -> std::optional<unsigned> {
+ // TODO: Handle agnostic ZA functions.
+ if (!UseNewSMEABILowering || IsAgnosticZAFunction)
+ return std::nullopt;
+ if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State())
+ return std::nullopt;
+ return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE
+ : AArch64ISD::INOUT_ZA_USE;
+ }();
+
if (IsTailCall) {
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(CLI);
// A sibling call is one where we're under the usual C ABI and not planning
// to change that but can still do a tail call:
- if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
- CallConv != CallingConv::SwiftTail)
+ if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
+ CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
IsSibCall = true;
if (IsTailCall)
@@ -9073,9 +9147,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
}
- // Determine whether we need any streaming mode changes.
- SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), CLI);
-
auto DescribeCallsite =
[&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
@@ -9089,7 +9160,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
return R;
};
- bool RequiresLazySave = CallAttrs.requiresLazySave();
+ bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
if (RequiresLazySave) {
const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
@@ -9124,15 +9195,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
/*IsSave=*/true);
}
- SDValue PStateSM;
bool RequiresSMChange = CallAttrs.requiresSMChange();
if (RequiresSMChange) {
- if (CallAttrs.caller().hasStreamingInterfaceOrBody())
- PStateSM = DAG.getConstant(1, DL, MVT::i64);
- else if (CallAttrs.caller().hasNonStreamingInterface())
- PStateSM = DAG.getConstant(0, DL, MVT::i64);
- else
- PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
OptimizationRemarkEmitter ORE(&MF.getFunction());
ORE.emit([&]() {
auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
@@ -9171,10 +9235,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
- // Adjust the stack pointer for the new arguments...
+ // Adjust the stack pointer for the new arguments... and mark ZA uses.
// These operations are automatically eliminated by the prolog/epilog pass
- if (!IsSibCall)
+ assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
+ if (!IsSibCall) {
Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
+ if (ZAMarkerNode) {
+ // Note: The ZAMarkerNode must be glued to the CALLSEQ_START; simply using a
+ // chain can result in incorrect scheduling. The marker refers to the
+ // position just before the CALLSEQ_START (though it is emitted after it, as
+ // CALLSEQ_START has no in-glue).
+ Chain = DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other),
+ {Chain, Chain.getValue(1)});
+ }
+ }
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
getPointerTy(DAG.getDataLayout()));
@@ -9447,9 +9521,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
InGlue = Chain.getValue(1);
}
- SDValue NewChain = changeStreamingMode(
- DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
- getSMToggleCondition(CallAttrs), PStateSM);
+ SDValue NewChain =
+ changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(),
+ Chain, InGlue, getSMToggleCondition(CallAttrs));
Chain = NewChain.getValue(0);
InGlue = NewChain.getValue(1);
}
@@ -9633,10 +9707,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
InGlue = Result.getValue(Result->getNumValues() - 1);
if (RequiresSMChange) {
- assert(PStateSM && "Expected a PStateSM to be set");
Result = changeStreamingMode(
DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
- getSMToggleCondition(CallAttrs), PStateSM);
+ getSMToggleCondition(CallAttrs));
if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
InGlue = Result.getValue(1);
@@ -9646,7 +9719,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
}
- if (CallAttrs.requiresEnablingZAAfterCall())
+ if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())
// Unconditionally resume ZA.
Result = DAG.getNode(
AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
@@ -9659,15 +9732,15 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (RequiresLazySave) {
// Conditionally restore the lazy save using a pseudo node.
+ RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
SDValue RegMask = DAG.getRegisterMask(
- TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
+ TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC)));
SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
- "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
+ getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
SDValue TPIDR2_EL0 = DAG.getNode(
ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
-
// Copy the address of the TPIDR2 block into X0 before 'calling' the
// RESTORE_ZA pseudo.
SDValue Glue;
@@ -9679,7 +9752,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
{Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
RestoreRoutine, RegMask, Result.getValue(1)});
-
// Finally reset the TPIDR2_EL0 register to 0.
Result = DAG.getNode(
ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
@@ -9802,14 +9874,11 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Emit SMSTOP before returning from a locally streaming function
SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
- if (FuncAttrs.hasStreamingCompatibleInterface()) {
- Register Reg = FuncInfo->getPStateSMReg();
- assert(Reg.isValid() && "PStateSM Register is invalid");
- SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
+ if (FuncAttrs.hasStreamingCompatibleInterface())
Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
/*Glue*/ SDValue(),
- AArch64SME::IfCallerIsNonStreaming, PStateSM);
- } else
+ AArch64SME::IfCallerIsNonStreaming);
+ else
Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
/*Glue*/ SDValue(), AArch64SME::Always);
Glue = Chain.getValue(1);
@@ -17359,7 +17428,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices, unsigned Factor) const {
+ ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
@@ -17369,7 +17438,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
auto *LI = dyn_cast<LoadInst>(Load);
if (!LI)
return false;
- assert(!Mask && "Unexpected mask on a load");
+ assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
const DataLayout &DL = LI->getDataLayout();
@@ -28194,6 +28263,7 @@ void AArch64TargetLowering::ReplaceNodeResults(
case Intrinsic::aarch64_sme_in_streaming_mode: {
SDLoc DL(N);
SDValue Chain = DAG.getEntryNode();
+
SDValue RuntimePStateSM =
getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
Results.push_back(
@@ -29004,7 +29074,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
// Checks to allow the use of SME instructions
if (auto *Base = dyn_cast<CallBase>(&Inst)) {
- auto CallAttrs = SMECallAttrs(*Base);
+ auto CallAttrs = SMECallAttrs(*Base, this);
if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
CallAttrs.requiresPreservingZT0() ||
CallAttrs.requiresPreservingAllZAState())
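To ground the streaming-mode plumbing above, a small source-level example (editor's sketch; the function names are invented and an SME-enabled clang is assumed). A streaming-compatible caller does not know PSTATE.SM statically, so the lowering shown here obtains it once at entry (EntryPStateSM) and every conditional SMSTART/SMSTOP reads that value instead of re-querying __arm_sme_state at each call site:

// Compile with an SME-enabled clang, e.g. -target aarch64-linux-gnu -march=armv9-a+sme
void streaming_callee(void) __arm_streaming;

void compat_caller(void) __arm_streaming_compatible {
  // PSTATE.SM here is whatever the caller of compat_caller was using, so the
  // backend wraps this call in SMSTART/SMSTOP SM guarded by the entry
  // PSTATE.SM value (AArch64SME::IfCallerIsNonStreaming).
  streaming_callee();
}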
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 8887657..071e96e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -23,6 +23,8 @@
namespace llvm {
+class AArch64TargetMachine;
+
namespace AArch64 {
/// Possible values of current rounding mode, which is specified in bits
/// 23:22 of FPCR.
@@ -64,6 +66,8 @@ public:
explicit AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI);
+ const AArch64TargetMachine &getTM() const;
+
/// Control the following reassociation of operands: (op (op x, c1), y) -> (op
/// (op x, y), c1) where N0 is (op x, c1) and N1 is y.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0,
@@ -173,6 +177,10 @@ public:
MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB,
unsigned Opcode, bool Op0IsDef) const;
MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
+
+ // Note: The following functions are only used as part of the old SME ABI
+ // lowering. They will be removed once -aarch64-new-sme-abi=true is the
+ // default.
MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitAllocateZABuffer(MachineInstr &MI,
@@ -181,6 +189,8 @@ public:
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitGetSMESaveSize(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitEntryPStateSM(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
/// Replace (0, vreg) discriminator components with the operands of blend
/// or with (immediate, NoRegister) when possible.
@@ -220,8 +230,8 @@ public:
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices,
- unsigned Factor) const override;
+ ArrayRef<unsigned> Indices, unsigned Factor,
+ const APInt &GapMask) const override;
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
unsigned Factor) const override;
@@ -523,8 +533,8 @@ public:
/// node. \p Condition should be one of the enum values from
/// AArch64SME::ToggleCondition.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable,
- SDValue Chain, SDValue InGlue, unsigned Condition,
- SDValue PStateSM = SDValue()) const;
+ SDValue Chain, SDValue InGlue,
+ unsigned Condition) const;
bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index d068a12..178dab6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -36,7 +36,12 @@ def DestructiveBinary : DestructiveInstTypeEnum<5>;
def DestructiveBinaryComm : DestructiveInstTypeEnum<6>;
def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>;
def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>;
-def DestructiveUnaryPassthru : DestructiveInstTypeEnum<9>;
+
+// 3 inputs unpredicated (reg1, reg2, imm).
+// Can be MOVPRFX'd iff reg1 == reg2.
+def Destructive2xRegImmUnpred : DestructiveInstTypeEnum<9>;
+
+def DestructiveUnaryPassthru : DestructiveInstTypeEnum<10>;
class FalseLanesEnum<bits<2> val> {
bits<2> Value = val;
@@ -7362,7 +7367,9 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
[(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
- asm#"2", ".8h", ".16b", ".16b", []>;
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))),
+ (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm)))))]>;
let Predicates = [HasAES] in {
def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc,
V128, V64, V64,
@@ -7374,10 +7381,6 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
[(set (v16i8 V128:$Rd), (OpNode (extract_high_v2i64 (v2i64 V128:$Rn)),
(extract_high_v2i64 (v2i64 V128:$Rm))))]>;
}
-
- def : Pat<(v8i16 (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))),
- (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))),
- (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
}
multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
@@ -7402,87 +7405,7 @@ multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
(extract_high_v4i32 (v4i32 V128:$Rm))))]>;
}
-multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
- SDPatternOperator OpNode = null_frag> {
- def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
- V128, V64, V64,
- asm, ".8h", ".8b", ".8b",
- [(set (v8i16 V128:$Rd),
- (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>;
- def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
- V128, V128, V128,
- asm#"2", ".8h", ".16b", ".16b",
- [(set (v8i16 V128:$Rd),
- (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)),
- (extract_high_v16i8 (v16i8 V128:$Rm))))))]>;
- def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
- V128, V64, V64,
- asm, ".4s", ".4h", ".4h",
- [(set (v4i32 V128:$Rd),
- (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>;
- def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
- V128, V128, V128,
- asm#"2", ".4s", ".8h", ".8h",
- [(set (v4i32 V128:$Rd),
- (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)),
- (extract_high_v8i16 (v8i16 V128:$Rm))))))]>;
- def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
- V128, V64, V64,
- asm, ".2d", ".2s", ".2s",
- [(set (v2i64 V128:$Rd),
- (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>;
- def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
- V128, V128, V128,
- asm#"2", ".2d", ".4s", ".4s",
- [(set (v2i64 V128:$Rd),
- (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)),
- (extract_high_v4i32 (v4i32 V128:$Rm))))))]>;
-}
-
-multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
- string asm,
- SDPatternOperator OpNode> {
- def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
- V128, V64, V64,
- asm, ".8h", ".8b", ".8b",
- [(set (v8i16 V128:$dst),
- (add (v8i16 V128:$Rd),
- (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>;
- def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
- V128, V128, V128,
- asm#"2", ".8h", ".16b", ".16b",
- [(set (v8i16 V128:$dst),
- (add (v8i16 V128:$Rd),
- (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)),
- (extract_high_v16i8 (v16i8 V128:$Rm)))))))]>;
- def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
- V128, V64, V64,
- asm, ".4s", ".4h", ".4h",
- [(set (v4i32 V128:$dst),
- (add (v4i32 V128:$Rd),
- (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>;
- def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
- V128, V128, V128,
- asm#"2", ".4s", ".8h", ".8h",
- [(set (v4i32 V128:$dst),
- (add (v4i32 V128:$Rd),
- (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)),
- (extract_high_v8i16 (v8i16 V128:$Rm)))))))]>;
- def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
- V128, V64, V64,
- asm, ".2d", ".2s", ".2s",
- [(set (v2i64 V128:$dst),
- (add (v2i64 V128:$Rd),
- (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>;
- def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
- V128, V128, V128,
- asm#"2", ".2d", ".4s", ".4s",
- [(set (v2i64 V128:$dst),
- (add (v2i64 V128:$Rd),
- (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)),
- (extract_high_v4i32 (v4i32 V128:$Rm)))))))]>;
-}
-
+let isCommutable = 1 in
multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index fb59c9f..d15f90d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,7 +20,9 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -83,6 +85,11 @@ static cl::opt<unsigned>
BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
cl::desc("Restrict range of B instructions (DEBUG)"));
+static cl::opt<unsigned> GatherOptSearchLimit(
+ "aarch64-search-limit", cl::Hidden, cl::init(2048),
+ cl::desc("Restrict range of instructions to search for the "
+ "machine-combiner gather pattern optimization"));
+
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
: AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
AArch64::CATCHRET),
@@ -5078,8 +5085,13 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
MCRegister DestRegX = TRI->getMatchingSuperReg(
DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
- MCRegister SrcRegX = TRI->getMatchingSuperReg(
- SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
+ assert(DestRegX.isValid() && "Destination super-reg not valid");
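+ // WZR has no matching 64-bit super-register in GPR64sp (the class contains
+ // SP rather than XZR), so map WZR directly to XZR.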
+ MCRegister SrcRegX =
+ SrcReg == AArch64::WZR
+ ? AArch64::XZR
+ : TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
+ &AArch64::GPR64spRegClass);
+ assert(SrcRegX.isValid() && "Source super-reg not valid");
// This instruction is reading and writing X registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
// that we are reading an undefined value from SrcRegX, but a proper
@@ -5920,7 +5932,7 @@ static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
// Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
SmallString<64> Expr;
unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
- assert(DwarfReg >= 0 && DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
+ assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
// Reg + NumBytes
Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
@@ -7412,11 +7424,319 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
+/// Check if the given instruction forms a gather load pattern that can be
+/// optimized for better Memory-Level Parallelism (MLP). This function
+/// identifies chains of NEON lane load instructions that load data from
+/// different memory addresses into individual lanes of a 128-bit vector
+/// register, then attempts to split the pattern into parallel loads to break
+/// the serial dependency between instructions.
+///
+/// Pattern Matched:
+/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
+/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
+///
+/// Transformed Into:
+/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
+/// to combine the results, enabling better memory-level parallelism.
+///
+/// Supported Element Types:
+/// - 32-bit elements (LD1i32, 4 lanes total)
+/// - 16-bit elements (LD1i16, 8 lanes total)
+/// - 8-bit elements (LD1i8, 16 lanes total)
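+///
+/// Example (illustrative only; register and pointer names below are
+/// placeholders, shown for the 32-bit / 4-lane case):
+///   %s0 = <32-bit scalar load> [ptrA]
+///   %q0 = SUBREG_TO_REG 0, %s0, ssub
+///   %q1 = LD1i32 %q0, 1, [ptrB]
+///   %q2 = LD1i32 %q1, 2, [ptrC]
+///   %q3 = LD1i32 %q2, 3, [ptrD]   ; Root
+/// is rewritten into two independent two-lane chains combined at the end:
+///   %a = LD1i32 (SUBREG_TO_REG 0, %s0, ssub), 1, [ptrB]
+///   %b = LD1i32 (SUBREG_TO_REG 0, (LDRSui [ptrC], 0), ssub), 1, [ptrD]
+///   %q3 = ZIP1v2i64 %a, %b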
+static bool getGatherLanePattern(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns,
+ unsigned LoadLaneOpCode, unsigned NumLanes) {
+ const MachineFunction *MF = Root.getMF();
+
+ // Early exit if optimizing for size.
+ if (MF->getFunction().hasMinSize())
+ return false;
+
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+
+ // The root of the pattern must load into the last lane of the vector.
+ if (Root.getOperand(2).getImm() != NumLanes - 1)
+ return false;
+
+ // Check that we have a load into all lanes except lane 0.
+ // For each load we also want to check that:
+ // 1. It has a single non-debug use (since we will be replacing the virtual
+ // register), and
+ // 2. The addressing mode only uses a single pointer operand.
+ auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+ SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
+ SmallVector<const MachineInstr *, 16> LoadInstrs;
+ while (!RemainingLanes.empty() && CurrInstr &&
+ CurrInstr->getOpcode() == LoadLaneOpCode &&
+ MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
+ CurrInstr->getNumOperands() == 4) {
+ RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+ LoadInstrs.push_back(CurrInstr);
+ CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+ }
+
+ // Check that we have found a match for lanes N-1 down to 1.
+ if (!RemainingLanes.empty())
+ return false;
+
+ // Match the SUBREG_TO_REG sequence.
+ if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+ return false;
+
+ // Verify that the SUBREG_TO_REG loads an integer into the first lane.
+ auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+ unsigned SingleLaneSizeInBits = 128 / NumLanes;
+ if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
+ return false;
+
+ // Verify that it also has a single non-debug use.
+ if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+ return false;
+
+ LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));
+
+ // If there is any chance of aliasing, do not apply the pattern.
+ // Walk backward through the MBB starting from Root.
+ // Exit early if we've encountered all load instructions or hit the search
+ // limit.
+ auto MBBItr = Root.getIterator();
+ unsigned RemainingSteps = GatherOptSearchLimit;
+ SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
+ RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
+ const MachineBasicBlock *MBB = Root.getParent();
+
+ for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
+ !RemainingLoadInstrs.empty();
+ --MBBItr, --RemainingSteps) {
+ const MachineInstr &CurrInstr = *MBBItr;
+
+ // Remove this instruction from remaining loads if it's one we're tracking.
+ RemainingLoadInstrs.erase(&CurrInstr);
+
+ // Check for potential aliasing with any of the load instructions to
+ // optimize.
+ if (CurrInstr.isLoadFoldBarrier())
+ return false;
+ }
+
+ // If we hit the search limit without finding all load instructions,
+ // don't match the pattern.
+ if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
+ return false;
+
+ switch (NumLanes) {
+ case 4:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
+ break;
+ case 8:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
+ break;
+ case 16:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
+ break;
+ default:
+ llvm_unreachable("Got bad number of lanes for gather pattern.");
+ }
+
+ return true;
+}
+
+/// Search for patterns of LD instructions we can optimize.
+static bool getLoadPatterns(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns) {
+
+ // The pattern searches for loads into single lanes.
+ switch (Root.getOpcode()) {
+ case AArch64::LD1i32:
+ return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4);
+ case AArch64::LD1i16:
+ return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8);
+ case AArch64::LD1i8:
+ return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16);
+ default:
+ return false;
+ }
+}
+
+/// Generate optimized instruction sequence for gather load patterns to improve
+/// Memory-Level Parallelism (MLP). This function transforms a chain of
+/// sequential NEON lane loads into parallel vector loads that can execute
+/// concurrently.
+static void
+generateGatherLanePattern(MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<Register, unsigned> &InstrIdxForVirtReg,
+ unsigned Pattern, unsigned NumLanes) {
+ MachineFunction &MF = *Root.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+ // Gather the initial load instructions to build the pattern.
+ SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
+ MachineInstr *CurrInstr = &Root;
+ for (unsigned i = 0; i < NumLanes - 1; ++i) {
+ LoadToLaneInstrs.push_back(CurrInstr);
+ CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+ }
+
+ // Sort the load instructions according to the lane.
+ llvm::sort(LoadToLaneInstrs,
+ [](const MachineInstr *A, const MachineInstr *B) {
+ return A->getOperand(2).getImm() > B->getOperand(2).getImm();
+ });
+
+ MachineInstr *SubregToReg = CurrInstr;
+ LoadToLaneInstrs.push_back(
+ MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
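+ // Reversing yields the loads ordered from the lane-0 scalar load up to the
+ // lane N-1 load (the Root).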
+ auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
+
+ const TargetRegisterClass *FPR128RegClass =
+ MRI.getRegClass(Root.getOperand(0).getReg());
+
+ // Helper lambda to create a LD1 instruction.
+ auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
+ Register SrcRegister, unsigned Lane,
+ Register OffsetRegister,
+ bool OffsetRegisterKillState) {
+ auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+ MachineInstrBuilder LoadIndexIntoRegister =
+ BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+ NewRegister)
+ .addReg(SrcRegister)
+ .addImm(Lane)
+ .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState));
+ InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+ InsInstrs.push_back(LoadIndexIntoRegister);
+ return NewRegister;
+ };
+
+ // Helper to create load instruction based on the NumLanes in the NEON
+ // register we are rewriting.
+ auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg,
+ Register OffsetReg,
+ bool KillState) -> MachineInstrBuilder {
+ unsigned Opcode;
+ switch (NumLanes) {
+ case 4:
+ Opcode = AArch64::LDRSui;
+ break;
+ case 8:
+ Opcode = AArch64::LDRHui;
+ break;
+ case 16:
+ Opcode = AArch64::LDRBui;
+ break;
+ default:
+ llvm_unreachable(
+ "Got unsupported number of lanes in machine-combiner gather pattern");
+ }
+ // Immediate offset load
+ return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
+ .addReg(OffsetReg)
+ .addImm(0);
+ };
+
+ // Load the remaining lanes into register 0.
+ auto LanesToLoadToReg0 =
+ llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
+ LoadToLaneInstrsAscending.begin() + NumLanes / 2);
+ Register PrevReg = SubregToReg->getOperand(0).getReg();
+ for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
+ const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
+ PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
+ OffsetRegOperand.getReg(),
+ OffsetRegOperand.isKill());
+ DelInstrs.push_back(LoadInstr);
+ }
+ Register LastLoadReg0 = PrevReg;
+
+ // First load into register 1. Perform an integer load to zero out the upper
+ // lanes in a single instruction.
+ MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
+ MachineInstr *OriginalSplitLoad =
+ *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
+ Register DestRegForMiddleIndex = MRI.createVirtualRegister(
+ MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
+
+ const MachineOperand &OriginalSplitToLoadOffsetOperand =
+ OriginalSplitLoad->getOperand(3);
+ MachineInstrBuilder MiddleIndexLoadInstr =
+ CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
+ OriginalSplitToLoadOffsetOperand.getReg(),
+ OriginalSplitToLoadOffsetOperand.isKill());
+
+ InstrIdxForVirtReg.insert(
+ std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
+ InsInstrs.push_back(MiddleIndexLoadInstr);
+ DelInstrs.push_back(OriginalSplitLoad);
+
+ // Subreg To Reg instruction for register 1.
+ Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
+ unsigned SubregType;
+ switch (NumLanes) {
+ case 4:
+ SubregType = AArch64::ssub;
+ break;
+ case 8:
+ SubregType = AArch64::hsub;
+ break;
+ case 16:
+ SubregType = AArch64::bsub;
+ break;
+ default:
+ llvm_unreachable(
+ "Got invalid NumLanes for machine-combiner gather pattern");
+ }
+
+ auto SubRegToRegInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
+ DestRegForSubregToReg)
+ .addImm(0)
+ .addReg(DestRegForMiddleIndex, getKillRegState(true))
+ .addImm(SubregType);
+ InstrIdxForVirtReg.insert(
+ std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
+ InsInstrs.push_back(SubRegToRegInstr);
+
+ // Load remaining lanes into register 1.
+ auto LanesToLoadToReg1 =
+ llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
+ LoadToLaneInstrsAscending.end());
+ PrevReg = SubRegToRegInstr->getOperand(0).getReg();
+ for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
+ const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
+ PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
+ OffsetRegOperand.getReg(),
+ OffsetRegOperand.isKill());
+
+ // Do not add the last reg to DelInstrs - it will be removed later.
+ if (Index == NumLanes / 2 - 2) {
+ break;
+ }
+ DelInstrs.push_back(LoadInstr);
+ }
+ Register LastLoadReg1 = PrevReg;
+
+ // Create the final zip instruction to combine the results.
+ MachineInstrBuilder ZipInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+ Root.getOperand(0).getReg())
+ .addReg(LastLoadReg0)
+ .addReg(LastLoadReg1);
+ InsInstrs.push_back(ZipInstr);
+}
+
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7446,6 +7766,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMiscPatterns(Root, Patterns))
return true;
+ // Load patterns
+ if (getLoadPatterns(Root, Patterns))
+ return true;
+
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);
}
@@ -8701,6 +9025,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
+ generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 4);
+ break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
+ generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 8);
+ break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
+ generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 16);
+ break;
+ }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 7c255da..70c814a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned {
FMULv8i16_indexed_OP2,
FNMADD,
+
+ GATHER_LANE_i32,
+ GATHER_LANE_i16,
+ GATHER_LANE_i8
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
@@ -820,7 +824,8 @@ enum DestructiveInstType {
DestructiveBinaryComm = TSFLAG_DESTRUCTIVE_INST_TYPE(0x6),
DestructiveBinaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x7),
DestructiveTernaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x8),
- DestructiveUnaryPassthru = TSFLAG_DESTRUCTIVE_INST_TYPE(0x9),
+ Destructive2xRegImmUnpred = TSFLAG_DESTRUCTIVE_INST_TYPE(0x9),
+ DestructiveUnaryPassthru = TSFLAG_DESTRUCTIVE_INST_TYPE(0xa),
};
enum FalseLaneType {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ac31236..4fa91a4 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5707,27 +5707,6 @@ let Predicates = [HasFullFP16] in {
// Advanced SIMD two vector instructions.
//===----------------------------------------------------------------------===//
-defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", abdu>;
-// Match UABDL in log2-shuffle patterns.
-def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)),
- (zext (v8i8 V64:$opB))))),
- (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
-def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))),
- (zext (extract_high_v16i8 (v16i8 V128:$opB)))))),
- (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
-def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)),
- (zext (v4i16 V64:$opB))))),
- (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
-def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 (v8i16 V128:$opA))),
- (zext (extract_high_v8i16 (v8i16 V128:$opB)))))),
- (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
-def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)),
- (zext (v2i32 V64:$opB))))),
- (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
-def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 (v4i32 V128:$opA))),
- (zext (extract_high_v4i32 (v4i32 V128:$opB)))))),
- (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;
-
defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>;
defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
@@ -6055,6 +6034,7 @@ defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>;
defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>;
defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
+let isCommutable = 1 in
defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
TriOpFrag<(add node:$LHS, (abds node:$MHS, node:$RHS))> >;
@@ -6802,40 +6782,47 @@ def : Pat <(f64 (uint_to_fp (i32
// Advanced SIMD three different-sized vector instructions.
//===----------------------------------------------------------------------===//
-defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>;
-defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
-defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
-defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
-defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>;
-defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>;
-defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>;
+defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>;
+defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
+defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
+defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
+let isCommutable = 1 in
+defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>;
+defm SABAL : SIMDLongThreeVectorTiedBHS<0,0b0101,"sabal",
+ TriOpFrag<(add node:$LHS, (zext (abds node:$MHS, node:$RHS)))>>;
+defm SABDL : SIMDLongThreeVectorBHS<0, 0b0111, "sabdl",
+ BinOpFrag<(zext (abds node:$LHS, node:$RHS))>>;
defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
- BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
+ BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
- TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
+ TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
- TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
+ TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", AArch64smull>;
defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", saddsat>;
defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", ssubsat>;
-defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
- int_aarch64_neon_sqdmull>;
+defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", int_aarch64_neon_sqdmull>;
+let isCommutable = 0 in
defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
-defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", abdu>;
+defm UABAL : SIMDLongThreeVectorTiedBHS<1, 0b0101, "uabal",
+ TriOpFrag<(add node:$LHS, (zext (abdu node:$MHS, node:$RHS)))>>;
+defm UABDL : SIMDLongThreeVectorBHS<1, 0b0111, "uabdl",
+ BinOpFrag<(zext (abdu node:$LHS, node:$RHS))>>;
defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>;
defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>;
defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
- TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
+ TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
- TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
+ TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>;
+let isCommutable = 0 in
defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>;
defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 782d62a7..e69fa32 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1193,7 +1193,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
// USE kill %w1 ; need to clear kill flag when moving STRWui downwards
// STRW %w0
Register Reg = getLdStRegOp(*I).getReg();
- for (MachineInstr &MI : make_range(std::next(I), Paired))
+ for (MachineInstr &MI :
+ make_range(std::next(I->getIterator()), Paired->getIterator()))
MI.clearRegisterKills(Reg, TRI);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 800787c..ed3374a 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -213,9 +213,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// or return type
bool IsSVECC = false;
- /// The frame-index for the TPIDR2 object used for lazy saves.
- TPIDR2Object TPIDR2;
-
/// Whether this function changes streaming mode within the function.
bool HasStreamingModeChanges = false;
@@ -231,13 +228,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// on function entry to record the initial pstate of a function.
Register PStateSMReg = MCRegister::NoRegister;
- // Holds a pointer to a buffer that is large enough to represent
- // all SME ZA state and any additional state required by the
- // __arm_sme_save/restore support routines.
- Register SMESaveBufferAddr = MCRegister::NoRegister;
-
- // true if SMESaveBufferAddr is used.
- bool SMESaveBufferUsed = false;
+ // true if PStateSMReg is used.
+ bool PStateSMRegUsed = false;
// Has the PNReg used to build PTRUE instruction.
// The PTRUE is used for the LD/ST of ZReg pairs in save and restore.
@@ -250,6 +242,16 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// Holds the SME function attributes (streaming mode, ZA/ZT0 state).
SMEAttrs SMEFnAttrs;
+ // Note: The following properties are only used for the old SME ABI lowering:
+ /// The frame-index for the TPIDR2 object used for lazy saves.
+ TPIDR2Object TPIDR2;
+ // Holds a pointer to a buffer that is large enough to represent
+ // all SME ZA state and any additional state required by the
+ // __arm_sme_save/restore support routines.
+ Register SMESaveBufferAddr = MCRegister::NoRegister;
+ // true if SMESaveBufferAddr is used.
+ bool SMESaveBufferUsed = false;
+
public:
AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
@@ -258,6 +260,13 @@ public:
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
const override;
+ // Old SME ABI lowering state getters/setters:
+ Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; };
+ void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; };
+ unsigned isSMESaveBufferUsed() const { return SMESaveBufferUsed; };
+ void setSMESaveBufferUsed(bool Used = true) { SMESaveBufferUsed = Used; };
+ TPIDR2Object &getTPIDR2Obj() { return TPIDR2; }
+
void setPredicateRegForFillSpill(unsigned Reg) {
PredicateRegForFillSpill = Reg;
}
@@ -265,15 +274,12 @@ public:
return PredicateRegForFillSpill;
}
- Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; };
- void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; };
-
- unsigned isSMESaveBufferUsed() const { return SMESaveBufferUsed; };
- void setSMESaveBufferUsed(bool Used = true) { SMESaveBufferUsed = Used; };
-
Register getPStateSMReg() const { return PStateSMReg; };
void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
+ unsigned isPStateSMRegUsed() const { return PStateSMRegUsed; };
+ void setPStateSMRegUsed(bool Used = true) { PStateSMRegUsed = Used; };
+
int64_t getVGIdx() const { return VGIdx; };
void setVGIdx(unsigned Idx) { VGIdx = Idx; };
@@ -283,8 +289,6 @@ public:
bool isSVECC() const { return IsSVECC; };
void setIsSVECC(bool s) { IsSVECC = s; };
- TPIDR2Object &getTPIDR2Obj() { return TPIDR2; }
-
void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 1bc1d98..42eaeca 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -321,7 +321,6 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
FeatureFuseAES, FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCRegMoveFPR64,
FeatureZCZeroing,
FeatureZCZeroingFPWorkaround]>;
@@ -335,7 +334,6 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
@@ -348,7 +346,6 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
@@ -361,7 +358,6 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
@@ -374,7 +370,6 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
@@ -392,7 +387,6 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
@@ -410,7 +404,6 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
@@ -428,7 +421,6 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
@@ -446,7 +438,6 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
@@ -463,7 +454,6 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureZCRegMoveGPR64,
- FeatureZCRegMoveFPR64,
FeatureZCZeroing
]>;
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index db27ca9..5c4e0c1 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -39,12 +39,25 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64EntryPStateSM
+ : SDNode<"AArch64ISD::ENTRY_PSTATE_SM", SDTypeProfile<1, 0,
+ [SDTCisInt<0>]>, [SDNPHasChain, SDNPSideEffect]>;
+
+let usesCustomInserter = 1 in {
+ def EntryPStateSM : Pseudo<(outs GPR64:$is_streaming), (ins), []>, Sched<[]> {}
+}
+def : Pat<(i64 (AArch64EntryPStateSM)), (EntryPStateSM)>;
+
def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
[SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
[SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+//===----------------------------------------------------------------------===//
+// Old SME ABI lowering ISD nodes/pseudos (deprecated)
+//===----------------------------------------------------------------------===//
+
def AArch64AllocateZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1,
[SDTCisInt<0>, SDTCisInt<1>]>,
[SDNPHasChain, SDNPSideEffect]>;
@@ -78,6 +91,30 @@ def : Pat<(i64 (AArch64AllocateSMESaveBuffer GPR64:$size)),
(AllocateSMESaveBuffer $size)>;
//===----------------------------------------------------------------------===//
+// New SME ABI lowering ISD nodes/pseudos (-aarch64-new-sme-abi)
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 1, isMeta = 1 in {
+ def InOutZAUsePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+ def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+}
+
+def CommitZASavePseudo
+ : Pseudo<(outs),
+ (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>,
+ Sched<[]>;
+
+def AArch64_inout_za_use
+ : SDNode<"AArch64ISD::INOUT_ZA_USE", SDTypeProfile<0, 0,[]>,
+ [SDNPHasChain, SDNPInGlue]>;
+def : Pat<(AArch64_inout_za_use), (InOutZAUsePseudo)>;
+
+def AArch64_requires_za_save
+ : SDNode<"AArch64ISD::REQUIRES_ZA_SAVE", SDTypeProfile<0, 0,[]>,
+ [SDNPHasChain, SDNPInGlue]>;
+def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
+
+//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 0c4b4f4..509dd8b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1021,7 +1021,9 @@ let Predicates = [HasNonStreamingSVE_or_SME2p2] in {
let Predicates = [HasSVE_or_SME] in {
defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>;
defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
- defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;
+ defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext, "EXT_ZZI_CONSTRUCTIVE">;
+
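+ // Constructive pseudo for EXT, selected when both source operands are the
+ // same register (see the vector_splice patterns further down); unlike
+ // EXT_ZZI it does not tie the destination to a source register.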
+ def EXT_ZZI_CONSTRUCTIVE : UnpredRegImmPseudo<ZPR8, imm0_255>;
defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", AArch64rbit_mt>;
defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", AArch64revb_mt>;
@@ -2131,21 +2133,37 @@ let Predicates = [HasSVE_or_SME] in {
(LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>;
// Splice with lane bigger or equal to 0
- foreach VT = [nxv16i8] in
+ foreach VT = [nxv16i8] in {
def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_255 i32:$index)))),
(EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+ let AddedComplexity = 1 in
+ def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_255 i32:$index)))),
+ (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>;
+ }
- foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in
+ foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in {
def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_127 i32:$index)))),
(EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+ let AddedComplexity = 1 in
+ def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_127 i32:$index)))),
+ (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>;
+ }
- foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in
+ foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in {
def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_63 i32:$index)))),
(EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+ let AddedComplexity = 1 in
+ def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_63 i32:$index)))),
+ (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>;
+ }
- foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in
+ foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in {
def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_31 i32:$index)))),
(EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+ let AddedComplexity = 1 in
+ def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_31 i32:$index)))),
+ (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>;
+ }
defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA320.td b/llvm/lib/Target/AArch64/AArch64SchedA320.td
index 89ed1338..5ec95c7 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA320.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA320.td
@@ -847,7 +847,7 @@ def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^[SU]XTB_ZPmZ
"^[SU]XTW_ZPmZ_[D]")>;
// Extract
-def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_B)>;
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>;
// Extract narrow saturating
def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td
index 9456878..356e3fa 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA510.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td
@@ -825,7 +825,7 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]XTB_ZPmZ
"^[SU]XTW_ZPmZ_[D]")>;
// Extract
-def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_B)>;
+def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>;
// Extract narrow saturating
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
@@ -1016,7 +1016,7 @@ def : InstRW<[CortexA510MCWrite<16, 13, CortexA510UnitVALU>], (instrs FADDA_VPZ_
def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU>], (instrs FADDA_VPZ_D)>;
// Floating point compare
-def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]",
+def : InstRW<[CortexA510MCWrite<4, 2, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]",
"^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
"^FCM(LE|LT)_PPzZ0_[HSD]",
"^FCMUO_PPzZZ_[HSD]")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
index 91a7079..e798222 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -1785,7 +1785,7 @@ def : InstRW<[N2Write_2c_1V1], (instregex "^[SU]XTB_ZPmZ_[HSD]",
"^[SU]XTW_ZPmZ_[D]")>;
// Extract
-def : InstRW<[N2Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_B)>;
+def : InstRW<[N2Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>;
// Extract narrow saturating
def : InstRW<[N2Write_4c_1V1], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]$",
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
index ecfb124..e44d40f 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
@@ -1757,7 +1757,7 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU]XTB_ZPmZ_[HSD]",
"^[SU]XTW_ZPmZ_[D]")>;
// Extract
-def : InstRW<[N3Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_B)>;
+def : InstRW<[N3Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>;
// Extract narrow saturating
def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]$",
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
index 3686654..44625a2 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
@@ -1575,7 +1575,7 @@ def : InstRW<[V1Write_2c_1V1], (instregex "^[SU]XTB_ZPmZ_[HSD]",
"^[SU]XTW_ZPmZ_[D]")>;
// Extract
-def : InstRW<[V1Write_2c_1V01], (instrs EXT_ZZI)>;
+def : InstRW<[V1Write_2c_1V01], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE)>;
// Extract/insert operation, SIMD and FP scalar form
def : InstRW<[V1Write_3c_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]$",
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index b2c3da0..6261220 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -2272,7 +2272,7 @@ def : InstRW<[V2Write_2c_1V13], (instregex "^[SU]XTB_ZPmZ_[HSD]",
"^[SU]XTW_ZPmZ_[D]")>;
// Extract
-def : InstRW<[V2Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_B)>;
+def : InstRW<[V2Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>;
// Extract narrow saturating
def : InstRW<[V2Write_4c_1V13], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 8a5b5ba..d3b1aa6 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -182,37 +182,25 @@ SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall(
const AArch64Subtarget &STI =
DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
const AArch64TargetLowering *TLI = STI.getTargetLowering();
- TargetLowering::ArgListEntry DstEntry;
- DstEntry.Ty = PointerType::getUnqual(*DAG.getContext());
- DstEntry.Node = Dst;
TargetLowering::ArgListTy Args;
- Args.push_back(DstEntry);
+ Args.emplace_back(Dst, PointerType::getUnqual(*DAG.getContext()));
RTLIB::Libcall NewLC;
switch (LC) {
case RTLIB::MEMCPY: {
NewLC = RTLIB::SC_MEMCPY;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = PointerType::getUnqual(*DAG.getContext());
- Entry.Node = Src;
- Args.push_back(Entry);
+ Args.emplace_back(Src, PointerType::getUnqual(*DAG.getContext()));
break;
}
case RTLIB::MEMMOVE: {
NewLC = RTLIB::SC_MEMMOVE;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = PointerType::getUnqual(*DAG.getContext());
- Entry.Node = Src;
- Args.push_back(Entry);
+ Args.emplace_back(Src, PointerType::getUnqual(*DAG.getContext()));
break;
}
case RTLIB::MEMSET: {
NewLC = RTLIB::SC_MEMSET;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = Type::getInt32Ty(*DAG.getContext());
- Src = DAG.getZExtOrTrunc(Src, DL, MVT::i32);
- Entry.Node = Src;
- Args.push_back(Entry);
+ Args.emplace_back(DAG.getZExtOrTrunc(Src, DL, MVT::i32),
+ Type::getInt32Ty(*DAG.getContext()));
break;
}
default:
@@ -221,10 +209,7 @@ SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall(
EVT PointerVT = TLI->getPointerTy(DAG.getDataLayout());
SDValue Symbol = DAG.getExternalSymbol(TLI->getLibcallName(NewLC), PointerVT);
- TargetLowering::ArgListEntry SizeEntry;
- SizeEntry.Node = Size;
- SizeEntry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
- Args.push_back(SizeEntry);
+ Args.emplace_back(Size, DAG.getDataLayout().getIntPtrType(*DAG.getContext()));
TargetLowering::CallLoweringInfo CLI(DAG);
PointerType *RetTy = PointerType::getUnqual(*DAG.getContext());
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index f136a184..a67bd42 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -585,8 +585,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
ClMaxLifetimes);
if (StandardLifetime) {
IntrinsicInst *Start = Info.LifetimeStart[0];
- uint64_t Size =
- cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue();
+ uint64_t Size = *Info.AI->getAllocationSize(*DL);
Size = alignTo(Size, kTagGranuleSize);
tagAlloca(AI, Start->getNextNode(), TagPCall, Size);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 95eab16..e67bd58 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -224,6 +224,11 @@ static cl::opt<bool>
cl::desc("Enable Machine Pipeliner for AArch64"),
cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ EnableNewSMEABILowering("aarch64-new-sme-abi",
+ cl::desc("Enable new lowering for the SME ABI"),
+ cl::init(false), cl::Hidden);
+
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeAArch64Target() {
// Register the target.
@@ -263,6 +268,7 @@ LLVMInitializeAArch64Target() {
initializeLDTLSCleanupPass(PR);
initializeKCFIPass(PR);
initializeSMEABIPass(PR);
+ initializeMachineSMEABIPass(PR);
initializeSMEPeepholeOptPass(PR);
initializeSVEIntrinsicOptsPass(PR);
initializeAArch64SpeculationHardeningPass(PR);
@@ -367,7 +373,8 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
computeDefaultCPU(TT, CPU), FS, Options,
getEffectiveRelocModel(TT, RM),
getEffectiveAArch64CodeModel(TT, CM, JIT), OL),
- TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) {
+ TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian),
+ UseNewSMEABILowering(EnableNewSMEABILowering) {
initAsmInfo();
if (TT.isOSBinFormatMachO()) {
@@ -668,10 +675,12 @@ void AArch64PassConfig::addIRPasses() {
addPass(createInterleavedAccessPass());
}
- // Expand any functions marked with SME attributes which require special
- // changes for the calling convention or that require the lazy-saving
- // mechanism specified in the SME ABI.
- addPass(createSMEABIPass());
+ if (!EnableNewSMEABILowering) {
+ // Expand any functions marked with SME attributes which require special
+ // changes for the calling convention or that require the lazy-saving
+ // mechanism specified in the SME ABI.
+ addPass(createSMEABIPass());
+ }
// Add Control Flow Guard checks.
if (TM->getTargetTriple().isOSWindows()) {
@@ -782,6 +791,9 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
}
void AArch64PassConfig::addMachineSSAOptimization() {
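+ // With the new SME ABI lowering enabled, run the MachineSMEABI pass here
+ // when optimizing; at -O0 it is added in addPreRegAlloc instead.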
+ if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None)
+ addPass(createMachineSMEABIPass());
+
if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt)
addPass(createSMEPeepholeOptPass());
@@ -812,6 +824,9 @@ bool AArch64PassConfig::addILPOpts() {
}
void AArch64PassConfig::addPreRegAlloc() {
+ if (TM->getOptLevel() == CodeGenOptLevel::None && EnableNewSMEABILowering)
+ addPass(createMachineSMEABIPass());
+
// Change dead register definitions to refer to the zero register.
if (TM->getOptLevel() != CodeGenOptLevel::None &&
EnableDeadRegisterElimination)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index b9e522d..0dd5d95 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -79,8 +79,12 @@ public:
size_t clearLinkerOptimizationHints(
const SmallPtrSetImpl<MachineInstr *> &MIs) const override;
+ /// Returns true if the new SME ABI lowering should be used.
+ bool useNewSMEABILowering() const { return UseNewSMEABILowering; }
+
private:
bool isLittle;
+ bool UseNewSMEABILowering;
};
// AArch64 little endian target machine.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9f05add..b021968 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -220,20 +220,17 @@ static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
"enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
-static bool isSMEABIRoutineCall(const CallInst &CI) {
+static bool isSMEABIRoutineCall(const CallInst &CI,
+ const AArch64TargetLowering &TLI) {
const auto *F = CI.getCalledFunction();
- return F && StringSwitch<bool>(F->getName())
- .Case("__arm_sme_state", true)
- .Case("__arm_tpidr2_save", true)
- .Case("__arm_tpidr2_restore", true)
- .Case("__arm_za_disable", true)
- .Default(false);
+ return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine();
}
/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
-static bool hasPossibleIncompatibleOps(const Function *F) {
+static bool hasPossibleIncompatibleOps(const Function *F,
+ const AArch64TargetLowering &TLI) {
for (const BasicBlock &BB : *F) {
for (const Instruction &I : BB) {
// Be conservative for now and assume that any call to inline asm or to
@@ -242,7 +239,7 @@ static bool hasPossibleIncompatibleOps(const Function *F) {
// all native LLVM instructions can be lowered to compatible instructions.
if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
(cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
- isSMEABIRoutineCall(cast<CallInst>(I))))
+ isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
return true;
}
}
@@ -290,7 +287,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
CallAttrs.requiresPreservingZT0() ||
CallAttrs.requiresPreservingAllZAState()) {
- if (hasPossibleIncompatibleOps(Callee))
+ if (hasPossibleIncompatibleOps(Callee, *getTLI()))
return false;
}
@@ -357,7 +354,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
// change only once and avoid inlining of G into F.
SMEAttrs FAttrs(*F);
- SMECallAttrs CallAttrs(Call);
+ SMECallAttrs CallAttrs(Call, getTLI());
if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
if (F == Call.getCaller()) // (1)
@@ -554,7 +551,17 @@ static bool isUnpackedVectorVT(EVT VecVT) {
VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}
-static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
+static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
+ const IntrinsicCostAttributes &ICA) {
+ // We need to know at least the number of elements in the vector of buckets
+ // and the size of each element to update.
+ if (ICA.getArgTypes().size() < 2)
+ return InstructionCost::getInvalid();
+
+ // We are only interested in costing the SVE2 hardware histogram instruction.
+ if (!ST->hasSVE2())
+ return InstructionCost::getInvalid();
+
Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
unsigned TotalHistCnts = 1;
@@ -579,9 +586,11 @@ static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
TotalHistCnts = EC / NaturalVectorWidth;
+
+ return InstructionCost(BaseHistCntCost * TotalHistCnts);
}
- return InstructionCost(BaseHistCntCost * TotalHistCnts);
+ return InstructionCost::getInvalid();
}
InstructionCost
@@ -597,10 +606,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return InstructionCost::getInvalid();
switch (ICA.getID()) {
- case Intrinsic::experimental_vector_histogram_add:
- if (!ST->hasSVE2())
- return InstructionCost::getInvalid();
- return getHistogramCost(ICA);
+ case Intrinsic::experimental_vector_histogram_add: {
+ InstructionCost HistCost = getHistogramCost(ST, ICA);
+ // If the cost isn't valid, we may still be able to scalarize
+ if (HistCost.isValid())
+ return HistCost;
+ break;
+ }
case Intrinsic::umin:
case Intrinsic::umax:
case Intrinsic::smin:
@@ -651,6 +663,16 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return LT.first;
break;
}
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd: {
+ // Cost an fma or fmuladd the same as an fmul instruction, as they usually
+ // have the same cost. TODO: Add fp16 and bf16 expansion costs.
+ Type *EltTy = RetTy->getScalarType();
+ if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
+ (EltTy->isHalfTy() && ST->hasFullFP16()))
+ return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
+ break;
+ }
case Intrinsic::stepvector: {
InstructionCost Cost = 1; // Cost of the `index' instruction
auto LT = getTypeLegalizationCost(RetTy);
@@ -3961,6 +3983,24 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
}
+InstructionCost
+AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+ TTI::TargetCostKind CostKind,
+ unsigned Index) const {
+ if (isa<FixedVectorType>(Val))
+ return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
+ Index);
+
+ // This typically requires both while and lastb instructions in order
+ // to extract the last element. If this is in a loop the while
+ // instruction can at least be hoisted out, although it will consume a
+ // predicate register. The cost should be higher than the base extract
+ // cost, which is 2 for most CPUs.
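+ // For example, extracting the last element of an nxv4f32 might lower to
+ // something like (illustrative only):
+ //   whilelo p0.s, xzr, x8    // predicate covering the active lanes
+ //   lastb   s0, p0, z0.s     // take the last active element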
+ return CostKind == TTI::TCK_CodeSize
+ ? 2
+ : ST->getVectorInsertExtractBaseCost() + 1;
+}
+
InstructionCost AArch64TTIImpl::getScalarizationOverhead(
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
TTI::TargetCostKind CostKind, bool ForPoisonSrc,
@@ -3975,6 +4015,27 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead(
return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
}
+std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
+ Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
+ TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
+ std::function<InstructionCost(Type *)> InstCost) const {
+ if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
+ return std::nullopt;
+ if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
+ return std::nullopt;
+
+ Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
+ InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
+ TTI::CastContextHint::None, CostKind);
+ if (!Op1Info.isConstant() && !Op2Info.isConstant())
+ Cost *= 2;
+ Cost += InstCost(PromotedTy);
+ if (IncludeTrunc)
+ Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
+ TTI::CastContextHint::None, CostKind);
+ return Cost;
+}
+
InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
@@ -3997,6 +4058,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ // Increase the cost for half and bfloat types if not architecturally
+ // supported.
+ if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
+ ISD == ISD::FDIV || ISD == ISD::FREM)
+ if (auto PromotedCost = getFP16BF16PromoteCost(
+ Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
+ [&](Type *PromotedTy) {
+ return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
+ Op1Info, Op2Info);
+ }))
+ return *PromotedCost;
+
switch (ISD) {
default:
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
@@ -4265,11 +4338,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
[[fallthrough]];
case ISD::FADD:
case ISD::FSUB:
- // Increase the cost for half and bfloat types if not architecturally
- // supported.
- if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
- (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
- return 2 * LT.first;
if (!Ty->getScalarType()->isFP128Ty())
return LT.first;
[[fallthrough]];
@@ -4293,8 +4361,9 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
}
InstructionCost
-AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
- const SCEV *Ptr) const {
+AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+ const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -4302,7 +4371,7 @@ AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
int MaxMergeDistance = 64;
- if (Ty->isVectorTy() && SE &&
+ if (PtrTy->isVectorTy() && SE &&
!BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
return NumVectorInstToHideOverhead;
@@ -4371,25 +4440,21 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
}
if (Opcode == Instruction::FCmp) {
- // Without dedicated instructions we promote f16 + bf16 compares to f32.
- if ((!ST->hasFullFP16() && ValTy->getScalarType()->isHalfTy()) ||
- ValTy->getScalarType()->isBFloatTy()) {
- Type *PromotedTy =
- ValTy->getWithNewType(Type::getFloatTy(ValTy->getContext()));
- InstructionCost Cost =
- getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy,
- TTI::CastContextHint::None, CostKind);
- if (!Op1Info.isConstant() && !Op2Info.isConstant())
- Cost *= 2;
- Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind,
- Op1Info, Op2Info);
- if (ValTy->isVectorTy())
- Cost += getCastInstrCost(
- Instruction::Trunc, VectorType::getInteger(cast<VectorType>(ValTy)),
- VectorType::getInteger(cast<VectorType>(PromotedTy)),
- TTI::CastContextHint::None, CostKind);
- return Cost;
- }
+ if (auto PromotedCost = getFP16BF16PromoteCost(
+ ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
+ [&](Type *PromotedTy) {
+ InstructionCost Cost =
+ getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
+ CostKind, Op1Info, Op2Info);
+ if (isa<VectorType>(PromotedTy))
+ Cost += getCastInstrCost(
+ Instruction::Trunc,
+ VectorType::getInteger(cast<VectorType>(ValTy)),
+ VectorType::getInteger(cast<VectorType>(PromotedTy)),
+ TTI::CastContextHint::None, CostKind);
+ return Cost;
+ }))
+ return *PromotedCost;
auto LT = getTypeLegalizationCost(ValTy);
// Model unknown fp compares as a libcall.
@@ -4858,32 +4923,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
// Limit to loops with trip counts that are cheap to expand.
UP.SCEVExpansionBudget = 1;
- // Try to unroll small, single block loops, if they have load/store
- // dependencies, to expose more parallel memory access streams.
+ // Try to unroll small, few-block loops with a low size budget if they have
+ // load/store dependencies, to expose more parallel memory access streams,
+ // or if they do little work inside a block (i.e. a load -> X -> store
+ // pattern).
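+ // For example (illustrative), in a loop body like "A[i] = B[i] + 1" the
+ // stored value is a first user of a loaded value, so the loop qualifies and
+ // unrolling it exposes independent load/store streams.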
BasicBlock *Header = L->getHeader();
- if (Header == L->getLoopLatch()) {
+ BasicBlock *Latch = L->getLoopLatch();
+ if (Header == Latch) {
// Estimate the size of the loop.
unsigned Size;
- if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
+ unsigned Width = 10;
+ if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
return;
- SmallPtrSet<Value *, 8> LoadedValues;
- SmallVector<StoreInst *> Stores;
- for (auto *BB : L->blocks()) {
- for (auto &I : *BB) {
- Value *Ptr = getLoadStorePointerOperand(&I);
- if (!Ptr)
- continue;
- const SCEV *PtrSCEV = SE.getSCEV(Ptr);
- if (SE.isLoopInvariant(PtrSCEV, L))
- continue;
- if (isa<LoadInst>(&I))
- LoadedValues.insert(&I);
- else
- Stores.push_back(cast<StoreInst>(&I));
- }
- }
-
// Try to find an unroll count that maximizes the use of the instruction
// window, i.e. trying to fetch as many instructions per cycle as possible.
unsigned MaxInstsPerLine = 16;
@@ -4902,8 +4953,32 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
UC++;
}
- if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
- return LoadedValues.contains(SI->getOperand(0));
+ if (BestUC == 1)
+ return;
+
+ SmallPtrSet<Value *, 8> LoadedValuesPlus;
+ SmallVector<StoreInst *> Stores;
+ for (auto *BB : L->blocks()) {
+ for (auto &I : *BB) {
+ Value *Ptr = getLoadStorePointerOperand(&I);
+ if (!Ptr)
+ continue;
+ const SCEV *PtrSCEV = SE.getSCEV(Ptr);
+ if (SE.isLoopInvariant(PtrSCEV, L))
+ continue;
+ if (isa<LoadInst>(&I)) {
+ LoadedValuesPlus.insert(&I);
+ // Also include the in-loop first users of loaded values.
+ for (auto *U : I.users())
+ if (L->contains(cast<Instruction>(U)))
+ LoadedValuesPlus.insert(U);
+ } else
+ Stores.push_back(cast<StoreInst>(&I));
+ }
+ }
+
+ if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
+ return LoadedValuesPlus.contains(SI->getOperand(0));
}))
return;
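As an aside (my illustration, not part of the patch): a minimal sketch of the kind of loop this heuristic targets, where a stored value is a load or a first user of a load (the load -> X -> store pattern described above). The function and variable names are hypothetical.

// Hypothetical example: each iteration loads src[i], does a small amount of
// work (the "X"), and stores a value derived from the load, so runtime
// unrolling exposes more independent memory access streams.
void scaleInto(float *dst, const float *src, int n) {
  for (int i = 0; i < n; ++i) {
    float v = src[i];   // load
    float x = v * 3.0f; // X: first user of the loaded value
    dst[i] = x;         // store of a value derived from the load
  }
}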
@@ -4915,7 +4990,6 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
// Try to runtime-unroll loops with early-continues depending on loop-varying
// loads; this helps with branch-prediction for the early-continues.
auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
- auto *Latch = L->getLoopLatch();
SmallVector<BasicBlock *> Preds(predecessors(Latch));
if (!Term || !Term->isConditional() || Preds.size() == 1 ||
!llvm::is_contained(Preds, Header) ||
@@ -5151,6 +5225,8 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
return false;
switch (RdxDesc.getRecurrenceKind()) {
+ case RecurKind::Sub:
+ case RecurKind::AddChainWithSubs:
case RecurKind::Add:
case RecurKind::FAdd:
case RecurKind::And:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 7f45177..42ae962 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -221,6 +221,11 @@ public:
unsigned Index) const override;
InstructionCost
+ getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+ TTI::TargetCostKind CostKind,
+ unsigned Index) const override;
+
+ InstructionCost
getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
TTI::TargetCostKind CostKind) const override;
@@ -238,8 +243,9 @@ public:
ArrayRef<const Value *> Args = {},
const Instruction *CxtI = nullptr) const override;
- InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
- const SCEV *Ptr) const override;
+ InstructionCost
+ getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const override;
InstructionCost getCmpSelInstrCost(
unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
@@ -435,6 +441,14 @@ public:
bool preferPredicatedReductionSelect() const override { return ST->hasSVE(); }
+ /// FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext)) if the
+ /// required architecture features are not present.
+ std::optional<InstructionCost>
+ getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueInfo Op1Info,
+ TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
+ std::function<InstructionCost(Type *)> InstCost) const;
+
InstructionCost
getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
std::optional<FastMathFlags> FMF,
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index 66136a4..803943f 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -89,6 +89,7 @@ add_llvm_target(AArch64CodeGen
SMEABIPass.cpp
SMEPeepholeOpt.cpp
SVEIntrinsicOpts.cpp
+ MachineSMEABIPass.cpp
AArch64SIMDInstrOpt.cpp
DEPENDS
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 2155ace..79bef76 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -128,9 +128,9 @@ struct AArch64OutgoingValueAssigner
if (!Flags.isVarArg() && !UseVarArgsCCForFixed) {
if (!IsReturn)
applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT);
- Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
+ Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, Info.Ty, State);
} else
- Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State);
+ Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, Info.Ty, State);
StackSize = State.getStackSize();
return Res;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index f359731..ee34a85 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -1349,7 +1349,9 @@ AArch64InstructionSelector::emitSelect(Register Dst, Register True,
return &*SelectInst;
}
-static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
+static AArch64CC::CondCode
+changeICMPPredToAArch64CC(CmpInst::Predicate P, Register RHS = {},
+ MachineRegisterInfo *MRI = nullptr) {
switch (P) {
default:
llvm_unreachable("Unknown condition code!");
@@ -1360,8 +1362,18 @@ static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
case CmpInst::ICMP_SGT:
return AArch64CC::GT;
case CmpInst::ICMP_SGE:
+ if (RHS && MRI) {
+ auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS, *MRI);
+ if (ValAndVReg && ValAndVReg->Value == 0)
+ return AArch64CC::PL;
+ }
return AArch64CC::GE;
case CmpInst::ICMP_SLT:
+ if (RHS && MRI) {
+ auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS, *MRI);
+ if (ValAndVReg && ValAndVReg->Value == 0)
+ return AArch64CC::MI;
+ }
return AArch64CC::LT;
case CmpInst::ICMP_SLE:
return AArch64CC::LE;
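A brief illustration (mine, not from the patch) of why a signed compare against zero can use the PL/MI condition codes: after comparing a value with #0, the N flag is just the sign bit of the value, so `x >= 0` corresponds to N clear (PL) and `x < 0` to N set (MI). A standalone sketch of that equivalence:

#include <cstdint>

// Illustrative only: the sign bit stands in for the N flag after a compare
// against zero.
static bool signedGEZero(int32_t X) { // ICMP_SGE x, 0  ->  PL (N clear)
  return (static_cast<uint32_t>(X) >> 31) == 0;
}
static bool signedLTZero(int32_t X) { // ICMP_SLT x, 0  ->  MI (N set)
  return (static_cast<uint32_t>(X) >> 31) == 1;
}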
@@ -1813,7 +1825,8 @@ bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
auto &PredOp = ICmp.getOperand(1);
emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
- static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
+ static_cast<CmpInst::Predicate>(PredOp.getPredicate()),
+ ICmp.getOperand(3).getReg(), MIB.getMRI());
MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
I.eraseFromParent();
return true;
@@ -2510,8 +2523,8 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
/*RHS=*/Cmp->getOperand(3), PredOp, MIB);
auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
- const AArch64CC::CondCode InvCC =
- changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
+ const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
+ CmpInst::getInversePredicate(Pred), Cmp->getOperand(3).getReg(), &MRI);
emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
I.eraseFromParent();
return true;
@@ -3577,8 +3590,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
auto &PredOp = I.getOperand(1);
emitIntegerCompare(I.getOperand(2), I.getOperand(3), PredOp, MIB);
auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
- const AArch64CC::CondCode InvCC =
- changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
+ const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
+ CmpInst::getInversePredicate(Pred), I.getOperand(3).getReg(), &MRI);
emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
/*Src2=*/AArch64::WZR, InvCC, MIB);
I.eraseFromParent();
@@ -4931,7 +4944,7 @@ MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
if (Negate)
CC = CmpInst::getInversePredicate(CC);
if (isa<GICmp>(Cmp)) {
- OutCC = changeICMPPredToAArch64CC(CC);
+ OutCC = changeICMPPredToAArch64CC(CC, RHS, MIB.getMRI());
} else {
// Handle special FP cases.
AArch64CC::CondCode ExtraCC;
@@ -5101,7 +5114,8 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), PredOp,
MIB);
auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
- CondCode = changeICMPPredToAArch64CC(Pred);
+ CondCode =
+ changeICMPPredToAArch64CC(Pred, CondDef->getOperand(3).getReg(), &MRI);
} else {
// Get the condition code for the select.
auto Pred =
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 3ba08c8..6025f1c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -614,8 +614,7 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P,
// x uge c => x ugt c - 1
//
// When c is not zero.
- if (C == 0)
- return std::nullopt;
+ assert(C != 0 && "C should not be zero here!");
P = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
C -= 1;
break;
@@ -656,14 +655,13 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P,
if (isLegalArithImmed(C))
return {{C, P}};
- auto IsMaterializableInSingleInstruction = [=](uint64_t Imm) {
+ auto NumberOfInstrToLoadImm = [=](uint64_t Imm) {
SmallVector<AArch64_IMM::ImmInsnModel> Insn;
AArch64_IMM::expandMOVImm(Imm, 32, Insn);
- return Insn.size() == 1;
+ return Insn.size();
};
- if (!IsMaterializableInSingleInstruction(OriginalC) &&
- IsMaterializableInSingleInstruction(C))
+ if (NumberOfInstrToLoadImm(OriginalC) > NumberOfInstrToLoadImm(C))
return {{C, P}};
return std::nullopt;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index 45ac023..a388216 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -40,6 +40,7 @@ protected:
bool IsPCRel) const override;
bool needsRelocateWithSymbol(const MCValue &, unsigned Type) const override;
bool isNonILP32reloc(const MCFixup &Fixup, AArch64::Specifier RefKind) const;
+ void sortRelocs(std::vector<ELFRelocationEntry> &Relocs) override;
bool IsILP32;
};
@@ -498,6 +499,17 @@ bool AArch64ELFObjectWriter::needsRelocateWithSymbol(const MCValue &Val,
Val.getSpecifier());
}
+void AArch64ELFObjectWriter::sortRelocs(
+ std::vector<ELFRelocationEntry> &Relocs) {
+ // PATCHINST relocations should be applied last because they may overwrite the
+ // whole instruction and so should take precedence over other relocations that
+ // modify operands of the original instruction.
+ std::stable_partition(Relocs.begin(), Relocs.end(),
+ [](const ELFRelocationEntry &R) {
+ return R.Type != ELF::R_AARCH64_PATCHINST;
+ });
+}
+
std::unique_ptr<MCObjectTargetWriter>
llvm::createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) {
return std::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32);
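As a side note (not part of the patch): std::stable_partition keeps the relative order of entries on both sides of the predicate, so ordinary relocations stay in their original order and the PATCHINST-style ones simply move to the end. A small standalone sketch with made-up values:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // 2 stands in for a PATCHINST-style relocation, 0 and 1 for ordinary ones.
  std::vector<int> Relocs = {0, 2, 1, 2, 0};
  std::stable_partition(Relocs.begin(), Relocs.end(),
                        [](int R) { return R != 2; });
  // Ordinary relocations keep their order; PATCHINST-style ones come last.
  assert((Relocs == std::vector<int>{0, 1, 0, 2, 2}));
  return 0;
}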
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 14547e3..917dbdf 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -35,7 +35,6 @@
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCWinCOFFStreamer.h"
#include "llvm/Support/AArch64BuildAttributes.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 3c8b571..54b58e9 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -1017,14 +1017,22 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
else
return false;
+ StringRef Reg = getRegisterName(MI->getOperand(4).getReg());
+ bool NotXZR = Reg != "xzr";
+
+ // If a mandatory register operand is not specified in the TableGen definition
+ // (i.e. no register operand should be present for this alias), and the register
+ // value is not xzr/x31, then print the plain SYS form instead of the alias.
+ if (NotXZR && !NeedsReg)
+ return false;
+
std::string Str = Ins + Name;
llvm::transform(Str, Str.begin(), ::tolower);
O << '\t' << Str;
- if (NeedsReg) {
- O << ", ";
- printRegName(O, MI->getOperand(4).getReg());
- }
+
+ if (NeedsReg)
+ O << ", " << Reg;
return true;
}
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
new file mode 100644
index 0000000..b58dfdf
--- /dev/null
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -0,0 +1,696 @@
+//===- MachineSMEABIPass.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the SME ABI requirements for ZA state. This includes
+// implementing the lazy ZA state save schemes around calls.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass works by collecting instructions that require ZA to be in a
+// specific state (e.g., "ACTIVE" or "SAVED") and inserting the necessary state
+// transitions to ensure ZA is in the required state before instructions. State
+// transitions represent actions such as setting up or restoring a lazy save.
+// Certain points within a function may also have predefined states independent
+// of any instructions, for example, a "shared_za" function is always entered
+// and exited in the "ACTIVE" state.
+//
+// To handle ZA state across control flow, we make use of edge bundling. This
+// assigns each block an "incoming" and "outgoing" edge bundle (representing
+// incoming and outgoing edges). Initially, these are unique to each block;
+// then, in the process of forming bundles, the outgoing bundle of a block is
+// joined with the incoming bundles of all its successors. The result is that each
+// bundle can be assigned a single ZA state, which ensures the state required by
+// all of a block's successors is the same, and that each basic block will always
+// be entered with the same ZA state. This eliminates the need for splitting
+// edges to insert state transitions or "phi" nodes for ZA states.
+//
+// See below for a simple example of edge bundling.
+//
+// The following shows a conditionally executed basic block (BB1):
+//
+// if (cond)
+// BB1
+// BB2
+//
+// Initial Bundles Joined Bundles
+//
+// ┌──0──┐ ┌──0──┐
+// │ BB0 │ │ BB0 │
+// └──1──┘ └──1──┘
+// ├───────┐ ├───────┐
+// ▼ │ ▼ │
+// ┌──2──┐ │ ─────► ┌──1──┐ │
+// │ BB1 │ ▼ │ BB1 │ ▼
+// └──3──┘ ┌──4──┐ └──1──┘ ┌──1──┐
+// └───►4 BB2 │ └───►1 BB2 │
+// └──5──┘ └──2──┘
+//
+// On the left are the initial per-block bundles, and on the right are the
+// joined bundles (which are the result of the EdgeBundles analysis).
+
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-machine-sme-abi"
+
+namespace {
+
+enum ZAState {
+ // Any/unknown state (not valid)
+ ANY = 0,
+
+ // ZA is in use and active (i.e. within the accumulator)
+ ACTIVE,
+
+ // A ZA save has been set up or committed (i.e. ZA is dormant or off)
+ LOCAL_SAVED,
+
+ // ZA is off or a lazy save has been set up by the caller
+ CALLER_DORMANT,
+
+ // ZA is off
+ OFF,
+
+ // The number of ZA states (not a valid state)
+ NUM_ZA_STATE
+};
+
+/// A bitmask enum to record live physical registers that the "emit*" routines
+/// may need to preserve. Note: This only tracks registers we may clobber.
+enum LiveRegs : uint8_t {
+ None = 0,
+ NZCV = 1 << 0,
+ W0 = 1 << 1,
+ W0_HI = 1 << 2,
+ X0 = W0 | W0_HI,
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ W0_HI)
+};
+
+/// Holds the virtual registers that live physical registers have been saved to.
+struct PhysRegSave {
+ LiveRegs PhysLiveRegs;
+ Register StatusFlags = AArch64::NoRegister;
+ Register X0Save = AArch64::NoRegister;
+};
+
+static bool isLegalEdgeBundleZAState(ZAState State) {
+ switch (State) {
+ case ZAState::ACTIVE:
+ case ZAState::LOCAL_SAVED:
+ return true;
+ default:
+ return false;
+ }
+}
+struct TPIDR2State {
+ int FrameIndex = -1;
+};
+
+StringRef getZAStateString(ZAState State) {
+#define MAKE_CASE(V) \
+ case V: \
+ return #V;
+ switch (State) {
+ MAKE_CASE(ZAState::ANY)
+ MAKE_CASE(ZAState::ACTIVE)
+ MAKE_CASE(ZAState::LOCAL_SAVED)
+ MAKE_CASE(ZAState::CALLER_DORMANT)
+ MAKE_CASE(ZAState::OFF)
+ default:
+ llvm_unreachable("Unexpected ZAState");
+ }
+#undef MAKE_CASE
+}
+
+static bool isZAorZT0RegOp(const TargetRegisterInfo &TRI,
+ const MachineOperand &MO) {
+ if (!MO.isReg() || !MO.getReg().isPhysical())
+ return false;
+ return any_of(TRI.subregs_inclusive(MO.getReg()), [](const MCPhysReg &SR) {
+ return AArch64::MPR128RegClass.contains(SR) ||
+ AArch64::ZTRRegClass.contains(SR);
+ });
+}
+
+/// Returns the ZA state required before \p MI and an iterator pointing
+/// to where any code required to change the ZA state should be inserted.
+static std::pair<ZAState, MachineBasicBlock::iterator>
+getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
+ bool ZAOffAtReturn) {
+ MachineBasicBlock::iterator InsertPt(MI);
+
+ if (MI.getOpcode() == AArch64::InOutZAUsePseudo)
+ return {ZAState::ACTIVE, std::prev(InsertPt)};
+
+ if (MI.getOpcode() == AArch64::RequiresZASavePseudo)
+ return {ZAState::LOCAL_SAVED, std::prev(InsertPt)};
+
+ if (MI.isReturn())
+ return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt};
+
+ for (auto &MO : MI.operands()) {
+ if (isZAorZT0RegOp(TRI, MO))
+ return {ZAState::ACTIVE, InsertPt};
+ }
+
+ return {ZAState::ANY, InsertPt};
+}
+
+struct MachineSMEABI : public MachineFunctionPass {
+ inline static char ID = 0;
+
+ MachineSMEABI() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "Machine SME ABI pass"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<EdgeBundlesWrapperLegacy>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// Collects the needed ZA state (and live registers) before each instruction
+ /// within the machine function.
+ void collectNeededZAStates(SMEAttrs);
+
+ /// Assigns each edge bundle a ZA state based on the needed states of blocks
+ /// that have incoming or outgoing edges in that bundle.
+ void assignBundleZAStates();
+
+ /// Inserts code to handle changes between ZA states within the function.
+ /// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA.
+ void insertStateChanges();
+
+ // Emission routines for private and shared ZA functions (using lazy saves).
+ void emitNewZAPrologue(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
+ void emitRestoreLazySave(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs);
+ void emitSetupLazySave(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
+ void emitAllocateLazySaveBuffer(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
+ void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ bool ClearTPIDR2);
+
+ void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ ZAState From, ZAState To, LiveRegs PhysLiveRegs);
+
+ /// Save live physical registers to virtual registers.
+ PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL);
+ /// Restore physical registers from a save of their previous values.
+ void restorePhyRegSave(PhysRegSave const &RegSave, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL);
+
+ /// Get or create a TPIDR2 block in this function.
+ TPIDR2State getTPIDR2Block();
+
+private:
+ /// Contains the needed ZA state (and live registers) at an instruction.
+ struct InstInfo {
+ ZAState NeededState{ZAState::ANY};
+ MachineBasicBlock::iterator InsertPt;
+ LiveRegs PhysLiveRegs = LiveRegs::None;
+ };
+
+ /// Contains the needed ZA state for each instruction in a block.
+ /// Instructions that do not require a ZA state are not recorded.
+ struct BlockInfo {
+ ZAState FixedEntryState{ZAState::ANY};
+ SmallVector<InstInfo> Insts;
+ LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
+ };
+
+ // All pass state that must be cleared between functions.
+ struct PassState {
+ SmallVector<BlockInfo> Blocks;
+ SmallVector<ZAState> BundleStates;
+ std::optional<TPIDR2State> TPIDR2Block;
+ } State;
+
+ MachineFunction *MF = nullptr;
+ EdgeBundles *Bundles = nullptr;
+ const AArch64Subtarget *Subtarget = nullptr;
+ const AArch64RegisterInfo *TRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+};
+
+void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
+ assert((SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) &&
+ "Expected function to have ZA/ZT0 state!");
+
+ State.Blocks.resize(MF->getNumBlockIDs());
+ for (MachineBasicBlock &MBB : *MF) {
+ BlockInfo &Block = State.Blocks[MBB.getNumber()];
+ if (&MBB == &MF->front()) {
+ // Entry block:
+ Block.FixedEntryState = SMEFnAttrs.hasPrivateZAInterface()
+ ? ZAState::CALLER_DORMANT
+ : ZAState::ACTIVE;
+ } else if (MBB.isEHPad()) {
+ // EH entry block:
+ Block.FixedEntryState = ZAState::LOCAL_SAVED;
+ }
+
+ LiveRegUnits LiveUnits(*TRI);
+ LiveUnits.addLiveOuts(MBB);
+
+ auto GetPhysLiveRegs = [&] {
+ LiveRegs PhysLiveRegs = LiveRegs::None;
+ if (!LiveUnits.available(AArch64::NZCV))
+ PhysLiveRegs |= LiveRegs::NZCV;
+ // We have to track W0 and X0 separately as otherwise things can get
+ // confused if we attempt to preserve X0 but only W0 was defined.
+ if (!LiveUnits.available(AArch64::W0))
+ PhysLiveRegs |= LiveRegs::W0;
+ if (!LiveUnits.available(AArch64::W0_HI))
+ PhysLiveRegs |= LiveRegs::W0_HI;
+ return PhysLiveRegs;
+ };
+
+ Block.PhysLiveRegsAtExit = GetPhysLiveRegs();
+ auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
+ for (MachineInstr &MI : reverse(MBB)) {
+ MachineBasicBlock::iterator MBBI(MI);
+ LiveUnits.stepBackward(MI);
+ LiveRegs PhysLiveRegs = GetPhysLiveRegs();
+ auto [NeededState, InsertPt] = getZAStateBeforeInst(
+ *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
+ assert((InsertPt == MBBI ||
+ InsertPt->getOpcode() == AArch64::ADJCALLSTACKDOWN) &&
+ "Unexpected state change insertion point!");
+ // TODO: Do something to avoid state changes where NZCV is live.
+ if (MBBI == FirstTerminatorInsertPt)
+ Block.PhysLiveRegsAtExit = PhysLiveRegs;
+ if (NeededState != ZAState::ANY)
+ Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs});
+ }
+
+ // Reverse vector (as we had to iterate backwards for liveness).
+ std::reverse(Block.Insts.begin(), Block.Insts.end());
+ }
+}
+
+void MachineSMEABI::assignBundleZAStates() {
+ State.BundleStates.resize(Bundles->getNumBundles());
+ for (unsigned I = 0, E = Bundles->getNumBundles(); I != E; ++I) {
+ LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n');
+
+ // Attempt to assign a ZA state for this bundle that minimizes state
+ // transitions. Edges within loops are given a higher weight as we assume
+ // they will be executed more than once.
+ // TODO: We should propagate desired incoming/outgoing states through blocks
+ // that have the "ANY" state first to make better global decisions.
+ int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
+ for (unsigned BlockID : Bundles->getBlocks(I)) {
+ LLVM_DEBUG(dbgs() << "- bb." << BlockID);
+
+ const BlockInfo &Block = State.Blocks[BlockID];
+ if (Block.Insts.empty()) {
+ LLVM_DEBUG(dbgs() << " (no state preference)\n");
+ continue;
+ }
+ bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I;
+ bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I;
+
+ ZAState DesiredIncomingState = Block.Insts.front().NeededState;
+ if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
+ EdgeStateCounts[DesiredIncomingState]++;
+ LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
+ << getZAStateString(DesiredIncomingState));
+ }
+ ZAState DesiredOutgoingState = Block.Insts.back().NeededState;
+ if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
+ EdgeStateCounts[DesiredOutgoingState]++;
+ LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
+ << getZAStateString(DesiredOutgoingState));
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+ }
+
+ ZAState BundleState =
+ ZAState(max_element(EdgeStateCounts) - EdgeStateCounts);
+
+ // Force ZA to be active in bundles that don't have a preferred state.
+ // TODO: Something better here (to avoid extra mode switches).
+ if (BundleState == ZAState::ANY)
+ BundleState = ZAState::ACTIVE;
+
+ LLVM_DEBUG({
+ dbgs() << "Chosen ZA state: " << getZAStateString(BundleState) << '\n'
+ << "Edge counts:";
+ for (auto [State, Count] : enumerate(EdgeStateCounts))
+ dbgs() << " " << getZAStateString(ZAState(State)) << ": " << Count;
+ dbgs() << "\n\n";
+ });
+
+ State.BundleStates[I] = BundleState;
+ }
+}
+
+void MachineSMEABI::insertStateChanges() {
+ for (MachineBasicBlock &MBB : *MF) {
+ const BlockInfo &Block = State.Blocks[MBB.getNumber()];
+ ZAState InState = State.BundleStates[Bundles->getBundle(MBB.getNumber(),
+ /*Out=*/false)];
+
+ ZAState CurrentState = Block.FixedEntryState;
+ if (CurrentState == ZAState::ANY)
+ CurrentState = InState;
+
+ for (auto &Inst : Block.Insts) {
+ if (CurrentState != Inst.NeededState)
+ emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState,
+ Inst.PhysLiveRegs);
+ CurrentState = Inst.NeededState;
+ }
+
+ if (MBB.succ_empty())
+ continue;
+
+ ZAState OutState =
+ State.BundleStates[Bundles->getBundle(MBB.getNumber(), /*Out=*/true)];
+ if (CurrentState != OutState)
+ emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState,
+ Block.PhysLiveRegsAtExit);
+ }
+}
+
+TPIDR2State MachineSMEABI::getTPIDR2Block() {
+ if (State.TPIDR2Block)
+ return *State.TPIDR2Block;
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ State.TPIDR2Block = TPIDR2State{MFI.CreateStackObject(16, Align(16), false)};
+ return *State.TPIDR2Block;
+}
+
+static DebugLoc getDebugLoc(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ if (MBBI != MBB.end())
+ return MBBI->getDebugLoc();
+ return DebugLoc();
+}
+
+void MachineSMEABI::emitSetupLazySave(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ DebugLoc DL = getDebugLoc(MBB, MBBI);
+
+ // Get pointer to TPIDR2 block.
+ Register TPIDR2 = MRI->createVirtualRegister(&AArch64::GPR64spRegClass);
+ Register TPIDR2Ptr = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2)
+ .addFrameIndex(getTPIDR2Block().FrameIndex)
+ .addImm(0)
+ .addImm(0);
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), TPIDR2Ptr)
+ .addReg(TPIDR2);
+ // Set TPIDR2_EL0 to point to TPIDR2 block.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
+ .addImm(AArch64SysReg::TPIDR2_EL0)
+ .addReg(TPIDR2Ptr);
+}
+
+PhysRegSave MachineSMEABI::createPhysRegSave(LiveRegs PhysLiveRegs,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL) {
+ PhysRegSave RegSave{PhysLiveRegs};
+ if (PhysLiveRegs & LiveRegs::NZCV) {
+ RegSave.StatusFlags = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), RegSave.StatusFlags)
+ .addImm(AArch64SysReg::NZCV)
+ .addReg(AArch64::NZCV, RegState::Implicit);
+ }
+ // Note: Preserving X0 is "free" as this is before register allocation, so
+ // the register allocator is still able to optimize these copies.
+ if (PhysLiveRegs & LiveRegs::W0) {
+ RegSave.X0Save = MRI->createVirtualRegister(PhysLiveRegs & LiveRegs::W0_HI
+ ? &AArch64::GPR64RegClass
+ : &AArch64::GPR32RegClass);
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), RegSave.X0Save)
+ .addReg(PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0);
+ }
+ return RegSave;
+}
+
+void MachineSMEABI::restorePhyRegSave(PhysRegSave const &RegSave,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL) {
+ if (RegSave.StatusFlags != AArch64::NoRegister)
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
+ .addImm(AArch64SysReg::NZCV)
+ .addReg(RegSave.StatusFlags)
+ .addReg(AArch64::NZCV, RegState::ImplicitDefine);
+
+ if (RegSave.X0Save != AArch64::NoRegister)
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY),
+ RegSave.PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0)
+ .addReg(RegSave.X0Save);
+}
+
+void MachineSMEABI::emitRestoreLazySave(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs) {
+ auto *TLI = Subtarget->getTargetLowering();
+ DebugLoc DL = getDebugLoc(MBB, MBBI);
+ Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ Register TPIDR2 = AArch64::X0;
+
+ // TODO: Emit these within the restore MBB to prevent unnecessary saves.
+ PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
+
+ // Enable ZA.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
+ .addImm(AArch64SVCR::SVCRZA)
+ .addImm(1);
+ // Get current TPIDR2_EL0.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), TPIDR2EL0)
+ .addImm(AArch64SysReg::TPIDR2_EL0);
+ // Get pointer to TPIDR2 block.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2)
+ .addFrameIndex(getTPIDR2Block().FrameIndex)
+ .addImm(0)
+ .addImm(0);
+ // (Conditionally) restore ZA state.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::RestoreZAPseudo))
+ .addReg(TPIDR2EL0)
+ .addReg(TPIDR2)
+ .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_RESTORE))
+ .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
+ // Zero TPIDR2_EL0.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
+ .addImm(AArch64SysReg::TPIDR2_EL0)
+ .addReg(AArch64::XZR);
+
+ restorePhyRegSave(RegSave, MBB, MBBI, DL);
+}
+
+void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ bool ClearTPIDR2) {
+ DebugLoc DL = getDebugLoc(MBB, MBBI);
+
+ if (ClearTPIDR2)
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
+ .addImm(AArch64SysReg::TPIDR2_EL0)
+ .addReg(AArch64::XZR);
+
+ // Disable ZA.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
+ .addImm(AArch64SVCR::SVCRZA)
+ .addImm(0);
+}
+
+void MachineSMEABI::emitAllocateLazySaveBuffer(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+
+ DebugLoc DL = getDebugLoc(MBB, MBBI);
+ Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+
+ // Calculate SVL.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);
+
+ // 1. Allocate the lazy save buffer.
+ {
+ // TODO This function grows the stack with a subtraction, which doesn't work
+ // on Windows. Some refactoring to share the functionality in
+ // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
+ // supports SME
+ assert(!Subtarget->isTargetWindows() &&
+ "Lazy ZA save is not yet supported on Windows");
+ // Get original stack pointer.
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
+ .addReg(AArch64::SP);
+ // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSUBXrrr), Buffer)
+ .addReg(SVL)
+ .addReg(SVL)
+ .addReg(SP);
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::SP)
+ .addReg(Buffer);
+ // We have just allocated a variable sized object, tell this to PEI.
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
+ }
+
+ // 2. Setup the TPIDR2 block.
+ {
+ // Note: This case just needs to do `SVL << 48`. It is not implemented as we
+ // generally don't support big-endian SVE/SME.
+ if (!Subtarget->isLittleEndian())
+ reportFatalInternalError(
+ "TPIDR2 block initialization is not supported on big-endian targets");
+
+ // Store buffer pointer and num_za_save_slices.
+ // Bytes 10-15 are implicitly zeroed.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::STPXi))
+ .addReg(Buffer)
+ .addReg(SVL)
+ .addFrameIndex(getTPIDR2Block().FrameIndex)
+ .addImm(0);
+ }
+}
+
+void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ auto *TLI = Subtarget->getTargetLowering();
+ DebugLoc DL = getDebugLoc(MBB, MBBI);
+
+ // Get current TPIDR2_EL0.
+ Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS))
+ .addReg(TPIDR2EL0, RegState::Define)
+ .addImm(AArch64SysReg::TPIDR2_EL0);
+ // If TPIDR2_EL0 is non-zero, commit the lazy save.
+ // NOTE: Functions that only use ZT0 don't need to zero ZA.
+ bool ZeroZA =
+ MF->getInfo<AArch64FunctionInfo>()->getSMEFnAttrs().hasZAState();
+ auto CommitZASave =
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo))
+ .addReg(TPIDR2EL0)
+ .addImm(ZeroZA ? 1 : 0)
+ .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_SAVE))
+ .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
+ if (ZeroZA)
+ CommitZASave.addDef(AArch64::ZAB0, RegState::ImplicitDefine);
+ // Enable ZA (as ZA could have previously been in the OFF state).
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
+ .addImm(AArch64SVCR::SVCRZA)
+ .addImm(1);
+}
+
+void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt,
+ ZAState From, ZAState To,
+ LiveRegs PhysLiveRegs) {
+
+ // ZA not used.
+ if (From == ZAState::ANY || To == ZAState::ANY)
+ return;
+
+ // If we're exiting from the CALLER_DORMANT state that means this new ZA
+ // function did not touch ZA (so ZA was never turned on).
+ if (From == ZAState::CALLER_DORMANT && To == ZAState::OFF)
+ return;
+
+ // TODO: Avoid setting up the save buffer if there's no transition to
+ // LOCAL_SAVED.
+ if (From == ZAState::CALLER_DORMANT) {
+ assert(MBB.getParent()
+ ->getInfo<AArch64FunctionInfo>()
+ ->getSMEFnAttrs()
+ .hasPrivateZAInterface() &&
+ "CALLER_DORMANT state requires private ZA interface");
+ assert(&MBB == &MBB.getParent()->front() &&
+ "CALLER_DORMANT state only valid in entry block");
+ emitNewZAPrologue(MBB, MBB.getFirstNonPHI());
+ if (To == ZAState::ACTIVE)
+ return; // Nothing more to do (ZA is active after the prologue).
+
+ // Note: "emitNewZAPrologue" zeros ZA, so we may need to setup a lazy save
+ // if "To" is "ZAState::LOCAL_SAVED". It may be possible to improve this
+ // case by changing the placement of the zero instruction.
+ From = ZAState::ACTIVE;
+ }
+
+ if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
+ emitSetupLazySave(MBB, InsertPt);
+ else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
+ emitRestoreLazySave(MBB, InsertPt, PhysLiveRegs);
+ else if (To == ZAState::OFF) {
+ assert(From != ZAState::CALLER_DORMANT &&
+ "CALLER_DORMANT to OFF should have already been handled");
+ emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
+ } else {
+ dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
+ << getZAStateString(To) << '\n';
+ llvm_unreachable("Unimplemented state transition");
+ }
+}
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(MachineSMEABI, "aarch64-machine-sme-abi", "Machine SME ABI",
+ false, false)
+
+bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
+ if (!MF.getSubtarget<AArch64Subtarget>().hasSME())
+ return false;
+
+ auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+ SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
+ if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State())
+ return false;
+
+ assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
+
+ // Reset pass state.
+ State = PassState{};
+ this->MF = &MF;
+ Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
+ Subtarget = &MF.getSubtarget<AArch64Subtarget>();
+ TII = Subtarget->getInstrInfo();
+ TRI = Subtarget->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ collectNeededZAStates(SMEFnAttrs);
+ assignBundleZAStates();
+ insertStateChanges();
+
+ // Allocate save buffer (if needed).
+ if (State.TPIDR2Block) {
+ MachineBasicBlock &EntryBlock = MF.front();
+ emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+ }
+
+ return true;
+}
+
+FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); }
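A small sketch (my illustration, not part of the patch) of the edge-bundling property the pass relies on, using the if/else CFG from the file header comment: once the per-block bundles are joined, the outgoing bundle of BB0 is the same bundle as the incoming bundles of BB1 and BB2, so a single ZA state assigned to that bundle covers all three edges. The block numbers and the helper name below are hypothetical; the EdgeBundles API is the one used by the pass.

#include "llvm/CodeGen/EdgeBundles.h"
#include <cassert>

// Assumes an EdgeBundles analysis computed for the CFG in the header comment
// (BB0 -> {BB1, BB2}, BB1 -> BB2).
static void checkJoinedBundles(const llvm::EdgeBundles &Bundles) {
  unsigned OutBB0 = Bundles.getBundle(/*N=*/0, /*Out=*/true);
  unsigned InBB1 = Bundles.getBundle(/*N=*/1, /*Out=*/false);
  unsigned InBB2 = Bundles.getBundle(/*N=*/2, /*Out=*/false);
  // All three edges share one bundle, so one ZA state is enough for them.
  assert(OutBB0 == InBB1 && OutBB0 == InBB2);
}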
diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp
index 4af4d49..2008516 100644
--- a/llvm/lib/Target/AArch64/SMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp
@@ -15,11 +15,16 @@
#include "AArch64.h"
#include "Utils/AArch64SMEAttributes.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/RuntimeLibcalls.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;
@@ -33,9 +38,13 @@ struct SMEABI : public FunctionPass {
bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ }
+
private:
bool updateNewStateFunctions(Module *M, Function *F, IRBuilder<> &Builder,
- SMEAttrs FnAttrs);
+ SMEAttrs FnAttrs, const TargetLowering &TLI);
};
} // end anonymous namespace
@@ -51,14 +60,16 @@ FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); }
//===----------------------------------------------------------------------===//
// Utility function to emit a call to __arm_tpidr2_save and clear TPIDR2_EL0.
-void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) {
+void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI,
+ bool ZT0IsUndef = false) {
auto &Ctx = M->getContext();
auto *TPIDR2SaveTy =
FunctionType::get(Builder.getVoidTy(), {}, /*IsVarArgs=*/false);
auto Attrs =
AttributeList().addFnAttribute(Ctx, "aarch64_pstate_sm_compatible");
+ RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_SAVE;
FunctionCallee Callee =
- M->getOrInsertFunction("__arm_tpidr2_save", TPIDR2SaveTy, Attrs);
+ M->getOrInsertFunction(TLI.getLibcallName(LC), TPIDR2SaveTy, Attrs);
CallInst *Call = Builder.CreateCall(Callee);
// If ZT0 is undefined (i.e. we're at the entry of a "new_zt0" function), mark
@@ -67,8 +78,7 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) {
if (ZT0IsUndef)
Call->addFnAttr(Attribute::get(Ctx, "aarch64_zt0_undef"));
- Call->setCallingConv(
- CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0);
+ Call->setCallingConv(TLI.getLibcallCallingConv(LC));
// A save to TPIDR2 should be followed by clearing TPIDR2_EL0.
Function *WriteIntr =
@@ -98,7 +108,8 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) {
/// interface if it does not share ZA or ZT0.
///
bool SMEABI::updateNewStateFunctions(Module *M, Function *F,
- IRBuilder<> &Builder, SMEAttrs FnAttrs) {
+ IRBuilder<> &Builder, SMEAttrs FnAttrs,
+ const TargetLowering &TLI) {
LLVMContext &Context = F->getContext();
BasicBlock *OrigBB = &F->getEntryBlock();
Builder.SetInsertPoint(&OrigBB->front());
@@ -124,7 +135,7 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F,
// Create a call __arm_tpidr2_save, which commits the lazy save.
Builder.SetInsertPoint(&SaveBB->back());
- emitTPIDR2Save(M, Builder, /*ZT0IsUndef=*/FnAttrs.isNewZT0());
+ emitTPIDR2Save(M, Builder, TLI, /*ZT0IsUndef=*/FnAttrs.isNewZT0());
// Enable pstate.za at the start of the function.
Builder.SetInsertPoint(&OrigBB->front());
@@ -172,10 +183,14 @@ bool SMEABI::runOnFunction(Function &F) {
if (F.isDeclaration() || F.hasFnAttribute("aarch64_expanded_pstate_za"))
return false;
+ const TargetMachine &TM =
+ getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ const TargetLowering &TLI = *TM.getSubtargetImpl(F)->getTargetLowering();
+
bool Changed = false;
SMEAttrs FnAttrs(F);
if (FnAttrs.isNewZA() || FnAttrs.isNewZT0())
- Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs);
+ Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs, TLI);
return Changed;
}
diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
index bd28716..564af67 100644
--- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -80,16 +80,10 @@ static bool isMatchingStartStopPair(const MachineInstr *MI1,
if (MI1->getOperand(4).getRegMask() != MI2->getOperand(4).getRegMask())
return false;
- // This optimisation is unlikely to happen in practice for conditional
- // smstart/smstop pairs as the virtual registers for pstate.sm will always
- // be different.
- // TODO: For this optimisation to apply to conditional smstart/smstop,
- // this pass will need to do more work to remove redundant calls to
- // __arm_sme_state.
-
// Only consider conditional start/stop pairs which read the same register
- // holding the original value of pstate.sm, as some conditional start/stops
- // require the state on entry to the function.
+ // holding the original value of pstate.sm. This is somewhat over-conservative,
+ // as all conditional streaming mode changes only look at the state on entry
+ // to the function.
if (MI1->getOperand(3).isReg() && MI2->getOperand(3).isReg()) {
Register Reg1 = MI1->getOperand(3).getReg();
Register Reg2 = MI2->getOperand(3).getReg();
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index a0320f9..a3a7d0f 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -809,6 +809,11 @@ let hasNoSchedulingInfo = 1 in {
Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zs3), []> {
let FalseLanes = flags;
}
+
+ class UnpredRegImmPseudo<ZPRRegOp zprty, Operand immty>
+ : SVEPseudo2Instr<NAME, 0>,
+ Pseudo<(outs zprty:$Zd), (ins zprty:$Zs, immty:$imm), []> {
+ }
}
//
@@ -1885,13 +1890,14 @@ class sve_int_perm_extract_i<string asm>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = DestructiveOther;
+ let DestructiveInstType = Destructive2xRegImmUnpred;
let ElementSize = ElementSizeNone;
let hasSideEffects = 0;
}
-multiclass sve_int_perm_extract_i<string asm, SDPatternOperator op> {
- def NAME : sve_int_perm_extract_i<asm>;
+multiclass sve_int_perm_extract_i<string asm, SDPatternOperator op, string Ps> {
+ def NAME : sve_int_perm_extract_i<asm>,
+ SVEPseudo2Instr<Ps, 1>;
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, imm0_255,
!cast<Instruction>(NAME)>;
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
index 271094f..dd6fa16 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
@@ -7,17 +7,14 @@
//===----------------------------------------------------------------------===//
#include "AArch64SMEAttributes.h"
+#include "AArch64ISelLowering.h"
#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/RuntimeLibcalls.h"
#include <cassert>
using namespace llvm;
-void SMEAttrs::set(unsigned M, bool Enable) {
- if (Enable)
- Bitmask |= M;
- else
- Bitmask &= ~M;
-
+void SMEAttrs::validate() const {
// Streaming Mode Attrs
assert(!(hasStreamingInterface() && hasStreamingCompatibleInterface()) &&
"SM_Enabled and SM_Compatible are mutually exclusive");
@@ -77,19 +74,36 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) {
Bitmask |= encodeZT0State(StateValue::New);
}
-void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName) {
+void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName,
+ const AArch64TargetLowering &TLI) {
+ RTLIB::LibcallImpl Impl = TLI.getSupportedLibcallImpl(FuncName);
+ if (Impl == RTLIB::Unsupported)
+ return;
unsigned KnownAttrs = SMEAttrs::Normal;
- if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state")
- KnownAttrs |= (SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine);
- if (FuncName == "__arm_tpidr2_restore")
+ RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl);
+ switch (LC) {
+ case RTLIB::SMEABI_SME_STATE:
+ case RTLIB::SMEABI_TPIDR2_SAVE:
+ case RTLIB::SMEABI_GET_CURRENT_VG:
+ case RTLIB::SMEABI_SME_STATE_SIZE:
+ case RTLIB::SMEABI_SME_SAVE:
+ case RTLIB::SMEABI_SME_RESTORE:
+ KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine;
+ break;
+ case RTLIB::SMEABI_ZA_DISABLE:
+ case RTLIB::SMEABI_TPIDR2_RESTORE:
KnownAttrs |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) |
SMEAttrs::SME_ABI_Routine;
- if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" ||
- FuncName == "__arm_sc_memmove" || FuncName == "__arm_sc_memchr")
+ break;
+ case RTLIB::SC_MEMCPY:
+ case RTLIB::SC_MEMMOVE:
+ case RTLIB::SC_MEMSET:
+ case RTLIB::SC_MEMCHR:
KnownAttrs |= SMEAttrs::SM_Compatible;
- if (FuncName == "__arm_sme_save" || FuncName == "__arm_sme_restore" ||
- FuncName == "__arm_sme_state_size")
- KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine;
+ break;
+ default:
+ break;
+ }
set(KnownAttrs);
}
@@ -110,11 +124,11 @@ bool SMECallAttrs::requiresSMChange() const {
return true;
}
-SMECallAttrs::SMECallAttrs(const CallBase &CB)
+SMECallAttrs::SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI)
: CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal),
Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) {
if (auto *CalledFunction = CB.getCalledFunction())
- CalledFn = SMEAttrs(*CalledFunction, SMEAttrs::InferAttrsFromName::Yes);
+ CalledFn = SMEAttrs(*CalledFunction, TLI);
// FIXME: We probably should not allow SME attributes on direct calls but
// clang duplicates streaming mode attributes at each callsite.
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
index f1be0ecb..d26e3cd 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
@@ -13,6 +13,8 @@
namespace llvm {
+class AArch64TargetLowering;
+
class Function;
class CallBase;
class AttributeList;
@@ -48,19 +50,27 @@ public:
CallSiteFlags_Mask = ZT0_Undef
};
- enum class InferAttrsFromName { No, Yes };
-
SMEAttrs() = default;
SMEAttrs(unsigned Mask) { set(Mask); }
- SMEAttrs(const Function &F, InferAttrsFromName Infer = InferAttrsFromName::No)
+ SMEAttrs(const Function &F, const AArch64TargetLowering *TLI = nullptr)
: SMEAttrs(F.getAttributes()) {
- if (Infer == InferAttrsFromName::Yes)
- addKnownFunctionAttrs(F.getName());
+ if (TLI)
+ addKnownFunctionAttrs(F.getName(), *TLI);
}
SMEAttrs(const AttributeList &L);
- SMEAttrs(StringRef FuncName) { addKnownFunctionAttrs(FuncName); };
+ SMEAttrs(StringRef FuncName, const AArch64TargetLowering &TLI) {
+ addKnownFunctionAttrs(FuncName, TLI);
+ };
- void set(unsigned M, bool Enable = true);
+ void set(unsigned M, bool Enable = true) {
+ if (Enable)
+ Bitmask |= M;
+ else
+ Bitmask &= ~M;
+#ifndef NDEBUG
+ validate();
+#endif
+ }
// Interfaces to query PSTATE.SM
bool hasStreamingBody() const { return Bitmask & SM_Body; }
@@ -146,7 +156,9 @@ public:
}
private:
- void addKnownFunctionAttrs(StringRef FuncName);
+ void addKnownFunctionAttrs(StringRef FuncName,
+ const AArch64TargetLowering &TLI);
+ void validate() const;
};
/// SMECallAttrs is a utility class to hold the SMEAttrs for a callsite. It has
@@ -163,7 +175,7 @@ public:
SMEAttrs Callsite = SMEAttrs::Normal)
: CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {}
- SMECallAttrs(const CallBase &CB);
+ SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI);
SMEAttrs &caller() { return CallerFn; }
SMEAttrs &callee() { return IsIndirect ? Callsite : CalledFn; }
@@ -194,7 +206,7 @@ public:
}
bool requiresEnablingZAAfterCall() const {
- return requiresLazySave() || requiresDisablingZABeforeCall();
+ return requiresDisablingZABeforeCall();
}
bool requiresPreservingAllZAState() const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 007b481..0059a86 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -439,10 +439,6 @@ struct AMDGPUPrintfRuntimeBindingPass
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
-struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> {
- PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-};
-
void initializeSIOptimizeExecMaskingPreRALegacyPass(PassRegistry &);
extern char &SIOptimizeExecMaskingPreRAID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index f266398..8e4b636 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1548,7 +1548,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
"gfx12",
- [FeatureFP64, FeatureAddressableLocalMemorySize65536, FeatureMIMG_R128,
+ [FeatureFP64, FeatureMIMG_R128,
FeatureFlatAddressSpace, Feature16BitInsts,
FeatureInv2PiInlineImm, FeatureApertureRegs,
FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts,
@@ -1977,6 +1977,7 @@ def FeatureISAVersion11_5_3 : FeatureSet<
def FeatureISAVersion12 : FeatureSet<
[FeatureGFX12,
+ FeatureAddressableLocalMemorySize65536,
FeatureLDSBankCount32,
FeatureDLInsts,
FeatureDot7Insts,
@@ -2019,6 +2020,7 @@ def FeatureISAVersion12_50 : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
FeatureCUStores,
+ FeatureAddressableLocalMemorySize327680,
FeatureCuMode,
Feature64BitLiterals,
FeatureLDSBankCount32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 2a324e5..69722bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -41,6 +41,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Compiler.h"
@@ -719,6 +720,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
IsLocal),
+ RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
+ OutContext, IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
OutContext, IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
@@ -733,6 +736,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutContext, IsLocal));
}
+ // Emit _dvgpr$ symbol when appropriate.
+ emitDVgprSymbol(MF);
+
if (isVerbose()) {
MCSectionELF *CommentSection =
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
@@ -803,6 +809,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" AccumOffset: " + getMCExprStr(AdjustedAccum), false);
}
+ if (AMDGPU::isGFX1250(STM)) {
+ const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
+ const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
+ CurrentProgramInfo.NamedBarCnt, BarBlkConst, Ctx);
+ const MCExpr *BarBlks =
+ MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
+ OutStreamer->emitRawComment(" NamedBarCnt: " + getMCExprStr(BarBlks),
+ false);
+ }
+
OutStreamer->emitRawComment(
" Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
@@ -875,6 +891,49 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
+// When appropriate, add a _dvgpr$ symbol, with the value of the function
+// symbol, plus an offset encoding one less than the number of VGPR blocks used
+// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
+// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
+// used by a front-end to have functions that are chained rather than called,
+// and a dispatcher that dynamically resizes the VGPR count before dispatching
+// to a function.
+void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ if (MFI.isDynamicVGPREnabled() &&
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS_Chain) {
+ MCContext &Ctx = MF.getContext();
+ unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
+ MCValue NumVGPRs;
+ if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
+ NumVGPRs, nullptr) ||
+ !NumVGPRs.isAbsolute()) {
+ llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
+ }
+ // Calculate number of VGPR blocks.
+ // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
+ unsigned NumBlocks =
+ divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);
+
+ if (NumBlocks > 8) {
+ OutContext.reportError({},
+ "too many DVGPR blocks for _dvgpr$ symbol for '" +
+ Twine(CurrentFnSym->getName()) + "'");
+ return;
+ }
+ unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
+ // Add to function symbol to create _dvgpr$ symbol.
+ const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
+ MCSymbolRefExpr::create(CurrentFnSym, Ctx),
+ MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx);
+ MCSymbol *DVgprFuncSym =
+ Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
+ OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
+ emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
+ emitLinkage(&MF.getFunction(), DVgprFuncSym);
+ }
+}
+
// TODO: Fold this into emitFunctionBodyStart.
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
// In the beginning all features are either 'Any' or 'NotSupported',
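To make the _dvgpr$ encoding above concrete, a small worked example with hypothetical numbers (not taken from the patch): with a dynamic VGPR block size of 16 and 41 VGPRs used, the function occupies ceil(41/16) = 3 blocks, so the _dvgpr$ symbol is the function symbol plus (3 - 1) << 3 = 16. A minimal sketch of that computation (the function name is mine):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Sketch of the offset folded into the _dvgpr$ symbol value (bits 5..3).
static uint32_t dvgprOffset(uint32_t NumVGPRs, uint32_t BlockSize) {
  uint32_t Used = std::max<uint32_t>(NumVGPRs, 1);          // treat 0 VGPRs as 1
  uint32_t NumBlocks = (Used + BlockSize - 1) / BlockSize;  // divideCeil
  assert(NumBlocks >= 1 && NumBlocks <= 8 && "at most 8 VGPR blocks");
  return (NumBlocks - 1) << 3;
}
// dvgprOffset(41, 16) == 16, so _dvgpr$foo = foo + 16 in this example.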
@@ -964,6 +1023,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DynamicCallStack =
MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
+ ProgInfo.NamedBarCnt = GetSymRefExpr(RIK::RIK_NumNamedBarrier);
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -997,89 +1057,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const Function &F = MF.getFunction();
// Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
- // dispatch registers are function args.
- unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
-
- if (isShader(F.getCallingConv())) {
- bool IsPixelShader =
- F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
-
- // Calculate the number of VGPR registers based on the SPI input registers
- uint32_t InputEna = 0;
- uint32_t InputAddr = 0;
- unsigned LastEna = 0;
-
- if (IsPixelShader) {
- // Note for IsPixelShader:
- // By this stage, all enabled inputs are tagged in InputAddr as well.
- // We will use InputAddr to determine whether the input counts against the
- // vgpr total and only use the InputEnable to determine the last input
- // that is relevant - if extra arguments are used, then we have to honour
- // the InputAddr for any intermediate non-enabled inputs.
- InputEna = MFI->getPSInputEnable();
- InputAddr = MFI->getPSInputAddr();
-
- // We only need to consider input args up to the last used arg.
- assert((InputEna || InputAddr) &&
- "PSInputAddr and PSInputEnable should "
- "never both be 0 for AMDGPU_PS shaders");
- // There are some rare circumstances where InputAddr is non-zero and
- // InputEna can be set to 0. In this case we default to setting LastEna
- // to 1.
- LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
- }
+ // dispatch registers are passed as function args.
+ unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
+ WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
- // FIXME: We should be using the number of registers determined during
- // calling convention lowering to legalize the types.
- const DataLayout &DL = F.getDataLayout();
- unsigned PSArgCount = 0;
- unsigned IntermediateVGPR = 0;
- for (auto &Arg : F.args()) {
- unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
- if (Arg.hasAttribute(Attribute::InReg)) {
- WaveDispatchNumSGPR += NumRegs;
- } else {
- // If this is a PS shader and we're processing the PS Input args (first
- // 16 VGPR), use the InputEna and InputAddr bits to define how many
- // VGPRs are actually used.
- // Any extra VGPR arguments are handled as normal arguments (and
- // contribute to the VGPR count whether they're used or not).
- if (IsPixelShader && PSArgCount < 16) {
- if ((1 << PSArgCount) & InputAddr) {
- if (PSArgCount < LastEna)
- WaveDispatchNumVGPR += NumRegs;
- else
- IntermediateVGPR += NumRegs;
- }
- PSArgCount++;
- } else {
- // If there are extra arguments we have to include the allocation for
- // the non-used (but enabled with InputAddr) input arguments
- if (IntermediateVGPR) {
- WaveDispatchNumVGPR += IntermediateVGPR;
- IntermediateVGPR = 0;
- }
- WaveDispatchNumVGPR += NumRegs;
- }
- }
- }
+ if (WaveDispatchNumSGPR) {
ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
- {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
+ {ProgInfo.NumSGPR,
+ MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
+ Ctx)},
+ Ctx);
+ }
+ if (WaveDispatchNumVGPR) {
ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
{ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
- } else if (isKernel(F.getCallingConv()) &&
- MFI->getNumKernargPreloadedSGPRs()) {
- // Consider cases where the total number of UserSGPRs with trailing
- // allocated preload SGPRs, is greater than the number of explicitly
- // referenced SGPRs.
- const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
- CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
- ProgInfo.NumSGPR =
- AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
}
// Adjust number of registers used to meet default/requested minimum/maximum
@@ -1168,7 +1163,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DX10Clamp = Mode.DX10Clamp;
unsigned LDSAlignShift;
- if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
+ if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
+ // LDS is allocated in 256 dword blocks.
+ LDSAlignShift = 10;
+ } else if (STM.getFeatureBits().test(
+ FeatureAddressableLocalMemorySize163840)) {
// LDS is allocated in 320 dword blocks.
LDSAlignShift = 11;
} else if (STM.getFeatureBits().test(
@@ -1205,8 +1204,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
CreateExpr(STM.getWavefrontSize()), Ctx),
CreateExpr(1ULL << ScratchAlignShift));
- if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
+ if (STM.supportsWGP()) {
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
+ }
+
+ if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
ProgInfo.MemOrdered = 1;
ProgInfo.FwdProgress = 1;
}
@@ -1264,6 +1266,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
+ if (AMDGPU::isGFX1250(STM))
+ ProgInfo.ComputePGMRSrc3 =
+ SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
+
ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize).second,
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 63589d2..9e854fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -54,6 +54,9 @@ private:
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
+ // When appropriate, add a _dvgpr$ symbol.
+ void emitDVgprSymbol(MachineFunction &MF);
+
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out,
const SIProgramInfo &KernelInfo,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 3d8d274..d1a5b4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
++i;
}
+ if (Info->getNumKernargPreloadedSGPRs())
+ Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
+
TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
return true;
@@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!determineAssignments(Assigner, SplitArgs, CCInfo))
return false;
+ if (IsEntryFunc) {
+ // This assumes the registers are allocated by CCInfo in ascending order
+ // with no gaps.
+ Info->setNumWaveDispatchSGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+ Info->setNumWaveDispatchVGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+ }
+
FormalArgHandler Handler(B, MRI);
if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
return false;
@@ -1464,9 +1476,22 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
if (Function *F = Info.CB->getCalledFunction())
if (F->isIntrinsic()) {
- assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
- "Unexpected intrinsic");
- return lowerChainCall(MIRBuilder, Info);
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::amdgcn_cs_chain:
+ return lowerChainCall(MIRBuilder, Info);
+ case Intrinsic::amdgcn_call_whole_wave:
+ Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;
+
+ // Get the callee from the original instruction, so it doesn't look like
+ // this is an indirect call.
+ Info.Callee = MachineOperand::CreateGA(
+ cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0);
+ Info.OrigArgs.erase(Info.OrigArgs.begin());
+ Info.IsVarArg = false;
+ break;
+ default:
+ llvm_unreachable("Unexpected intrinsic call");
+ }
}
if (Info.IsVarArg) {
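
A small self-contained sketch of the assumption documented in the new lowerFormalArguments hunk: when argument registers are allocated in ascending order with no gaps, the first unallocated index equals the number of wave-dispatch registers. The helper below is hypothetical and stands in for CCState's first-unallocated query.

// Hypothetical sketch of "first unallocated == number of dispatch registers";
// assumes ascending allocation with no gaps, as the comment in the diff notes.
#include <cassert>
#include <cstddef>
#include <vector>

static size_t firstUnallocated(const std::vector<bool> &Allocated) {
  size_t I = 0;
  while (I < Allocated.size() && Allocated[I])
    ++I;
  return I;
}

int main() {
  // Registers 0..3 allocated to formal arguments, the rest free.
  std::vector<bool> SGPRs{true, true, true, true, false, false};
  assert(firstUnallocated(SGPRs) == 4); // 4 wave-dispatch SGPRs
  // If allocation left a gap, the count would be an underestimate,
  // which is why the diff documents the no-gaps assumption.
  std::vector<bool> Gappy{true, false, true};
  assert(firstUnallocated(Gappy) == 1);
  return 0;
}
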
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index 74d1fae..d14b5ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -30,6 +30,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<
def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;
+def FeatureAddressableLocalMemorySize327680 : SubtargetFeatureAddressableLocalMemorySize<327680>;
class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
"wavefrontsize"#!shl(1, ValueLog2),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 9d6584a..04c4d00 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -76,6 +76,40 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
return false;
}
+static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
+ llvm::SelectionDAG *CurDAG,
+ const GCNSubtarget *Subtarget) {
+ if (!Subtarget->useRealTrue16Insts()) {
+ return Lo;
+ }
+
+ SDValue NewSrc;
+ SDLoc SL(Lo);
+
+ if (Lo->isDivergent()) {
+ SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ SL, Lo.getValueType()),
+ 0);
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
+ CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
+ CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};
+
+ NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
+ Src.getValueType(), Ops),
+ 0);
+ } else {
+ // The S_MOV is needed since Lo could still be a VGPR16.
+ // With S_MOV, isel inserts an "sgpr32 = copy vgpr16" and we rely on
+ // the fixvgpr2sgprcopy pass to legalize it.
+ NewSrc = SDValue(
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
+ 0);
+ }
+
+ return NewSrc;
+}
+
// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
@@ -1162,18 +1196,25 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
SDLoc SL(N);
bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
+ SDVTList VTList;
unsigned Opc;
- if (Subtarget->hasMADIntraFwdBug())
- Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
- : AMDGPU::V_MAD_U64_U32_gfx11_e64;
- else
- Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
+ if (Subtarget->hasMadU64U32NoCarry()) {
+ VTList = CurDAG->getVTList(MVT::i64);
+ Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
+ } else {
+ VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
+ if (Subtarget->hasMADIntraFwdBug()) {
+ Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
+ : AMDGPU::V_MAD_U64_U32_gfx11_e64;
+ } else {
+ Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
+ }
+ }
SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
- SDNode *Mad = CurDAG->getMachineNode(
- Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops);
+ SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
if (!SDValue(N, 0).use_empty()) {
SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
@@ -3412,8 +3453,10 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
// Really a scalar input. Just select from the low half of the register to
// avoid packing.
- if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
+ if (VecSize == Lo.getValueSizeInBits()) {
Src = Lo;
+ } else if (VecSize == 32) {
+ Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
} else {
assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
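
A quick illustrative model of the opcode and value-type selection in the new SelectMUL_LOHI hunk: the no-carry MAD variants produce only the 64-bit result, so the VT list drops the i1 carry-out. The struct below is a sketch, not SelectionDAG code.

// Illustrative-only model of the MUL_LOHI opcode / value-type choice above.
#include <cassert>
#include <string>

struct SubtargetFlags {
  bool HasMadU64U32NoCarry;
  bool HasMADIntraFwdBug;
};

struct Choice {
  std::string Opcode;
  unsigned NumResults; // i64 only, or i64 plus an i1 carry-out
};

static Choice selectMulLoHi(bool Signed, const SubtargetFlags &ST) {
  if (ST.HasMadU64U32NoCarry)
    return {Signed ? "V_MAD_NC_I64_I32_e64" : "V_MAD_NC_U64_U32_e64", 1};
  if (ST.HasMADIntraFwdBug)
    return {Signed ? "V_MAD_I64_I32_gfx11_e64" : "V_MAD_U64_U32_gfx11_e64", 2};
  return {Signed ? "V_MAD_I64_I32_e64" : "V_MAD_U64_U32_e64", 2};
}

int main() {
  Choice C = selectMulLoHi(/*Signed=*/false, {/*NoCarry=*/true, /*FwdBug=*/false});
  assert(C.Opcode == "V_MAD_NC_U64_U32_e64" && C.NumResults == 1);
  C = selectMulLoHi(/*Signed=*/true, {false, true});
  assert(C.Opcode == "V_MAD_I64_I32_gfx11_e64" && C.NumResults == 2);
  return 0;
}
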
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 64e68ab..8ccd8fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1512,9 +1512,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
const GlobalValue *GV = G->getGlobal();
if (!MFI->isModuleEntryFunction()) {
+ auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
if (std::optional<uint32_t> Address =
AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
+ if (IsNamedBarrier) {
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ MFI->recordNumNamedBarriers(Address.value(), BarCnt);
+ }
return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
+ } else if (IsNamedBarrier) {
+ llvm_unreachable("named barrier should have an assigned address");
}
}
@@ -1802,16 +1809,36 @@ std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
const EVT &LoVT, const EVT &HiVT,
SelectionDAG &DAG) const {
+ EVT VT = N.getValueType();
assert(LoVT.getVectorNumElements() +
(HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
- N.getValueType().getVectorNumElements() &&
+ VT.getVectorNumElements() &&
"More vector elements requested than available!");
SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
DAG.getVectorIdxConstant(0, DL));
- SDValue Hi = DAG.getNode(
- HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
- HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
- return std::pair(Lo, Hi);
+
+ unsigned LoNumElts = LoVT.getVectorNumElements();
+
+ if (HiVT.isVector()) {
+ unsigned HiNumElts = HiVT.getVectorNumElements();
+ if ((VT.getVectorNumElements() % HiNumElts) == 0) {
+ // Avoid creating an extract_subvector with an index that isn't a multiple
+ // of the result type's element count.
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
+ DAG.getConstant(LoNumElts, DL, MVT::i32));
+ return {Lo, Hi};
+ }
+
+ SmallVector<SDValue, 8> Elts;
+ DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
+ /*Count=*/HiNumElts);
+ SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
+ return {Lo, Hi};
+ }
+
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, HiVT, N,
+ DAG.getVectorIdxConstant(LoNumElts, DL));
+ return {Lo, Hi};
}
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
@@ -4002,7 +4029,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
- case Intrinsic::amdgcn_tanh: {
+ case Intrinsic::amdgcn_tanh:
+ case Intrinsic::amdgcn_prng_b32: {
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
SDValue Src = N->getOperand(1);
return Src.isUndef() ? Src : SDValue();
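
A plain-container sketch of the splitVector fallback added above: when the source vector's element count is not a multiple of the high part's, elements are extracted individually and rebuilt instead of issuing an EXTRACT_SUBVECTOR at a misaligned index. std::vector stands in for SelectionDAG values here.

// Plain-container sketch of the split logic; SelectionDAG specifics omitted.
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

using Vec = std::vector<int>;

static std::pair<Vec, Vec> splitVector(const Vec &N, size_t LoNumElts, size_t HiNumElts) {
  assert(LoNumElts + HiNumElts <= N.size() && "More elements requested than available!");
  Vec Lo(N.begin(), N.begin() + LoNumElts);
  Vec Hi;
  if (N.size() % HiNumElts == 0) {
    // Analogous to EXTRACT_SUBVECTOR: a contiguous slice is fine when the
    // total element count divides evenly by the high part's element count.
    Hi.assign(N.begin() + LoNumElts, N.begin() + LoNumElts + HiNumElts);
  } else {
    // Analogous to ExtractVectorElements + getBuildVector: copy element by
    // element when the slice would otherwise start at a misaligned index.
    for (size_t I = 0; I < HiNumElts; ++I)
      Hi.push_back(N[LoNumElts + I]);
  }
  return {Lo, Hi};
}

int main() {
  Vec V{0, 1, 2, 3, 4, 5, 6};           // 7 elements
  auto [Lo, Hi] = splitVector(V, 4, 3);  // 7 % 3 != 0 -> element-wise path
  assert(Lo.size() == 4 && Hi.size() == 3 && Hi[0] == 4);
  return 0;
}
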
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b7fd131..5d31eed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2368,8 +2368,10 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
return selectDSBvhStackIntrinsic(I);
+ case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var:
return selectNamedBarrierInit(I, IntrinsicID);
+ case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_get_named_barrier_state:
return selectNamedBarrierInst(I, IntrinsicID);
case Intrinsic::amdgcn_s_get_barrier_state:
@@ -5521,11 +5523,18 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
Register PtrBase;
int64_t ConstOffset;
- std::tie(PtrBase, ConstOffset) =
+ bool IsInBounds;
+ std::tie(PtrBase, ConstOffset, IsInBounds) =
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
- if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
- !isFlatScratchBaseLegal(Root.getReg())))
+ // Adding the offset to the base address with an immediate in a FLAT
+ // instruction must not change the memory aperture in which the address falls.
+ // Therefore we can only fold offsets from inbounds GEPs into FLAT
+ // instructions.
+ if (ConstOffset == 0 ||
+ (FlatVariant == SIInstrFlags::FlatScratch &&
+ !isFlatScratchBaseLegal(Root.getReg())) ||
+ (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
return Default;
unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
@@ -5577,7 +5586,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
// Match the immediate offset first, which canonically is moved as low as
// possible.
- std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ std::tie(PtrBase, ConstOffset, std::ignore) =
+ getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0) {
if (NeedIOffset &&
@@ -5760,7 +5770,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
// Match the immediate offset first, which canonically is moved as low as
// possible.
- std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ std::tie(PtrBase, ConstOffset, std::ignore) =
+ getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
@@ -5836,7 +5847,8 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
// Match the immediate offset first, which canonically is moved as low as
// possible.
- std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ std::tie(PtrBase, ConstOffset, std::ignore) =
+ getPtrBaseWithConstantOffset(Addr, *MRI);
Register OrigAddr = Addr;
if (ConstOffset != 0 &&
@@ -5942,7 +5954,8 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
Register PtrBase;
int64_t ConstOffset;
- std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
+ std::tie(PtrBase, ConstOffset, std::ignore) =
+ getPtrBaseWithConstantOffset(VAddr, *MRI);
if (ConstOffset != 0) {
if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
(!STI.privateMemoryResourceIsRangeChecked() ||
@@ -6181,8 +6194,8 @@ AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const
Register PtrBase;
int64_t Offset;
- std::tie(PtrBase, Offset) =
- getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+ std::tie(PtrBase, Offset, std::ignore) =
+ getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
if (Offset) {
if (isDSOffsetLegal(PtrBase, Offset)) {
@@ -6243,8 +6256,8 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
Register PtrBase;
int64_t Offset;
- std::tie(PtrBase, Offset) =
- getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+ std::tie(PtrBase, Offset, std::ignore) =
+ getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
if (Offset) {
int64_t OffsetValue0 = Offset;
@@ -6265,22 +6278,25 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
}
/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
-/// the base value with the constant offset. There may be intervening copies
-/// between \p Root and the identified constant. Returns \p Root, 0 if this does
-/// not match the pattern.
-std::pair<Register, int64_t>
+/// the base value with the constant offset, and whether the offset computation is
+/// known to be inbounds. There may be intervening copies between \p Root and
+/// the identified constant. Returns \p Root, 0, false if this does not match
+/// the pattern.
+std::tuple<Register, int64_t, bool>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
- Register Root, const MachineRegisterInfo &MRI) const {
+ Register Root, const MachineRegisterInfo &MRI) const {
MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
- return {Root, 0};
+ return {Root, 0, false};
MachineOperand &RHS = RootI->getOperand(2);
std::optional<ValueAndVReg> MaybeOffset =
getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
if (!MaybeOffset)
- return {Root, 0};
- return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
+ return {Root, 0, false};
+ bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
+ return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
+ IsInBounds};
}
static void addZeroImm(MachineInstrBuilder &MIB) {
@@ -6358,7 +6374,8 @@ AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
Register PtrBase;
int64_t Offset;
- std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
+ std::tie(PtrBase, Offset, std::ignore) =
+ getPtrBaseWithConstantOffset(Src, *MRI);
if (isUInt<32>(Offset)) {
Data.N0 = PtrBase;
Data.Offset = Offset;
@@ -6757,6 +6774,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
switch (IntrID) {
default:
llvm_unreachable("not a named barrier op");
+ case Intrinsic::amdgcn_s_barrier_join:
+ return AMDGPU::S_BARRIER_JOIN_IMM;
case Intrinsic::amdgcn_s_get_named_barrier_state:
return AMDGPU::S_GET_BARRIER_STATE_IMM;
};
@@ -6764,6 +6783,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
switch (IntrID) {
default:
llvm_unreachable("not a named barrier op");
+ case Intrinsic::amdgcn_s_barrier_join:
+ return AMDGPU::S_BARRIER_JOIN_M0;
case Intrinsic::amdgcn_s_get_named_barrier_state:
return AMDGPU::S_GET_BARRIER_STATE_M0;
};
@@ -6814,8 +6835,11 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
+ unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
+ ? AMDGPU::S_BARRIER_INIT_M0
+ : AMDGPU::S_BARRIER_SIGNAL_M0;
MachineInstrBuilder MIB;
- MIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_M0));
+ MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
I.eraseFromParent();
return true;
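
The call-site pattern introduced above, where getPtrBaseWithConstantOffset grows a third tuple element for the inbounds flag and uninterested callers discard it with std::ignore, can be sketched in isolation as follows; the integer stand-ins are hypothetical, not GlobalISel types.

// Hypothetical stand-in showing the std::tie / std::ignore plumbing used at
// the call sites above; no GlobalISel types involved.
#include <cassert>
#include <cstdint>
#include <tuple>

// Mirrors the new return shape: base id, constant offset, inbounds flag.
static std::tuple<int, int64_t, bool> getPtrBaseWithConstantOffset(int Root) {
  if (Root < 0)
    return {Root, 0, false}; // no G_PTR_ADD matched: Root, 0, false
  return {Root, 16, true};   // pretend we peeled off an inbounds +16
}

int main() {
  int Base;
  int64_t Offset;
  bool IsInBounds;
  // FLAT selection cares about the inbounds bit...
  std::tie(Base, Offset, IsInBounds) = getPtrBaseWithConstantOffset(7);
  assert(IsInBounds && Offset == 16);
  // ...while the MUBUF/DS call sites discard it with std::ignore.
  std::tie(Base, Offset, std::ignore) = getPtrBaseWithConstantOffset(7);
  assert(Offset == 16);
  return 0;
}
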
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index c9da419..0924396 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -156,6 +156,7 @@ private:
bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const;
+ bool selectSBarrierLeave(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
bool IsCanonicalizing = true,
@@ -295,7 +296,7 @@ private:
InstructionSelector::ComplexRendererFns
selectDSReadWrite2(MachineOperand &Root, unsigned size) const;
- std::pair<Register, int64_t>
+ std::tuple<Register, int64_t, bool>
getPtrBaseWithConstantOffset(Register Root,
const MachineRegisterInfo &MRI) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 523c66c..56113e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -545,7 +545,8 @@ public:
AU.addRequired<TargetPassConfig>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<UniformityInfoWrapperPass>();
- AU.setPreservesAll();
+ // Invalidates UniformityInfo
+ AU.setPreservesCFG();
}
bool runOnFunction(Function &F) override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 40d960e..600a130 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
@@ -137,6 +138,14 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
};
}
+// Mutation that changes the type to a scalar the same size as the memory type.
+static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
+ return std::make_pair(TypeIdx, LLT::scalar(MemSize));
+ };
+}
+
// Increase the number of vector elements to reach the next legal RegClass.
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
@@ -384,6 +393,16 @@ static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
};
}
+// True for a truncating store or an extending load with a data size larger
+// than 32 bits whose memory size is a power of 2.
+static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
+ return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
+ isPowerOf2_64(MemSize);
+ };
+}
+
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
@@ -1635,11 +1654,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// May need relegalization for the scalars.
return std::pair(0, EltTy);
})
- .minScalar(0, S32)
- .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
- .widenScalarToNextPow2(0)
- .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
- .lower();
+ .minScalar(0, S32)
+ .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
+ getScalarTypeFromMemDesc(0))
+ .widenScalarToNextPow2(0)
+ .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
+ .lower();
}
// FIXME: Unaligned accesses not lowered.
@@ -5653,7 +5673,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
unsigned SplitSize = 32;
if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
ST.hasDPALU_DPP() &&
- AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
+ AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
SplitSize = 64;
if (Size == SplitSize) {
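
The two helpers added above follow the usual pattern of returning a lambda over the legality query. A stripped-down sketch with a hypothetical Query struct (not the real LegalityQuery) shows the shape of the predicate and the mutation.

// Stripped-down sketch of the predicate/mutation lambda pattern; Query is a
// hypothetical struct, not llvm::LegalityQuery.
#include <cassert>
#include <cstdint>
#include <functional>
#include <utility>

struct Query {
  unsigned RegSizeInBits; // size of the value being stored/loaded
  unsigned MemSizeInBits; // size of the memory access (first MMO descriptor)
};

using Predicate = std::function<bool(const Query &)>;
using Mutation = std::function<std::pair<unsigned, unsigned>(const Query &)>;

static bool isPowerOf2(uint64_t V) { return V && (V & (V - 1)) == 0; }

// Analogue of isTruncStoreToSizePowerOf2: wide data, power-of-two memory size.
static Predicate truncStoreToPow2() {
  return [](const Query &Q) {
    return Q.RegSizeInBits > 32 && Q.RegSizeInBits > Q.MemSizeInBits &&
           isPowerOf2(Q.MemSizeInBits);
  };
}

// Analogue of getScalarTypeFromMemDesc: narrow the given type index to a
// scalar of the memory size.
static Mutation scalarFromMemSize(unsigned TypeIdx) {
  return [=](const Query &Q) { return std::make_pair(TypeIdx, Q.MemSizeInBits); };
}

int main() {
  Query Q{/*RegSizeInBits=*/64, /*MemSizeInBits=*/16};
  assert(truncStoreToPow2()(Q));
  assert(scalarFromMemSize(0)(Q) == std::make_pair(0u, 16u));
  return 0;
}
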
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 304e91e..139cad6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -599,8 +599,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) {
IRB.SetInsertPoint(&SI);
Value *IntV = fatPtrsToInts(V, Ty, IntTy, V->getName());
- for (auto *Dbg : at::getAssignmentMarkers(&SI))
- Dbg->setValue(IntV);
+ for (auto *Dbg : at::getDVRAssignmentMarkers(&SI))
+ Dbg->setRawLocation(ValueAsMetadata::get(IntV));
SI.setOperand(0, IntV);
return true;
@@ -1361,6 +1361,7 @@ public:
PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI);
PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP);
+ PtrParts visitPtrToAddrInst(PtrToAddrInst &PA);
PtrParts visitPtrToIntInst(PtrToIntInst &PI);
PtrParts visitIntToPtrInst(IntToPtrInst &IP);
PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I);
@@ -1954,6 +1955,21 @@ PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) {
return {nullptr, nullptr};
}
+PtrParts SplitPtrStructs::visitPtrToAddrInst(PtrToAddrInst &PA) {
+ Value *Ptr = PA.getPointerOperand();
+ if (!isSplitFatPtr(Ptr->getType()))
+ return {nullptr, nullptr};
+ IRB.SetInsertPoint(&PA);
+
+ auto [Rsrc, Off] = getPtrParts(Ptr);
+ Value *Res = IRB.CreateIntCast(Off, PA.getType(), /*isSigned=*/false);
+ copyMetadata(Res, &PA);
+ Res->takeName(&PA);
+ SplitUsers.insert(&PA);
+ PA.replaceAllUsesWith(Res);
+ return {nullptr, nullptr};
+}
+
PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) {
if (!isSplitFatPtr(IP.getType()))
return {nullptr, nullptr};
@@ -2350,8 +2366,12 @@ static bool containsBufferFatPointers(const Function &F,
BufferFatPtrToStructTypeMap *TypeMap) {
bool HasFatPointers = false;
for (const BasicBlock &BB : F)
- for (const Instruction &I : BB)
+ for (const Instruction &I : BB) {
HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType()));
+ // Catch null pointer constants in loads, stores, etc.
+ for (const Value *V : I.operand_values())
+ HasFatPointers |= (V->getType() != TypeMap->remapType(V->getType()));
+ }
return HasFatPointers;
}
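
The new visitPtrToAddrInst handler above reduces ptrtoaddr on a split buffer fat pointer to an unsigned integer cast of its offset half. A plain-integer sketch of that behaviour, with a hypothetical parts struct:

// Plain-integer sketch: ptrtoaddr on a split buffer fat pointer only sees the
// 32-bit offset half; the resource descriptor half is ignored.
#include <cassert>
#include <cstdint>

struct FatPtrParts {
  uint64_t RsrcLo, RsrcHi; // stand-in for the 128-bit resource descriptor
  uint32_t Offset;         // 32-bit offset half
};

// Analogue of the new visitPtrToAddrInst: an unsigned cast of the offset to
// the requested address width.
static uint64_t ptrToAddr(const FatPtrParts &P) {
  return static_cast<uint64_t>(P.Offset);
}

int main() {
  FatPtrParts P{/*RsrcLo=*/0, /*RsrcHi=*/0, /*Offset=*/0x1234u};
  assert(ptrToAddr(P) == 0x1234u);
  return 0;
}
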
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 6390853..6b3cdf5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -39,6 +39,8 @@ MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK,
return GOCS(".num_agpr");
case RIK_NumSGPR:
return GOCS(".numbered_sgpr");
+ case RIK_NumNamedBarrier:
+ return GOCS(".num_named_barrier");
case RIK_PrivateSegSize:
return GOCS(".private_seg_size");
case RIK_UsesVCC:
@@ -66,6 +68,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext);
MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
+ MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext);
auto assignMaxRegSym = [&OutContext](MCSymbol *Sym, int32_t RegCount) {
const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext);
@@ -75,6 +78,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
assignMaxRegSym(MaxVGPRSym, MaxVGPR);
assignMaxRegSym(MaxAGPRSym, MaxAGPR);
assignMaxRegSym(MaxSGPRSym, MaxSGPR);
+ assignMaxRegSym(MaxNamedBarrierSym, MaxNamedBarrier);
}
void MCResourceInfo::reset() { *this = MCResourceInfo(); }
@@ -97,6 +101,10 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr");
}
+MCSymbol *MCResourceInfo::getMaxNamedBarrierSymbol(MCContext &OutContext) {
+ return OutContext.getOrCreateSymbol("amdgpu.max_num_named_barrier");
+}
+
// Tries to flatten recursive call register resource gathering. Simple cycle
// avoiding dfs to find the constants in the propagated symbols.
// Assumes:
@@ -227,6 +235,10 @@ void MCResourceInfo::assignResourceInfoExpr(
case RIK_NumAGPR:
ArgExprs.push_back(flattenedCycleMax(CalleeValSym, RIK, OutContext));
break;
+ case RIK_NumNamedBarrier:
+ ArgExprs.push_back(MCSymbolRefExpr::create(
+ getMaxNamedBarrierSymbol(OutContext), OutContext));
+ break;
}
}
}
@@ -245,11 +257,13 @@ void MCResourceInfo::gatherResourceInfo(
MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
bool IsLocal = MF.getFunction().hasLocalLinkage();
+ MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext);
if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) {
addMaxVGPRCandidate(FRI.NumVGPR);
addMaxAGPRCandidate(FRI.NumAGPR);
addMaxSGPRCandidate(FRI.NumExplicitSGPR);
+ addMaxNamedBarrierCandidate(FRI.NumNamedBarrier);
}
const TargetMachine &TM = MF.getTarget();
@@ -288,6 +302,7 @@ void MCResourceInfo::gatherResourceInfo(
SetMaxReg(MaxVGPRSym, FRI.NumVGPR, RIK_NumVGPR);
SetMaxReg(MaxAGPRSym, FRI.NumAGPR, RIK_NumAGPR);
SetMaxReg(MaxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR);
+ SetMaxReg(MaxNamedBarrierSym, FRI.NumNamedBarrier, RIK_NumNamedBarrier);
{
// The expression for private segment size should be: FRI.PrivateSegmentSize
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index 297e93b..b605516 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -31,6 +31,7 @@ public:
RIK_NumVGPR,
RIK_NumAGPR,
RIK_NumSGPR,
+ RIK_NumNamedBarrier,
RIK_PrivateSegSize,
RIK_UsesVCC,
RIK_UsesFlatScratch,
@@ -43,6 +44,7 @@ private:
int32_t MaxVGPR = 0;
int32_t MaxAGPR = 0;
int32_t MaxSGPR = 0;
+ int32_t MaxNamedBarrier = 0;
// Whether the MCResourceInfo has been finalized through finalize(MCContext
// &). Should only be called once, at the end of AsmPrinting to assign MaxXGPR
@@ -75,6 +77,9 @@ public:
void addMaxSGPRCandidate(int32_t candidate) {
MaxSGPR = std::max(MaxSGPR, candidate);
}
+ void addMaxNamedBarrierCandidate(int32_t candidate) {
+ MaxNamedBarrier = std::max(MaxNamedBarrier, candidate);
+ }
MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK,
MCContext &OutContext, bool IsLocal);
@@ -90,6 +95,7 @@ public:
MCSymbol *getMaxVGPRSymbol(MCContext &OutContext);
MCSymbol *getMaxAGPRSymbol(MCContext &OutContext);
MCSymbol *getMaxSGPRSymbol(MCContext &OutContext);
+ MCSymbol *getMaxNamedBarrierSymbol(MCContext &OutContext);
/// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function
/// granularity. However, some resource info has to be assigned the call
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 0c82cace..664a15c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -107,6 +107,8 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
if (!BarAddr)
llvm_unreachable("named barrier should have an assigned address");
Entry.first->second = BarAddr.value();
+ unsigned BarCnt = DL.getTypeAllocSize(GV.getValueType()) / 16;
+ recordNumNamedBarriers(BarAddr.value(), BarCnt);
return BarAddr.value();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index b1022e4..fc64e16 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -49,6 +49,8 @@ protected:
// Flag to check dynamic LDS usage by kernel.
bool UsesDynamicLDS = false;
+ uint32_t NumNamedBarriers = 0;
+
// Kernels + shaders. i.e. functions called by the hardware and not called
// by other functions.
bool IsEntryFunction = false;
@@ -86,6 +88,12 @@ public:
return GDSSize;
}
+ void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt) {
+ NumNamedBarriers =
+ std::max(NumNamedBarriers, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1);
+ }
+ uint32_t getNumNamedBarriers() const { return NumNamedBarriers; }
+
bool isEntryFunction() const {
return IsEntryFunction;
}
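
The running maximum in recordNumNamedBarriers above takes the barrier slot index from bits [8:4] of the assigned LDS address and adds the global's barrier count minus one. A standalone check of just that arithmetic (the struct is illustrative, not the MachineFunction info):

// Standalone check of the named-barrier count formula used above.
#include <cassert>
#include <cstdint>

struct BarrierCounter {
  uint32_t NumNamedBarriers = 0;

  // GVAddr is the LDS-absolute address assigned to the barrier global;
  // BarCnt is its size in 16-byte barrier slots (alloc size / 16).
  void record(uint32_t GVAddr, unsigned BarCnt) {
    uint32_t Last = ((GVAddr & 0x1ff) >> 4) + BarCnt - 1;
    if (Last > NumNamedBarriers)
      NumNamedBarriers = Last;
  }
};

int main() {
  BarrierCounter C;
  C.record(/*GVAddr=*/0x10, /*BarCnt=*/1); // slot 1 -> highest used barrier 1
  C.record(/*GVAddr=*/0x30, /*BarCnt=*/2); // slots 3..4 -> highest used 4
  assert(C.NumNamedBarriers == 4);
  return 0;
}
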
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index aa72c3e..dfe7c53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -352,7 +352,10 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
case Intrinsic::amdgcn_s_barrier_signal:
case Intrinsic::amdgcn_s_barrier_signal_var:
case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_barrier_wait:
+ case Intrinsic::amdgcn_s_barrier_leave:
case Intrinsic::amdgcn_s_get_barrier_state:
case Intrinsic::amdgcn_wave_barrier:
case Intrinsic::amdgcn_sched_barrier:
@@ -381,7 +384,7 @@ bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
AAResults *AA) {
MemorySSAWalker *Walker = MSSA->getWalker();
SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
- SmallSet<MemoryAccess *, 8> Visited;
+ SmallPtrSet<MemoryAccess *, 8> Visited;
MemoryLocation Loc(MemoryLocation::get(Load));
LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index b6c6d92..6ddfa38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,7 +29,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
-MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 3a37518..28d5400 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -134,8 +134,8 @@ static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
- SmallSet<const Value *, 32> WorkSet;
- SmallSet<const Value *, 32> Visited;
+ SmallPtrSet<const Value *, 32> WorkSet;
+ SmallPtrSet<const Value *, 32> Visited;
if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
if (isGlobalAddr(MO))
WorkSet.insert(MO);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
index 4009451..90c4f4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
@@ -109,7 +109,7 @@ AMDGPUPreloadKernArgProlog::AMDGPUPreloadKernArgProlog(MachineFunction &MF)
TRI(*ST.getRegisterInfo()) {}
bool AMDGPUPreloadKernArgProlog::run() {
- if (!ST.hasKernargPreload())
+ if (!ST.needsKernArgPreloadProlog())
return false;
unsigned NumKernArgPreloadSGPRs = MFI.getNumKernargPreloadedSGPRs();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
index 984c1ee..a386fe6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
@@ -37,6 +37,11 @@ static cl::opt<unsigned> KernargPreloadCount(
"amdgpu-kernarg-preload-count",
cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
+static cl::opt<bool>
+ EnableKernargPreload("amdgpu-kernarg-preload",
+ cl::desc("Enable preload kernel arguments to SGPRs"),
+ cl::init(true));
+
namespace {
class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
@@ -275,6 +280,9 @@ AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
: ModulePass(ID), TM(TM) {}
static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) {
+ if (!EnableKernargPreload)
+ return false;
+
SmallVector<Function *, 4> FunctionsToErase;
bool Changed = false;
for (auto &F : M) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 5a6ad40..8c56c21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -724,10 +724,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});
addRulesForGOpcs({G_PTR_ADD})
- .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
- .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
- .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}})
- .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}});
+ .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}})
+ .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}})
+ .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}})
+ .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}});
addRulesForGOpcs({G_INTTOPTR})
.Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 868b1a2..2379296 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3342,6 +3342,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
assert(OpdMapper.getVRegs(1).empty());
constrainOpWithReadfirstlane(B, MI, 1);
return;
+ case Intrinsic::amdgcn_s_barrier_join:
+ constrainOpWithReadfirstlane(B, MI, 1);
+ return;
+ case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var:
constrainOpWithReadfirstlane(B, MI, 1);
constrainOpWithReadfirstlane(B, MI, 2);
@@ -5515,6 +5519,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_s_sleep_var:
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
break;
+ case Intrinsic::amdgcn_s_barrier_join:
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ break;
+ case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var:
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
index e2e5c57..d2ec7dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
@@ -195,13 +195,17 @@ bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) {
// Delete FeatureWavefrontSize32 functions for
// gfx9 and below targets that don't support the mode.
- // gfx10+ is implied to support both wave32 and 64 features.
+ // gfx10, gfx11, gfx12 are implied to support both wave32 and 64 features.
// They are not in the feature set. So, we need a separate check
- if (ST->getGeneration() < AMDGPUSubtarget::GFX10 &&
- ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) {
+ if (!ST->supportsWave32() && ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) {
reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize32);
return true;
}
+ // gfx125x targets only support FeatureWavefrontSize32.
+ if (!ST->supportsWave64() && ST->hasFeature(AMDGPU::FeatureWavefrontSize64)) {
+ reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize64);
+ return true;
+ }
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 8101c68..0ea9add 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -142,6 +142,8 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
MRI.isLiveIn(MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
+ Info.NumNamedBarrier = MFI->getNumNamedBarriers();
+
// Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
// instructions aren't used to access the scratch buffer. Inline assembly may
// need it though.
@@ -241,6 +243,9 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
if (!RC || !TRI.isVGPRClass(RC))
continue;
+ if (MI.isCall() || MI.isMetaInstruction())
+ continue;
+
unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32);
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index acfff96..9ae3bb3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -35,6 +35,7 @@ public:
int32_t NumVGPR = 0;
int32_t NumAGPR = 0;
int32_t NumExplicitSGPR = 0;
+ int32_t NumNamedBarrier = 0;
uint64_t CalleeSegmentSize = 0;
uint64_t PrivateSegmentSize = 0;
bool UsesVCC = false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index f580f43..20b5fd9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -57,27 +57,47 @@ public:
TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
LIS(LIS) {}
+ // TODO: Remove this restriction
+ bool mfmaHasSameSrc2AndDstReg(const MachineInstr &MI) const {
+ const MachineOperand *Src2 = TII.getNamedOperand(MI, AMDGPU::OpName::src2);
+ const MachineOperand *Dst = TII.getNamedOperand(MI, AMDGPU::OpName::vdst);
+ return Src2->getReg() == Dst->getReg() &&
+ Src2->getSubReg() == Dst->getSubReg();
+ }
+
+ bool isRewriteCandidate(const MachineInstr &MI) const {
+ return TII.isMAI(MI) &&
+ AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1 &&
+ mfmaHasSameSrc2AndDstReg(MI);
+ }
+
/// Compute the register class constraints based on the uses of \p Reg,
- /// excluding uses from \p ExceptMI. This should be nearly identical to
+ /// excluding MFMA uses from which can be rewritten to change the register
+ /// class constraint. This should be nearly identical to
/// MachineRegisterInfo::recomputeRegClass.
const TargetRegisterClass *
- recomputeRegClassExcept(Register Reg, const TargetRegisterClass *OldRC,
- const TargetRegisterClass *NewRC,
- const MachineInstr *ExceptMI) const;
+ recomputeRegClassExceptRewritable(Register Reg,
+ const TargetRegisterClass *OldRC,
+ const TargetRegisterClass *NewRC) const;
bool run(MachineFunction &MF) const;
};
const TargetRegisterClass *
-AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExcept(
+AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
Register Reg, const TargetRegisterClass *OldRC,
- const TargetRegisterClass *NewRC, const MachineInstr *ExceptMI) const {
+ const TargetRegisterClass *NewRC) const {
// Accumulate constraints from all uses.
for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
// Apply the effect of the given operand to NewRC.
MachineInstr *MI = MO.getParent();
- if (MI == ExceptMI)
+
+ // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the
+ // effects of rewrite candidates. It just so happens that we can use either
+ // AGPR or VGPR in src0/src1, so don't bother checking the constraint
+ // effects of the individual operands.
+ if (isRewriteCandidate(*MI))
continue;
unsigned OpNo = &MO - &MI->getOperand(0);
@@ -96,8 +116,10 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
return false;
// Early exit if no AGPRs were assigned.
- if (!LRM.isPhysRegUsed(AMDGPU::AGPR0))
+ if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) {
+ LLVM_DEBUG(dbgs() << "skipping function that did not allocate AGPRs\n");
return false;
+ }
bool MadeChange = false;
@@ -109,17 +131,25 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
// Find AV_* registers assigned to AGPRs.
const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg);
- if (!TRI.isVectorSuperClass(VirtRegRC))
+ if (!TRI.hasAGPRs(VirtRegRC))
continue;
- const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
- if (!TRI.isAGPRClass(AssignedRC))
- continue;
+ const TargetRegisterClass *AssignedRC = VirtRegRC;
+ if (TRI.hasVGPRs(VirtRegRC)) {
+ // If this is an AV register, we have to check if the actual assignment is
+ // to an AGPR
+ AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
+ if (!TRI.isAGPRClass(AssignedRC))
+ continue;
+ }
LiveInterval &LI = LIS.getInterval(VReg);
// TODO: Test multiple uses
for (VNInfo *VNI : LI.vnis()) {
+ if (VNI->isPHIDef() || VNI->isUnused())
+ continue;
+
MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
// TODO: Handle SplitKit produced copy bundles for partially defined
@@ -183,10 +213,13 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
// first place, as well as need to assign another register, and need to
// figure out where to put them. The live range splitting is smarter than
// anything we're doing here, so trust it did something reasonable.
- const TargetRegisterClass *Src2ExceptRC = recomputeRegClassExcept(
- Src2->getReg(), Src2VirtRegRC, VirtRegRC, CopySrcMI);
- if (!Src2ExceptRC)
+ const TargetRegisterClass *Src2ExceptRC =
+ recomputeRegClassExceptRewritable(Src2->getReg(), Src2VirtRegRC,
+ VirtRegRC);
+ if (!Src2ExceptRC) {
+ LLVM_DEBUG(dbgs() << "Could not recompute the regclass\n");
continue;
+ }
const TargetRegisterClass *NewSrc2ConstraintRC =
TII.getRegClass(TII.get(AGPROp), Src2->getOperandNo(), &TRI, MF);
@@ -196,8 +229,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
const TargetRegisterClass *NewSrc2RC =
TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC);
if (!NewSrc2RC) {
- // TODO: This is ignoring ther rewritable uses. e.g. a rewritable MFMA
- // using a rewritable MFMA can be rewritten as a pair.
LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2->getReg(), &TRI)
<< " are incompatible with replacement class\n");
continue;
@@ -208,8 +239,19 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
CopySrcMI->setDesc(TII.get(AGPROp));
- // TODO: Is replacing too aggressive, fixup these instructions only?
- MRI.replaceRegWith(CopySrcReg, VReg);
+ // Perform replacement of the register, rewriting the rewritable uses.
+ for (MachineInstr &UseMI :
+ make_early_inc_range(MRI.reg_instructions(CopySrcReg))) {
+ if (TII.isMAI(UseMI)) {
+ // Note the register we need to rewrite may still appear in src0/src1,
+ // but that's fine since those can use A or V anyway.
+ int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(UseMI.getOpcode());
+ if (ReplacementOp != -1)
+ UseMI.setDesc(TII.get(ReplacementOp));
+ }
+
+ UseMI.substituteRegister(CopySrcReg, VReg, AMDGPU::NoSubRegister, TRI);
+ }
LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *CopySrcMI);
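
The recomputeRegClassExceptRewritable change above amounts to intersecting the constraints of every non-rewritable use and skipping MFMA uses that will be switched to their AGPR form anyway. A toy bitmask model of that intersection (unrelated to TargetRegisterClass):

// Toy model of recomputeRegClassExceptRewritable: intersect constraints of all
// uses, skipping uses that we already know how to rewrite to an AGPR form.
#include <cassert>
#include <vector>

enum RegBanks : unsigned { VGPR = 1u << 0, AGPR = 1u << 1, AV = VGPR | AGPR };

struct Use {
  unsigned AllowedBanks; // constraint this use places on the register
  bool Rewritable;       // e.g. an MFMA with an AGPR-dst/src2 variant
};

static unsigned recomputeClassExceptRewritable(unsigned StartBanks,
                                               const std::vector<Use> &Uses) {
  unsigned Banks = StartBanks;
  for (const Use &U : Uses) {
    if (U.Rewritable)
      continue; // its constraint changes once we switch it to the AGPR form
    Banks &= U.AllowedBanks;
    if (!Banks)
      break; // no common class: the rewrite has to be abandoned
  }
  return Banks;
}

int main() {
  // One VGPR-only use that is a rewritable MFMA, one flexible AV use:
  std::vector<Use> Uses{{VGPR, /*Rewritable=*/true}, {AV, /*Rewritable=*/false}};
  assert(recomputeClassExceptRewritable(AV, Uses) == AV);
  // A genuinely VGPR-only, non-rewritable use pins the class to VGPR:
  Uses.push_back({VGPR, /*Rewritable=*/false});
  assert(recomputeClassExceptRewritable(AV, Uses) == VGPR);
  return 0;
}
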
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 10b8606..7be1899 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -378,6 +378,7 @@ foreach intr = AMDGPUImageDimAtomicIntrinsics in
def : SourceOfDivergence<intr>;
def : SourceOfDivergence<int_amdgcn_dead>;
+def : SourceOfDivergence<int_amdgcn_call_whole_wave>;
class AlwaysUniform<Intrinsic intr> {
Intrinsic Intr = intr;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
index b60ded3..56aa3f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
@@ -195,7 +195,7 @@ bool AMDGPUSetWavePriority::run(MachineFunction &MF) {
// Lower the priority on edges where control leaves blocks from which
// the VMEM loads are reachable.
- SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
+ SmallPtrSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
for (MachineBasicBlock &MBB : MF) {
if (MBBInfos[&MBB].MayReachVMEMLoad) {
if (MBB.succ_empty())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c1f1703..e393aa19 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -848,8 +848,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
if (Level == OptimizationLevel::O0)
return;
- PM.addPass(AMDGPUUnifyMetadataPass());
-
// We don't want to run internalization at per-module stage.
if (InternalizeSymbols && !isLTOPreLink(Phase)) {
PM.addPass(InternalizePass(mustPreserveGV));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
deleted file mode 100644
index e400491..0000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// \file
-// This pass that unifies multiple OpenCL metadata due to linking.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-namespace {
-
- namespace kOCLMD {
-
- const char SpirVer[] = "opencl.spir.version";
- const char OCLVer[] = "opencl.ocl.version";
- const char UsedExt[] = "opencl.used.extensions";
- const char UsedOptCoreFeat[] = "opencl.used.optional.core.features";
- const char CompilerOptions[] = "opencl.compiler.options";
- const char LLVMIdent[] = "llvm.ident";
-
- } // end namespace kOCLMD
-
- /// Unify version metadata.
- /// \return true if changes are made.
- /// Assume the named metadata has operands each of which is a pair of
- /// integer constant, e.g.
- /// !Name = {!n1, !n2}
- /// !n1 = {i32 1, i32 2}
- /// !n2 = {i32 2, i32 0}
- /// Keep the largest version as the sole operand if PickFirst is false.
- /// Otherwise pick it from the first value, representing kernel module.
- bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) {
- auto *NamedMD = M.getNamedMetadata(Name);
- if (!NamedMD || NamedMD->getNumOperands() <= 1)
- return false;
- MDNode *MaxMD = nullptr;
- auto MaxVer = 0U;
- for (auto *VersionMD : NamedMD->operands()) {
- assert(VersionMD->getNumOperands() == 2);
- auto *CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0));
- auto VersionMajor = CMajor->getZExtValue();
- auto *CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1));
- auto VersionMinor = CMinor->getZExtValue();
- auto Ver = (VersionMajor * 100) + (VersionMinor * 10);
- if (Ver > MaxVer) {
- MaxVer = Ver;
- MaxMD = VersionMD;
- }
- if (PickFirst)
- break;
- }
- NamedMD->eraseFromParent();
- NamedMD = M.getOrInsertNamedMetadata(Name);
- NamedMD->addOperand(MaxMD);
- return true;
- }
-
- /// Unify version metadata.
- /// \return true if changes are made.
- /// Assume the named metadata has operands each of which is a list e.g.
- /// !Name = {!n1, !n2}
- /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}}
- /// !n2 = !{!"cl_khr_image"}
- /// Combine it into a single list with unique operands.
- bool unifyExtensionMD(Module &M, StringRef Name) {
- auto *NamedMD = M.getNamedMetadata(Name);
- if (!NamedMD || NamedMD->getNumOperands() == 1)
- return false;
-
- SmallVector<Metadata *, 4> All;
- for (auto *MD : NamedMD->operands())
- for (const auto &Op : MD->operands())
- if (!llvm::is_contained(All, Op.get()))
- All.push_back(Op.get());
-
- NamedMD->eraseFromParent();
- NamedMD = M.getOrInsertNamedMetadata(Name);
- for (const auto &MD : All)
- NamedMD->addOperand(MDNode::get(M.getContext(), MD));
-
- return true;
- }
-
- /// Unify multiple OpenCL metadata due to linking.
- bool unifyMetadataImpl(Module &M) {
- const char *Vers[] = {kOCLMD::SpirVer, kOCLMD::OCLVer};
- const char *Exts[] = {kOCLMD::UsedExt, kOCLMD::UsedOptCoreFeat,
- kOCLMD::CompilerOptions, kOCLMD::LLVMIdent};
-
- bool Changed = false;
-
- for (auto &I : Vers)
- Changed |= unifyVersionMD(M, I, true);
-
- for (auto &I : Exts)
- Changed |= unifyExtensionMD(M, I);
-
- return Changed;
- }
-
- } // end anonymous namespace
-
- PreservedAnalyses AMDGPUUnifyMetadataPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- return unifyMetadataImpl(M) ? PreservedAnalyses::none()
- : PreservedAnalyses::all();
- }
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 0d2feeb..9514732 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -5052,11 +5052,13 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
if (DppCtrlIdx >= 0) {
unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
- if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
- AMDGPU::isDPALU_DPP(MII.get(Opc))) {
- // DP ALU DPP is supported for row_newbcast only on GFX9*
+ if (!AMDGPU::isLegalDPALU_DPPControl(getSTI(), DppCtrl) &&
+ AMDGPU::isDPALU_DPP(MII.get(Opc), getSTI())) {
+ // DP ALU DPP is supported for row_newbcast only on GFX9* and row_share
+ // only on GFX12.
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
- Error(S, "DP ALU dpp only supports row_newbcast");
+ Error(S, isGFX12() ? "DP ALU dpp only supports row_share"
+ : "DP ALU dpp only supports row_newbcast");
return false;
}
}
@@ -6268,8 +6270,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
ExprVal, ValRange);
} else if (ID == ".amdhsa_workgroup_processor_mode") {
- if (IVersion.Major < 10)
- return Error(IDRange.Start, "directive requires gfx10+", IDRange);
+ if (!supportsWGP(getSTI()))
+ return Error(IDRange.Start,
+ "directive unsupported on " + getSTI().getCPU(), IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal,
ValRange);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c466f9c..dc9dd22 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -114,7 +114,6 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUTargetTransformInfo.cpp
AMDGPUWaitSGPRHazards.cpp
AMDGPUUnifyDivergentExitNodes.cpp
- AMDGPUUnifyMetadata.cpp
R600MachineCFGStructurizer.cpp
GCNCreateVOPD.cpp
GCNDPPCombine.cpp
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index fb7d634..070de00 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -2422,8 +2422,18 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
"must be zero on gfx10 or gfx11");
}
- // Bits [14-30].
- CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4,
+ // Bits [14-16]
+ if (isGFX1250()) {
+ PRINT_DIRECTIVE(".amdhsa_named_barrier_count",
+ COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT);
+ } else {
+ CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_GFX120_RESERVED4,
+ "COMPUTE_PGM_RSRC3",
+ "must be zero on gfx10+");
+ }
+
+ // Bits [17-30].
+ CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED5,
"COMPUTE_PGM_RSRC3", "must be zero on gfx10+");
// Bits [31].
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index d5d1074..f5d4384 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1274,7 +1274,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in {
defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>;
}
-let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus in {
+let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in {
defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VReg_96>;
defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VReg_64>;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index f9a907a..184929a 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -421,6 +421,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
DPPInst.addImm(ByteSelOpr->getImm());
}
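+    // Carry the original instruction's bitop3 operand, if any, over to the new
+    // DPP instruction.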
+ if (MachineOperand *BitOp3 =
+ TII->getNamedOperand(OrigMI, AMDGPU::OpName::bitop3)) {
+ assert(AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::bitop3));
+ DPPInst.add(*BitOp3);
+ }
}
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
@@ -544,11 +549,17 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
return false;
}
- if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
- MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
- auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
- assert(DppCtrl && DppCtrl->isImm());
- if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) {
+ auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
+ assert(DppCtrl && DppCtrl->isImm());
+ unsigned DppCtrlVal = DppCtrl->getImm();
+ if ((MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
+ MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp)) {
+ if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP)) {
+ LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move is unsupported\n");
+ // Split it.
+ return false;
+ }
+ if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal)) {
LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported"
" control value\n");
// Let it split, then control may become legal.
@@ -704,6 +715,20 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
break;
}
+ if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP) &&
+ AMDGPU::isDPALU_DPP32BitOpc(OrigOp)) {
+ LLVM_DEBUG(dbgs() << " " << OrigMI
+                      << " failed: DP ALU DPP is not supported\n");
+ break;
+ }
+
+ if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal) &&
+ AMDGPU::isDPALU_DPP(TII->get(OrigOp), *ST)) {
+ LLVM_DEBUG(dbgs() << " " << OrigMI
+                      << " failed: not a valid 64-bit DPP control value\n");
+ break;
+ }
+
LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
if (Use == Src0) {
if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 96cb5ae..a3b64ae 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1200,6 +1200,14 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
fixRequiredExportPriority(MI);
+ if (ST.requiresWaitIdleBeforeGetReg())
+ fixGetRegWaitIdle(MI);
+ if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
+ fixDsAtomicAsyncBarrierArriveB64(MI);
+ if (ST.hasScratchBaseForwardingHazard())
+ fixScratchBaseForwardingHazard(MI);
+ if (ST.setRegModeNeedsVNOPs())
+ fixSetRegMode(MI);
}
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
@@ -1350,6 +1358,9 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
return (Decoded.DsCnt == 0);
}
default:
+ assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
+ MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
+ "unexpected wait count instruction");
// SOPP instructions cannot mitigate the hazard.
if (TII->isSOPP(MI))
return false;
@@ -1731,7 +1742,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(0x0fff);
+ .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
return true;
}
@@ -1781,7 +1792,7 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
SIInstrInfo::isEXP(I) ||
(I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- I.getOperand(0).getImm() == 0x0fff))
+ AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
return HazardExpired;
// Track registers writes
@@ -2239,19 +2250,7 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
return true;
- switch (MI.getOpcode()) {
- case AMDGPU::S_WAITCNT:
- case AMDGPU::S_WAITCNT_VSCNT:
- case AMDGPU::S_WAITCNT_VMCNT:
- case AMDGPU::S_WAITCNT_EXPCNT:
- case AMDGPU::S_WAITCNT_LGKMCNT:
- case AMDGPU::S_WAIT_IDLE:
- return true;
- default:
- break;
- }
-
- return false;
+ return SIInstrInfo::isWaitcnt(MI.getOpcode());
};
return FPAtomicToDenormModeWaitStates -
@@ -3428,3 +3427,125 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
return true;
}
+
+bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
+ if (!isSGetReg(MI->getOpcode()))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ switch (getHWReg(TII, *MI)) {
+ default:
+ return false;
+ case AMDGPU::Hwreg::ID_STATUS:
+ case AMDGPU::Hwreg::ID_STATE_PRIV:
+ case AMDGPU::Hwreg::ID_EXCP_FLAG_PRIV:
+ case AMDGPU::Hwreg::ID_EXCP_FLAG_USER:
+ break;
+ }
+
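+  // Emit an S_WAITCNT_DEPCTR (s_wait_alu) with an immediate of 0, i.e. wait
+  // for every dependency counter to reach zero, right before the s_getreg.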
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0);
+ return true;
+}
+
+bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
+ if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
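+  // Bracket the instruction with S_WAITCNT_DEPCTR(0xFFE3), one inserted before
+  // it and one after it.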
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0xFFE3);
+ BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0xFFE3);
+
+ return true;
+}
+
+bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
+  // No reason to check this in pre-RA scheduling; SGPRs have to be allocated
+  // for the hazard to trigger.
+ if (!IsHazardRecognizerMode)
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
+ const int FlatScrBaseWaitStates = 10;
+
+ bool ReadsFlatScrLo =
+ MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
+ bool ReadsFlatScrHi =
+ MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
+ if (isSGetReg(MI->getOpcode())) {
+ switch (getHWReg(TII, *MI)) {
+ default:
+ break;
+ case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
+ ReadsFlatScrLo = true;
+ break;
+ case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
+ ReadsFlatScrHi = true;
+ break;
+ }
+ }
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ auto IsRegDefHazard = [&](Register Reg) -> bool {
+ DenseSet<const MachineBasicBlock *> Visited;
+ auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
+ return MI.modifiesRegister(Reg, TRI);
+ };
+
+    // This abuses the notion of waitstates: instead of counting waitstates, it
+    // returns 1 if the instruction writes an SGPR and 0 otherwise.
+ auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
+ if (!TII->isSALU(MI) && !TII->isVALU(MI))
+ return 0;
+ for (const MachineOperand &MO : MI.all_defs()) {
+ if (TRI->isSGPRReg(MRI, MO.getReg()))
+ return 1;
+ }
+ return 0;
+ };
+
+ auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
+ if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
+ unsigned Wait = MI.getOperand(0).getImm();
+ if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 &&
+ AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0)
+ return true;
+ }
+ return SgprWrites >= FlatScrBaseWaitStates;
+ };
+
+ return ::getWaitStatesSince(
+ IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
+ 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
+ };
+
+ if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
+ !IsRegDefHazard(AMDGPU::SGPR102)) &&
+ (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
+ !IsRegDefHazard(AMDGPU::SGPR103)))
+ return false;
+
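+  // Break the hazard with an S_WAITCNT_DEPCTR whose SA_SDST and VA_SDST fields
+  // are both zero.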
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldVaSdst(
+ AMDGPU::DepCtr::encodeFieldSaSdst(0), 0));
+ return true;
+}
+
+bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
+ if (!isSSetReg(MI->getOpcode()) ||
+ MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
+ return false;
+
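+  // Insert two V_NOPs immediately before the s_setreg that writes MODE.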
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
+ return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index f796eeae..67beffa 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -110,6 +110,10 @@ private:
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);
+ bool fixGetRegWaitIdle(MachineInstr *MI);
+ bool fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI);
+ bool fixScratchBaseForwardingHazard(MachineInstr *MI);
+ bool fixSetRegMode(MachineInstr *MI);
int checkMAIHazards(MachineInstr *MI);
int checkMAIHazards908(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 334afd3..ef63acc 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
////////////////////////////////////////////////////////////////////////////////
// GCNRPTarget
-GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
+GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
const Function &F = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF);
+ setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F));
}
GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs,
- const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
- setRegLimits(NumSGPRs, NumVGPRs, MF);
+ const MachineFunction &MF, const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
+ setTarget(NumSGPRs, NumVGPRs);
}
GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
+ const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
- setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
- ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF);
+ setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
+ ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize));
}
-void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs,
- const MachineFunction &MF) {
+void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs);
MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs);
- MaxUnifiedVGPRs =
- ST.hasGFX90AInsts()
- ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs)
- : 0;
+ if (UnifiedRF) {
+ unsigned DynamicVGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ MaxUnifiedVGPRs =
+ std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs);
+ } else {
+ MaxUnifiedVGPRs = 0;
+ }
}
-bool GCNRPTarget::isSaveBeneficial(Register Reg,
- const MachineRegisterInfo &MRI) const {
+bool GCNRPTarget::isSaveBeneficial(Register Reg) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
@@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg,
return RP.getSGPRNum() > MaxSGPRs;
unsigned NumVGPRs =
SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum();
- return isVGPRBankSaveBeneficial(NumVGPRs);
+ // The addressable limit must always be respected.
+ if (NumVGPRs > MaxVGPRs)
+ return true;
+    // For unified RFs, the combined VGPR usage limit must be respected as well.
+ return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs;
}
bool GCNRPTarget::satisfied() const {
- if (RP.getSGPRNum() > MaxSGPRs)
+ if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs)
return false;
- if (RP.getVGPRNum(false) > MaxVGPRs &&
- (!CombineVGPRSavings || !satisifiesVGPRBanksTarget()))
+ if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs)
return false;
- return satisfiesUnifiedTarget();
+ return true;
}
///////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index ea33a22..a9c58bb 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -186,20 +186,22 @@ public:
/// Sets up the target such that the register pressure starting at \p RP does
/// not show register spilling on function \p MF (w.r.t. the function's
/// mininum target occupancy).
- GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings = false);
+ GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP);
/// Sets up the target such that the register pressure starting at \p RP does
/// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p
/// MF.
GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings = false);
+ const GCNRegPressure &RP);
/// Sets up the target such that the register pressure starting at \p RP does
/// not prevent achieving an occupancy of at least \p Occupancy on function
/// \p MF.
GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings = false);
+ const GCNRegPressure &RP);
+
+ /// Changes the target (same semantics as constructor).
+ void setTarget(unsigned NumSGPRs, unsigned NumVGPRs);
const GCNRegPressure &getCurrentRP() const { return RP; }
@@ -207,7 +209,7 @@ public:
/// Determines whether saving virtual register \p Reg will be beneficial
/// towards achieving the RP target.
- bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const;
+ bool isSaveBeneficial(Register Reg) const;
/// Saves virtual register \p Reg with lanemask \p Mask.
void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) {
@@ -227,15 +229,15 @@ public:
if (Target.MaxUnifiedVGPRs) {
OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs
<< " VGPRs (unified)";
- } else if (Target.CombineVGPRSavings) {
- OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/'
- << 2 * Target.MaxVGPRs << " VGPRs (combined target)";
}
return OS;
}
#endif
private:
+ const MachineFunction &MF;
+ const bool UnifiedRF;
+
/// Current register pressure.
GCNRegPressure RP;
@@ -246,29 +248,10 @@ private:
/// Target number of overall VGPRs for subtargets with unified RFs. Always 0
/// for subtargets with non-unified RFs.
unsigned MaxUnifiedVGPRs;
- /// Whether we consider that the register allocator will be able to swap
- /// between ArchVGPRs and AGPRs by copying them to a super register class.
- /// Concretely, this allows savings in one of the VGPR banks to help toward
- /// savings in the other VGPR bank.
- bool CombineVGPRSavings;
-
- inline bool satisifiesVGPRBanksTarget() const {
- assert(CombineVGPRSavings && "only makes sense with combined savings");
- return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs;
- }
-
- /// Always satisified when the subtarget doesn't have a unified RF.
- inline bool satisfiesUnifiedTarget() const {
- return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs;
- }
-
- inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const {
- return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() ||
- (CombineVGPRSavings && !satisifiesVGPRBanksTarget());
- }
- void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs,
- const MachineFunction &MF);
+ GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF)
+ : MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()),
+ RP(RP) {}
};
///////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 96d5668..254b75b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
}
/// Allows to easily filter for this stage's debug output.
-#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
+#define REMAT_PREFIX "[PreRARemat] "
+#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
@@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() {
rematerialize();
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
- REMAT_DEBUG(
- dbgs() << "Retrying function scheduling with new min. occupancy of "
- << AchievedOcc << " from rematerializing (original was "
- << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
+ REMAT_DEBUG({
+ dbgs() << "Retrying function scheduling with new min. occupancy of "
+ << AchievedOcc << " from rematerializing (original was "
+ << DAG.MinOccupancy;
+ if (TargetOcc)
+ dbgs() << ", target was " << *TargetOcc;
+ dbgs() << ")\n";
+ });
+
if (AchievedOcc > DAG.MinOccupancy) {
DAG.MinOccupancy = AchievedOcc;
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
- mayCauseSpilling(WavesAfter) ||
- (IncreaseOccupancy && WavesAfter < TargetOcc);
+ mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
}
bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
@@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
}
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
- REMAT_DEBUG({
- dbgs() << "Collecting rematerializable instructions in ";
- MF.getFunction().printAsOperand(dbgs(), false);
- dbgs() << '\n';
- });
+ const Function &F = MF.getFunction();
// Maps optimizable regions (i.e., regions at minimum and register-limited
// occupancy, or regions with spilling) to the target RP we would like to
// reach.
DenseMap<unsigned, GCNRPTarget> OptRegions;
- const Function &F = MF.getFunction();
- unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
-
- std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
- const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
- const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
- const unsigned MaxSGPRsIncOcc =
- ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
- const unsigned MaxVGPRsIncOcc =
- ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
- IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
-
- // Collect optimizable regions. If there is spilling in any region we will
- // just try to reduce spilling. Otherwise we will try to increase occupancy by
- // one in the whole function.
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- GCNRegPressure &RP = DAG.Pressure[I];
- // We allow ArchVGPR or AGPR savings to count as savings of the other kind
- // of VGPR only when trying to eliminate spilling. We cannot do this when
- // trying to increase occupancy since VGPR class swaps only occur later in
- // the register allocator i.e., the scheduler will not be able to reason
- // about these savings and will not report an increase in the achievable
- // occupancy, triggering rollbacks.
- GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP,
- /*CombineVGPRSavings=*/true);
- if (!Target.satisfied() && IncreaseOccupancy) {
- // There is spilling in the region and we were so far trying to increase
- // occupancy. Strop trying that and focus on reducing spilling.
- IncreaseOccupancy = false;
- OptRegions.clear();
- } else if (IncreaseOccupancy) {
- // There is no spilling in the region, try to increase occupancy.
- Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP,
- /*CombineVGPRSavings=*/false);
+ unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
+ auto ResetTargetRegions = [&]() {
+ OptRegions.clear();
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ const GCNRegPressure &RP = DAG.Pressure[I];
+ GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
+ if (!Target.satisfied())
+ OptRegions.insert({I, Target});
}
- if (!Target.satisfied())
- OptRegions.insert({I, Target});
- }
- if (OptRegions.empty())
- return false;
+ };
-#ifndef NDEBUG
- if (IncreaseOccupancy) {
- REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy
- << ") in regions:\n");
+ ResetTargetRegions();
+ if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
+ // In addition to register usage being above addressable limits, occupancy
+    // below the minimum is treated as "spilling" as well.
+ TargetOcc = std::nullopt;
} else {
- REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy ("
- << WavesPerEU.first << ") in regions:\n");
- }
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
- REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n');
+ // There is no spilling and room to improve occupancy; set up "increased
+ // occupancy targets" for all regions.
+ TargetOcc = DAG.MinOccupancy + 1;
+ unsigned VGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
+ MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
+ ResetTargetRegions();
}
-#endif
-
- // When we are reducing spilling, the target is the minimum target number of
- // waves/EU determined by the subtarget. In cases where either one of
- // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current
- // minimum region occupancy may be higher than the latter.
- TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
- : std::max(DAG.MinOccupancy, WavesPerEU.first);
+ REMAT_DEBUG({
+ dbgs() << "Analyzing ";
+ MF.getFunction().printAsOperand(dbgs(), false);
+ dbgs() << ": ";
+ if (OptRegions.empty()) {
+ dbgs() << "no objective to achieve, occupancy is maximal at "
+ << MFI.getMaxWavesPerEU();
+ } else if (!TargetOcc) {
+ dbgs() << "reduce spilling (minimum target occupancy is "
+ << MFI.getMinWavesPerEU() << ')';
+ } else {
+ dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
+ << TargetOcc;
+ }
+ dbgs() << '\n';
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
+ dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
+ << '\n';
+ }
+ }
+ });
+ if (OptRegions.empty())
+ return false;
// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
@@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
bool &Progress) -> bool {
GCNRPTarget &Target = OptIt->getSecond();
- if (!Target.isSaveBeneficial(Reg, DAG.MRI))
+ if (!Target.isSaveBeneficial(Reg))
return false;
Progress = true;
Target.saveReg(Reg, Mask, DAG.MRI);
@@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
}
}
- if (IncreaseOccupancy) {
+ if (TargetOcc) {
// We were trying to increase occupancy but failed, abort the stage.
REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
Rematerializations.clear();
@@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() {
// All regions impacted by at least one rematerialization must be rescheduled.
// Maximum pressure must also be recomputed for all regions where it changed
// non-predictably and checked against the target occupancy.
- AchievedOcc = TargetOcc;
+ unsigned DynamicVGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ AchievedOcc = MFI.getMaxWavesPerEU();
for (auto &[I, OriginalRP] : ImpactedRegions) {
bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
RescheduleRegions[I] = !IsEmptyRegion;
@@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() {
}
}
DAG.Pressure[I] = RP;
- AchievedOcc = std::min(
- AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
- ->getDynamicVGPRBlockSize()));
+ AchievedOcc =
+ std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
}
@@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
// which case we do not want to rollback either (the rescheduling was already
// reverted in PreRARematStage::shouldRevertScheduling in such cases).
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
- if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
+ if (!TargetOcc || MaxOcc >= *TargetOcc)
return;
REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 32139a9..790370f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -470,15 +470,12 @@ private:
/// After successful stage initialization, indicates which regions should be
/// rescheduled.
BitVector RescheduleRegions;
- /// Target occupancy the stage estimates is reachable through
- /// rematerialization. Greater than or equal to the pre-stage min occupancy.
- unsigned TargetOcc;
+ /// The target occupancy the stage is trying to achieve. Empty when the
+ /// objective is spilling reduction.
+ std::optional<unsigned> TargetOcc;
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
/// Smaller than or equal to the target occupancy.
unsigned AchievedOcc;
- /// Whether the stage is attempting to increase occupancy in the abscence of
- /// spilling.
- bool IncreaseOccupancy;
/// Returns whether remat can reduce spilling or increase function occupancy
/// by 1 through rematerialization. If it can do one, collects instructions in
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f47ddf5..2a8385d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -390,7 +390,11 @@ public:
/// the original value.
bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
- bool supportsWGP() const { return getGeneration() >= GFX10; }
+ bool supportsWGP() const {
+ if (GFX1250Insts)
+ return false;
+ return getGeneration() >= GFX10;
+ }
bool hasIntClamp() const {
return HasIntClamp;
@@ -1341,6 +1345,10 @@ public:
bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
+ bool setRegModeNeedsVNOPs() const {
+ return GFX1250Insts && getGeneration() == GFX12;
+ }
+
/// Return if operations acting on VGPR tuples require even alignment.
bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }
@@ -1573,6 +1581,12 @@ public:
// extended VA to 57 bits.
bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }
+ // \returns true if the target needs to create a prolog for backward
+ // compatibility when preloading kernel arguments.
+ bool needsKernArgPreloadProlog() const {
+ return hasKernargPreload() && !GFX1250Insts;
+ }
+
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
@@ -1722,6 +1736,10 @@ public:
/// unit requirement.
unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+ bool supportsWave32() const { return getGeneration() >= GFX10; }
+
+ bool supportsWave64() const { return !hasGFX1250Insts(); }
+
bool isWave32() const {
return getWavefrontSize() == 32;
}
@@ -1785,11 +1803,11 @@ public:
// \returns true if the subtarget has a hazard requiring an "s_nop 0"
// instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
- bool requiresNopBeforeDeallocVGPRs() const {
- // Currently all targets that support the dealloc VGPRs message also require
- // the nop.
- return true;
- }
+ bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; }
+
+ // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
+ // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
+ bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; }
bool isDynamicVGPREnabled() const { return DynamicVGPR; }
unsigned getDynamicVGPRBlockSize() const {
@@ -1801,6 +1819,18 @@ public:
// to the same register.
return false;
}
+
+  // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything and
+  // must be surrounded by S_WAIT_ALU(0xFFE3).
+ bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
+ return getGeneration() == GFX12;
+ }
+
+  // Requires s_wait_alu(0) between a write of s102/s103 and a read of
+  // src_flat_scratch_base.
+ bool hasScratchBaseForwardingHazard() const {
+ return GFX1250Insts && getGeneration() == GFX12;
+ }
};
class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index ee8683a..aafbdc2 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -976,8 +976,10 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
unsigned Imm = MI->getOperand(OpNo).getImm();
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- if (!AMDGPU::isLegalDPALU_DPPControl(Imm) && AMDGPU::isDPALU_DPP(Desc)) {
- O << " /* DP ALU dpp only supports row_newbcast */";
+ if (!AMDGPU::isLegalDPALU_DPPControl(STI, Imm) &&
+ AMDGPU::isDPALU_DPP(Desc, STI)) {
+ O << " /* DP ALU dpp only supports "
+ << (isGFX12(STI) ? "row_share" : "row_newbcast") << " */";
return;
}
if (Imm <= DppCtrl::QUAD_PERM_LAST) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index f358084..61f6732 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -389,6 +389,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) &&
// Matrix B format operand reuses op_sel_hi.
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_fmt) &&
+ // Matrix B scale operand reuses op_sel_hi.
+ !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_scale) &&
// Matrix B reuse operand reuses op_sel_hi.
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) {
Encoding |= getImplicitOpSelHiEncoding(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 68302f0..197de12 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -26,7 +26,6 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -277,10 +276,10 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
- const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
- const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
- const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
- const MCSymbol *HasIndirectCall) {
+ const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier,
+ const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC,
+ const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack,
+ const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) {
#define PRINT_RES_INFO(ARG) \
OS << "\t.set "; \
ARG->print(OS, getContext().getAsmInfo()); \
@@ -291,6 +290,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
PRINT_RES_INFO(NumVGPR);
PRINT_RES_INFO(NumAGPR);
PRINT_RES_INFO(NumExplicitSGPR);
+ PRINT_RES_INFO(NumNamedBarrier);
PRINT_RES_INFO(PrivateSegmentSize);
PRINT_RES_INFO(UsesVCC);
PRINT_RES_INFO(UsesFlatScratch);
@@ -563,11 +563,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PrintField(KD.compute_pgm_rsrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split");
- if (IVersion.Major >= 10) {
+ if (AMDGPU::supportsWGP(STI))
PrintField(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE,
".amdhsa_workgroup_processor_mode");
+ if (IVersion.Major >= 10) {
PrintField(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED,
@@ -885,7 +886,7 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
if (!SymbolELF->isBindingSet())
SymbolELF->setBinding(ELF::STB_GLOBAL);
- if (SymbolELF->declareCommon(Size, Alignment, true)) {
+ if (SymbolELF->declareCommon(Size, Alignment)) {
report_fatal_error("Symbol: " + Symbol->getName() +
" redeclared as different type");
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 9c49020..22afcde 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -62,10 +62,10 @@ public:
virtual void EmitMCResourceInfo(
const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
- const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
- const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
- const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
- const MCSymbol *HasIndirectCall) {};
+ const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier,
+ const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC,
+ const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack,
+ const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) {};
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
const MCSymbol *MaxAGPR,
@@ -141,14 +141,12 @@ public:
void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
- void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
- const MCSymbol *NumExplicitSGPR,
- const MCSymbol *PrivateSegmentSize,
- const MCSymbol *UsesVCC,
- const MCSymbol *UsesFlatScratch,
- const MCSymbol *HasDynamicallySizedStack,
- const MCSymbol *HasRecursion,
- const MCSymbol *HasIndirectCall) override;
+ void EmitMCResourceInfo(
+ const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+ const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier,
+ const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC,
+ const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack,
+ const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) override;
void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR,
const MCSymbol *MaxSGPR) override;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 2d0102f..7c01903 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -197,7 +197,7 @@ enum ClassFlags : unsigned {
namespace AMDGPU {
enum OperandType : unsigned {
- /// Operands with register or 32-bit immediate
+ /// Operands with register, 32-bit, or 64-bit immediate
OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
OPERAND_REG_IMM_INT64,
OPERAND_REG_IMM_INT16,
@@ -407,7 +407,7 @@ enum CPol {
SCAL = 1 << 11, // Scale offset bit
- ALL = TH | SCOPE,
+ ALL = TH | SCOPE | NV,
// Helper bits
TH_TYPE_LOAD = 1 << 7, // TH_LOAD policy
@@ -440,6 +440,7 @@ enum Id { // Message ID, width(4) [3:0].
ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10
ID_GS_ALLOC_REQ = 9, // added in GFX9
ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11
+ ID_SAVEWAVE_HAS_TDM = 10, // added in GFX1250
ID_GET_DDID = 11, // added in GFX10, removed in GFX11
ID_SYSMSG = 15,
@@ -513,6 +514,7 @@ enum Id { // HwRegCode, (6) [5:0]
ID_HW_ID2 = 24,
ID_POPS_PACKER = 25,
ID_PERF_SNAPSHOT_DATA_gfx11 = 27,
+ ID_IB_STS2 = 28,
ID_SHADER_CYCLES = 29,
ID_SHADER_CYCLES_HI = 30,
ID_DVGPR_ALLOC_LO = 31,
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index f018f77..dce4e6f 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -460,7 +460,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
// List of clobbering instructions.
SmallVector<MachineInstr*, 8> Clobbers;
// List of instructions marked for deletion.
- SmallSet<MachineInstr*, 8> MergedInstrs;
+ SmallPtrSet<MachineInstr *, 8> MergedInstrs;
bool Changed = false;
@@ -808,7 +808,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
bool AllAGPRUses = true;
SetVector<const MachineInstr *> worklist;
- SmallSet<const MachineInstr *, 4> Visited;
+ SmallPtrSet<const MachineInstr *, 4> Visited;
SetVector<MachineInstr *> PHIOperands;
worklist.insert(&MI);
Visited.insert(&MI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b327fb..561019b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments(
if (!IsKernel) {
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+
+ // This assumes the registers are allocated by CCInfo in ascending order
+ // with no gaps.
+ Info->setNumWaveDispatchSGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+ Info->setNumWaveDispatchVGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+ } else if (Info->getNumKernargPreloadedSGPRs()) {
+ Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
}
SmallVector<SDValue, 16> Chains;
@@ -6612,7 +6621,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
unsigned SplitSize = 32;
if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
ST->hasDPALU_DPP() &&
- AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
+ AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
SplitSize = 64;
auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
@@ -10816,6 +10825,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain),
0);
+ case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var: {
// these two intrinsics have two operands: barrier pointer and member count
SDValue Chain = Op->getOperand(0);
@@ -10823,6 +10833,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue BarOp = Op->getOperand(2);
SDValue CntOp = Op->getOperand(3);
SDValue M0Val;
+ unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
+ ? AMDGPU::S_BARRIER_INIT_M0
+ : AMDGPU::S_BARRIER_SIGNAL_M0;
// extract the BarrierID from bits 4-9 of BarOp
SDValue BarID;
BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
@@ -10846,8 +10859,40 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
- auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_M0, DL,
- Op->getVTList(), Ops);
+ auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+ return SDValue(NewMI, 0);
+ }
+ case Intrinsic::amdgcn_s_barrier_join: {
+    // this intrinsic has a single operand: the barrier pointer
+ SDValue Chain = Op->getOperand(0);
+ SmallVector<SDValue, 2> Ops;
+ SDValue BarOp = Op->getOperand(2);
+ unsigned Opc;
+
+ if (isa<ConstantSDNode>(BarOp)) {
+ uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
+ Opc = AMDGPU::S_BARRIER_JOIN_IMM;
+
+ // extract the BarrierID from bits 4-9 of the immediate
+ unsigned BarID = (BarVal >> 4) & 0x3F;
+ SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
+ Ops.push_back(K);
+ Ops.push_back(Chain);
+ } else {
+ Opc = AMDGPU::S_BARRIER_JOIN_M0;
+
+ // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
+ SDValue M0Val;
+ M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
+ DAG.getShiftAmountConstant(4, MVT::i32, DL));
+ M0Val =
+ SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
+ DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+ 0);
+ Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
+ }
+
+ auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
return SDValue(NewMI, 0);
}
case Intrinsic::amdgcn_s_prefetch_data: {
@@ -11495,9 +11540,22 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
return FastLowered;
SDLoc SL(Op);
+ EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
+ SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
+ SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
+
+ if (VT == MVT::bf16) {
+ SDValue ExtDiv =
+ DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
+ return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
+ DAG.getTargetConstant(0, SL, MVT::i32));
+ }
+
+ assert(VT == MVT::f16);
+
// a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
// b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
// r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
@@ -11514,9 +11572,6 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
// We will use ISD::FMA on targets that don't support ISD::FMAD.
unsigned FMADOpCode =
isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
-
- SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
- SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
SDValue Rcp =
DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
@@ -15684,7 +15739,7 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);
EVT VT = N->getValueType(0);
- if (VT != MVT::f16 || !Subtarget->has16BitInsts())
+ if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
return SDValue();
SDValue LHS = N->getOperand(0);
@@ -16849,6 +16904,11 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
+    // Check if we cannot determine the bit size of the given value type. This
+    // can happen, for example, when we have an empty struct (size 0):
+    // `call void asm "", "v"({} poison)`.
+ if (VT == MVT::Other)
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
const unsigned BitWidth = VT.getSizeInBits();
switch (Constraint[0]) {
default:
@@ -16897,13 +16957,26 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
}
break;
}
- // We actually support i128, i16 and f16 as inline parameters
- // even if they are not reported as legal
- if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
- VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
- return std::pair(0U, RC);
+ } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
+ const unsigned BitWidth = VT.getSizeInBits();
+ switch (BitWidth) {
+ case 16:
+ RC = &AMDGPU::AV_32RegClass;
+ break;
+ default:
+ RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
+ if (!RC)
+ return std::pair(0U, nullptr);
+ break;
+ }
}
+ // We actually support i128, i16 and f16 as inline parameters
+ // even if they are not reported as legal
+ if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
+ VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
+ return std::pair(0U, RC);
+
auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
if (Kind != '\0') {
if (Kind == 'v') {
@@ -16916,7 +16989,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
if (RC) {
if (NumRegs > 1) {
- if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 > RC->getNumRegs())
+ if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
return std::pair(0U, nullptr);
uint32_t Width = NumRegs * 32;
@@ -16988,6 +17061,9 @@ SITargetLowering::getConstraintType(StringRef Constraint) const {
case 'a':
return C_RegisterClass;
}
+ } else if (Constraint.size() == 2) {
+ if (Constraint == "VA")
+ return C_RegisterClass;
}
if (isImmConstraint(Constraint)) {
return C_Other;
@@ -17727,23 +17803,9 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
/// Return if a flat address space atomicrmw can access private memory.
static bool flatInstrMayAccessPrivate(const Instruction *I) {
- const MDNode *NoaliasAddrSpaceMD =
- I->getMetadata(LLVMContext::MD_noalias_addrspace);
- if (!NoaliasAddrSpaceMD)
- return true;
-
- for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
- ++I) {
- auto *Low = mdconst::extract<ConstantInt>(
- NoaliasAddrSpaceMD->getOperand(2 * I + 0));
- if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
- auto *High = mdconst::extract<ConstantInt>(
- NoaliasAddrSpaceMD->getOperand(2 * I + 1));
- return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
- }
- }
-
- return true;
+ const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
+ return !MD ||
+ !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
}
TargetLowering::AtomicExpansionKind
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 4b48fc4..343e455 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2341,6 +2341,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
case AMDGPU::S_MEMREALTIME:
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
+ case AMDGPU::S_BARRIER_LEAVE:
case AMDGPU::S_GET_BARRIER_STATE_M0:
case AMDGPU::S_GET_BARRIER_STATE_IMM:
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 89d9b0d..50964a9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -473,6 +473,7 @@ class VIMAGE_VSAMPLE_Common <bits<8> op> : Enc96 {
let Inst{4} = r128;
let Inst{5} = d16;
let Inst{6} = a16;
+ let Inst{7} = cpol{5}; // nv
let Inst{21-14} = op;
let Inst{25-22} = dmask;
let Inst{39-32} = vdata;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 19e6bcf..cc4bee0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2616,9 +2616,9 @@ std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
- if (ST.hasMovB64() &&
+ if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
AMDGPU::isLegalDPALU_DPPControl(
- getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
+ ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
return std::pair(&MI, nullptr);
}
@@ -2905,7 +2905,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
MachineBasicBlock &RestoreBB,
const DebugLoc &DL, int64_t BrOffset,
RegScavenger *RS) const {
- assert(RS && "RegScavenger required for long branching");
assert(MBB.empty() &&
"new block should be inserted for expanding unconditional branch");
assert(MBB.pred_size() == 1);
@@ -4241,6 +4240,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
MI.getOpcode() == AMDGPU::S_SETPRIO ||
+ MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
changesVGPRIndexingMode(MI);
}
@@ -4267,12 +4267,15 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
if (MI.memoperands_empty())
return true;
- // TODO (?): Does this need to be taught how to read noalias.addrspace ?
-
// See if any memory operand specifies an address space that involves scratch.
return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
unsigned AS = Memop->getAddrSpace();
- return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
+ return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
+ *MD, AMDGPUAS::PRIVATE_ADDRESS);
+ }
+ return AS == AMDGPUAS::PRIVATE_ADDRESS;
});
}
@@ -5433,7 +5436,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
- !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
+ !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
+ AMDGPU::isDPALU_DPP(Desc, ST)) {
ErrInfo = "Invalid dpp_ctrl value: "
"DP ALU dpp only support row_newbcast";
return false;
@@ -9225,7 +9229,7 @@ bool SIInstrInfo::isHighLatencyDef(int Opc) const {
(isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
}
-unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
+Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
int &FrameIndex) const {
const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
if (!Addr || !Addr->isFI())
@@ -9238,7 +9242,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
}
-unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
+Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
int &FrameIndex) const {
const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
assert(Addr && Addr->isFI());
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 6b9403f..12ffae7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -996,6 +996,11 @@ public:
bool isBarrier(unsigned Opcode) const {
return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT ||
+ Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
+ Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
+ Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||
+ Opcode == AMDGPU::S_BARRIER_LEAVE ||
+ Opcode == AMDGPU::S_BARRIER_LEAVE_IMM ||
Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER;
}
@@ -1051,7 +1056,7 @@ public:
}
}
- bool isWaitcnt(unsigned Opcode) const {
+ static bool isWaitcnt(unsigned Opcode) {
switch (getNonSoftWaitcntOpcode(Opcode)) {
case AMDGPU::S_WAITCNT:
case AMDGPU::S_WAITCNT_VSCNT:
@@ -1402,8 +1407,8 @@ public:
return get(pseudoToMCOpcode(Opcode));
}
- unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const;
- unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const;
+ Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const;
+ Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const;
Register isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index c552f1a..c425d97 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1954,6 +1954,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
!eq(VT, v2f16) : VCSrc_v2f16,
!eq(VT, v2bf16) : VCSrc_v2bf16,
!eq(VT, f32) : VCSrc_f32,
+ !eq(VT, f64) : VCSrc_f64,
!eq(VT, v2i32) : VCSrc_v2b32,
1 : VCSrc_b32);
}
@@ -2707,7 +2708,6 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
isModifierType<Src2VT>.ret,
HasOMod);
field bit HasNeg = HasModifiers;
- field bit HasMatrixReuse = 0;
field bit HasMatrixFMT = 0;
field bit HasMatrixScale = 0;
field bit HasMatrixReuse = 0;
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index b49c5a9..e204d6b 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -87,6 +87,8 @@ enum InstClassEnum {
GLOBAL_STORE_SADDR,
FLAT_LOAD,
FLAT_STORE,
+ FLAT_LOAD_SADDR,
+ FLAT_STORE_SADDR,
GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
GLOBAL_STORE // any CombineInfo, they are only ever returned by
// getCommonInstClass.
@@ -354,6 +356,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
case AMDGPU::FLAT_LOAD_DWORD:
case AMDGPU::FLAT_STORE_DWORD:
+ case AMDGPU::FLAT_LOAD_DWORD_SADDR:
+ case AMDGPU::FLAT_STORE_DWORD_SADDR:
return 1;
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
@@ -367,6 +371,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
case AMDGPU::FLAT_LOAD_DWORDX2:
case AMDGPU::FLAT_STORE_DWORDX2:
+ case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
@@ -380,6 +386,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
case AMDGPU::FLAT_LOAD_DWORDX3:
case AMDGPU::FLAT_STORE_DWORDX3:
+ case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
return 3;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
@@ -393,6 +401,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
case AMDGPU::FLAT_LOAD_DWORDX4:
case AMDGPU::FLAT_STORE_DWORDX4:
+ case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
return 4;
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
@@ -575,6 +585,16 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
return GLOBAL_STORE_SADDR;
+ case AMDGPU::FLAT_LOAD_DWORD_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
+ return FLAT_LOAD_SADDR;
+ case AMDGPU::FLAT_STORE_DWORD_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
+ return FLAT_STORE_SADDR;
}
}
@@ -661,6 +681,16 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
+ case AMDGPU::FLAT_LOAD_DWORD_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
+ return AMDGPU::FLAT_LOAD_DWORD_SADDR;
+ case AMDGPU::FLAT_STORE_DWORD_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
+ return AMDGPU::FLAT_STORE_DWORD_SADDR;
}
}
@@ -776,6 +806,14 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORD_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
+ case AMDGPU::FLAT_STORE_DWORD_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
Result.SAddr = true;
[[fallthrough]];
case AMDGPU::GLOBAL_LOAD_DWORD:
@@ -1875,6 +1913,28 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case 4:
return AMDGPU::FLAT_STORE_DWORDX4;
}
+ case FLAT_LOAD_SADDR:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
+ case 3:
+ return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
+ case 4:
+ return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
+ }
+ case FLAT_STORE_SADDR:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
+ case 3:
+ return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
+ case 4:
+ return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
+ }
case MIMG:
assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
"No overlaps");
@@ -2508,12 +2568,14 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
case FLAT_LOAD:
+ case FLAT_LOAD_SADDR:
case GLOBAL_LOAD:
case GLOBAL_LOAD_SADDR:
NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
case FLAT_STORE:
+ case FLAT_STORE_SADDR:
case GLOBAL_STORE:
case GLOBAL_STORE_SADDR:
NewMI = mergeFlatStorePair(CI, Paired, Where->I);
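The SILoadStoreOptimizer hunks above teach the pass to classify the FLAT_*_SADDR opcodes, report their dword widths, and pick a wider opcode when two neighbouring accesses are merged. The snippet below is a minimal standalone sketch of just that width rule; the enum and helper are illustrative stand-ins, not the real AMDGPU opcode tables or CombineInfo machinery.

// Sketch: choose the merged FLAT ..._SADDR load opcode from the combined
// dword width of two mergeable accesses. Names and values are illustrative.
#include <cassert>
#include <cstdint>
#include <optional>

enum class FlatSaddrLoad : uint8_t { DWORD = 1, DWORDX2 = 2, DWORDX3 = 3, DWORDX4 = 4 };

std::optional<FlatSaddrLoad> mergeWidths(unsigned WidthA, unsigned WidthB) {
  switch (WidthA + WidthB) {
  case 2:
    return FlatSaddrLoad::DWORDX2;
  case 3:
    return FlatSaddrLoad::DWORDX3;
  case 4:
    return FlatSaddrLoad::DWORDX4;
  default:
    return std::nullopt; // no single wider encoding; mirrors `return 0` above
  }
}

int main() {
  assert(mergeWidths(1, 1) == FlatSaddrLoad::DWORDX2);
  assert(mergeWidths(2, 2) == FlatSaddrLoad::DWORDX4);
  assert(!mergeWidths(3, 3));
  return 0;
}

Note how optimizeInstsWithSameBaseAddr re-queues results narrower than four dwords (OptimizeListAgain |= CI.Width + Paired.Width < 4), so two DWORD_SADDR accesses can first merge into a DWORDX2_SADDR and later into a DWORDX4_SADDR.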
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index f8878f3..e97536d 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -57,6 +57,7 @@
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -76,10 +77,11 @@ private:
LiveIntervals *LIS = nullptr;
LiveVariables *LV = nullptr;
MachineDominatorTree *MDT = nullptr;
+ MachinePostDominatorTree *PDT = nullptr;
MachineRegisterInfo *MRI = nullptr;
SetVector<MachineInstr*> LoweredEndCf;
DenseSet<Register> LoweredIf;
- SmallSet<MachineBasicBlock *, 4> KillBlocks;
+ SmallPtrSet<MachineBasicBlock *, 4> KillBlocks;
SmallSet<Register, 8> RecomputeRegs;
const TargetRegisterClass *BoolRC = nullptr;
@@ -138,8 +140,8 @@ private:
public:
SILowerControlFlow(LiveIntervals *LIS, LiveVariables *LV,
- MachineDominatorTree *MDT)
- : LIS(LIS), LV(LV), MDT(MDT) {}
+ MachineDominatorTree *MDT, MachinePostDominatorTree *PDT)
+ : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT) {}
bool run(MachineFunction &MF);
};
@@ -159,6 +161,7 @@ public:
AU.addUsedIfAvailable<LiveIntervalsWrapperPass>();
// Should preserve the same set that TwoAddressInstructions does.
AU.addPreserved<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
AU.addPreserved<SlotIndexesWrapperPass>();
AU.addPreserved<LiveIntervalsWrapperPass>();
AU.addPreserved<LiveVariablesWrapperPass>();
@@ -457,7 +460,7 @@ MachineBasicBlock::iterator
SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
- SmallSet<const MachineBasicBlock *, 4> Visited;
+ SmallPtrSet<const MachineBasicBlock *, 4> Visited;
MachineBasicBlock *B = &MBB;
do {
if (!Visited.insert(B).second)
@@ -506,13 +509,18 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock *SplitBB = &MBB;
if (NeedBlockSplit) {
SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
- if (MDT && SplitBB != &MBB) {
- MachineDomTreeNode *MBBNode = (*MDT)[&MBB];
- SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(),
- MBBNode->end());
- MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB);
- for (MachineDomTreeNode *Child : Children)
- MDT->changeImmediateDominator(Child, SplitBBNode);
+ if (SplitBB != &MBB && (MDT || PDT)) {
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+ for (MachineBasicBlock *Succ : SplitBB->successors()) {
+ DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
+ }
+ DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
+ if (MDT)
+ MDT->applyUpdates(DTUpdates);
+ if (PDT)
+ PDT->applyUpdates(DTUpdates);
}
Opcode = OrTermrOpc;
InsPt = MI;
@@ -727,26 +735,27 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
MachineBasicBlock *Succ = *MBB.succ_begin();
MachineBasicBlock *FallThrough = nullptr;
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 8> DTUpdates;
+
while (!MBB.predecessors().empty()) {
MachineBasicBlock *P = *MBB.pred_begin();
if (P->getFallThrough(false) == &MBB)
FallThrough = P;
P->ReplaceUsesOfBlockWith(&MBB, Succ);
+ DTUpdates.push_back({DomTreeT::Insert, P, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, P, &MBB});
}
MBB.removeSuccessor(Succ);
if (LIS) {
for (auto &I : MBB.instrs())
LIS->RemoveMachineInstrFromMaps(I);
}
- if (MDT) {
- // If Succ, the single successor of MBB, is dominated by MBB, MDT needs
- // updating by changing Succ's idom to the one of MBB; otherwise, MBB must
- // be a leaf node in MDT and could be erased directly.
- if (MDT->dominates(&MBB, Succ))
- MDT->changeImmediateDominator(MDT->getNode(Succ),
- MDT->getNode(&MBB)->getIDom());
- MDT->eraseNode(&MBB);
- }
+ if (MDT)
+ MDT->applyUpdates(DTUpdates);
+ if (PDT)
+ PDT->applyUpdates(DTUpdates);
+
MBB.clear();
MBB.eraseFromParent();
if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
@@ -875,7 +884,11 @@ bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) {
LiveVariables *LV = LVWrapper ? &LVWrapper->getLV() : nullptr;
auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
- return SILowerControlFlow(LIS, LV, MDT).run(MF);
+ auto *PDTWrapper =
+ getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
+ MachinePostDominatorTree *PDT =
+ PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
+ return SILowerControlFlow(LIS, LV, MDT, PDT).run(MF);
}
PreservedAnalyses
@@ -885,13 +898,16 @@ SILowerControlFlowPass::run(MachineFunction &MF,
LiveVariables *LV = MFAM.getCachedResult<LiveVariablesAnalysis>(MF);
MachineDominatorTree *MDT =
MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
+ MachinePostDominatorTree *PDT =
+ MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
- bool Changed = SILowerControlFlow(LIS, LV, MDT).run(MF);
+ bool Changed = SILowerControlFlow(LIS, LV, MDT, PDT).run(MF);
if (!Changed)
return PreservedAnalyses::all();
auto PA = getMachineFunctionPassPreservedAnalyses();
PA.preserve<MachineDominatorTreeAnalysis>();
+ PA.preserve<MachinePostDominatorTreeAnalysis>();
PA.preserve<SlotIndexesAnalysis>();
PA.preserve<LiveIntervalsAnalysis>();
PA.preserve<LiveVariablesAnalysis>();
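The SILowerControlFlow hunks above stop re-parenting dominator-tree nodes by hand and instead collect CFG edge changes and apply them as one batch to both the dominator tree and the newly threaded-through post-dominator tree. Below is a minimal sketch of that pattern, assuming an LLVM build and using the same applyUpdates API the pass now calls; the helper name and the way the trees are obtained are illustrative.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachinePostDominators.h"

using namespace llvm;

// Redirect every predecessor of MBB to Succ, recording each edge change, then
// apply the whole batch to both trees (either may be null if unavailable).
static void redirectPredsAndUpdateTrees(MachineBasicBlock &MBB,
                                        MachineBasicBlock &Succ,
                                        MachineDominatorTree *MDT,
                                        MachinePostDominatorTree *PDT) {
  using DomTreeT = DomTreeBase<MachineBasicBlock>;
  SmallVector<DomTreeT::UpdateType, 8> Updates;
  while (!MBB.pred_empty()) {
    MachineBasicBlock *Pred = *MBB.pred_begin();
    Pred->ReplaceUsesOfBlockWith(&MBB, &Succ); // also updates pred/succ lists
    Updates.push_back({DomTreeT::Insert, Pred, &Succ});
    Updates.push_back({DomTreeT::Delete, Pred, &MBB});
  }
  if (MDT)
    MDT->applyUpdates(Updates);
  if (PDT)
    PDT->applyUpdates(Updates);
}

Batching the Insert/Delete pairs and letting applyUpdates legalize them against the current CFG keeps both trees consistent without reasoning about immediate dominators case by case, which is what the deleted changeImmediateDominator logic used to do.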
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 9509199..09b737c 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -209,10 +209,13 @@ void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) {
// So set the save points for those.
// Use the points found by shrink-wrapping, if any.
- if (MFI.getSavePoint()) {
- SaveBlocks.push_back(MFI.getSavePoint());
- assert(MFI.getRestorePoint() && "Both restore and save must be set");
- MachineBasicBlock *RestoreBlock = MFI.getRestorePoint();
+ if (!MFI.getSavePoints().empty()) {
+ assert(MFI.getSavePoints().size() == 1 &&
+ "Multiple save points not yet supported!");
+ SaveBlocks.push_back(MFI.getSavePoints().front());
+ assert(MFI.getRestorePoints().size() == 1 &&
+ "Multiple restore points not yet supported!");
+ MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front();
// If RestoreBlock does not have any successor and is not a return block
// then the end point is unreachable and we do not need to insert any
// epilogue.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 9a1448f..8a11203 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -33,7 +33,7 @@ using namespace llvm;
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
// where it is better to produce the VGPR form (e.g. if there are VGPR users
// of the MFMA result).
-cl::opt<bool> MFMAVGPRForm(
+static cl::opt<bool> MFMAVGPRForm(
"amdgpu-mfma-vgpr-form", cl::Hidden,
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
"unspecified, default to compiler heuristics"),
@@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
+ NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
+ NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
Occupancy(MFI.getOccupancy()),
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
@@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
WaveLimiter = YamlMFI.WaveLimiter;
HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
+ NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
+ NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
ReturnsVoid = YamlMFI.ReturnsVoid;
IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 08b0206..ca8f803 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool WaveLimiter = false;
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
+ uint16_t NumWaveDispatchSGPRs = 0;
+ uint16_t NumWaveDispatchVGPRs = 0;
uint32_t HighBitsOf32BitAddress = 0;
// TODO: 10 may be a better default since it's the maximum.
@@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false);
YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false);
+ YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false);
+ YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false);
YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
StringValue("$private_rsrc_reg"));
YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
@@ -465,6 +469,9 @@ private:
unsigned NumUserSGPRs = 0;
unsigned NumSystemSGPRs = 0;
+ unsigned NumWaveDispatchSGPRs = 0;
+ unsigned NumWaveDispatchVGPRs = 0;
+
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
bool HasNonSpillStackObjects = false;
@@ -991,6 +998,14 @@ public:
return UserSGPRInfo.getNumKernargPreloadSGPRs();
}
+ unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; }
+
+ void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; }
+
+ unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; }
+
+ void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; }
+
Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 205a45a..38d9a4b 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -130,6 +130,9 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
if (VirtReg.isPhysical())
continue;
+ if (!VirtReg.isValid())
+ continue;
+
if (!VRM->hasPhys(VirtReg))
continue;
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 5940f45..93ba0a3 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -73,6 +73,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
NumSGPRsForWavesPerEU = ZeroExpr;
NumVGPRsForWavesPerEU = ZeroExpr;
+ NamedBarCnt = ZeroExpr;
Occupancy = ZeroExpr;
DynamicCallStack = ZeroExpr;
VCCUsed = ZeroExpr;
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 79099d2..171c4a3 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -83,6 +83,9 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
// Number of VGPRs that meets number of waves per execution unit request.
const MCExpr *NumVGPRsForWavesPerEU = nullptr;
+ // Number of named barriers used by the kernel.
+ const MCExpr *NamedBarCnt = nullptr;
+
// Final occupancy.
const MCExpr *Occupancy = nullptr;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 81655f5..0293d40 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1166,7 +1166,8 @@ class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName>
}
//===----------------------------------------------------------------------===//
-// SSrc_* Operands with an SGPR or a 32-bit immediate
+// SSrc_* Operands with an SGPR, a 32-bit immediate, or 64-bit immediate
+// if supported by target.
//===----------------------------------------------------------------------===//
class SrcRegOrImm9<RegisterClass regClass, string operandType>
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 431d73b..a003a46 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -484,6 +484,24 @@ def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (o
let isConvergent = 1;
}
+def S_BARRIER_INIT_M0 : SOP1_Pseudo <"s_barrier_init m0", (outs), (ins),
+ "", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_INIT_IMM : SOP1_Pseudo <"s_barrier_init", (outs),
+ (ins SplitBarrier:$src0), "$src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins),
+ "", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
} // End Uses = [M0]
def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
@@ -501,6 +519,12 @@ def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (out
let isConvergent = 1;
}
+def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs),
+ (ins SplitBarrier:$src0), "$src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
} // End has_sdst = 0
def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst),
@@ -1588,6 +1612,17 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm
let isConvergent = 1;
}
+def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> {
+ let SchedRW = [WriteBarrier];
+ let simm16 = 0;
+ let fixed_imm = 1;
+ let isConvergent = 1;
+ let Defs = [SCC];
+}
+
+def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave",
+ (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>;
+
def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
let SubtargetPredicate = isGFX8Plus;
let simm16 = 0;
@@ -1630,7 +1665,9 @@ def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> {
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
-def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
+def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> {
+ let SubtargetPredicate = isNotGFX1250Plus;
+}
// On SI the documentation says sleep for approximately 64 * low 2
// bits, consistent with the reported maximum of 448. On VI the
@@ -2144,9 +2181,13 @@ defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>;
defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>;
defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>;
defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>;
+defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>;
+defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>;
defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12<0x04e>;
defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>;
defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>;
+defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12<0x051>;
+defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12<0x052>;
defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>;
defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>;
@@ -2639,6 +2680,7 @@ multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> {
}
defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>;
+defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>;
defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12<0x040>;
defm S_WAIT_STORECNT : SOPP_Real_32_gfx12<0x041>;
defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12<0x042>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 3d9455f..c740b5e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -106,7 +106,7 @@ static constexpr CustomOperand MsgOperands[] = {
{{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10},
{{"MSG_HS_TESSFACTOR"}, ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus},
{{"MSG_DEALLOC_VGPRS"}, ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus},
- {{""}},
+ {{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250},
{{"MSG_SYSMSG"}, ID_SYSMSG},
{{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus},
{{"MSG_RTN_GET_DDID"}, ID_RTN_GET_DDID, isGFX11Plus},
@@ -195,7 +195,7 @@ static constexpr CustomOperand Operands[] = {
{{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10},
{{""}},
{{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11},
- {{""}},
+ {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250},
{{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11},
{{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus},
{{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus},
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 1e3e9a2..6e4e087 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -1160,17 +1161,28 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
return 65536;
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
return 163840;
+ if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize327680))
+ return 327680;
return 0;
}
unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
// "Per CU" really means "per whatever functional block the waves of a
- // workgroup must share". For gfx10 in CU mode this is the CU, which contains
+ // workgroup must share".
+
+ // GFX12.5 only supports CU mode, which contains four SIMDs.
+ if (isGFX1250(*STI)) {
+ assert(STI->getFeatureBits().test(FeatureCuMode));
+ return 4;
+ }
+
+ // For gfx10 in CU mode the functional block is the CU, which contains
// two SIMDs.
if (isGFX10Plus(*STI) && STI->getFeatureBits().test(FeatureCuMode))
return 2;
- // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains
- // two CUs, so a total of four SIMDs.
+
+ // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP
+ // contains two CUs, so a total of four SIMDs.
return 4;
}
@@ -1666,6 +1678,29 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size) {
return Vals;
}
+bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) {
+ assert((MD.getNumOperands() % 2 == 0) && "invalid number of operands!");
+ for (unsigned I = 0, E = MD.getNumOperands() / 2; I != E; ++I) {
+ auto Low =
+ mdconst::extract<ConstantInt>(MD.getOperand(2 * I + 0))->getValue();
+ auto High =
+ mdconst::extract<ConstantInt>(MD.getOperand(2 * I + 1))->getValue();
+ // There are two types of [A; B) ranges:
+ // A < B, e.g. [4; 5) which is a range that only includes 4.
+ // A > B, e.g. [5; 4) which is a range that wraps around and includes
+ // everything except 4.
+ if (Low.ult(High)) {
+ if (Low.ule(Val) && High.ugt(Val))
+ return true;
+ } else {
+ if (Low.uge(Val) && High.ult(Val))
+ return true;
+ }
+ }
+
+ return false;
+}
+
unsigned getVmcntBitMask(const IsaVersion &Version) {
return (1 << (getVmcntBitWidthLo(Version.Major) +
getVmcntBitWidthHi(Version.Major))) -
@@ -2406,7 +2441,11 @@ unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler) {
return 0;
}
-unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { return 16; }
+unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) {
+ if (isGFX1250(STI))
+ return 32;
+ return 16;
+}
bool isSI(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureSouthernIslands);
@@ -2478,6 +2517,12 @@ bool isGFX1250(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts];
}
+bool supportsWGP(const MCSubtargetInfo &STI) {
+ if (isGFX1250(STI))
+ return false;
+ return isGFX10Plus(STI);
+}
+
bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); }
bool isNotGFX10Plus(const MCSubtargetInfo &STI) {
@@ -3309,13 +3354,39 @@ bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) {
return false;
}
-bool isDPALU_DPP(const MCInstrDesc &OpDesc) {
+bool isDPALU_DPP32BitOpc(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::V_MUL_LO_U32_e64:
+ case AMDGPU::V_MUL_LO_U32_e64_dpp:
+ case AMDGPU::V_MUL_LO_U32_e64_dpp_gfx1250:
+ case AMDGPU::V_MUL_HI_U32_e64:
+ case AMDGPU::V_MUL_HI_U32_e64_dpp:
+ case AMDGPU::V_MUL_HI_U32_e64_dpp_gfx1250:
+ case AMDGPU::V_MUL_HI_I32_e64:
+ case AMDGPU::V_MUL_HI_I32_e64_dpp:
+ case AMDGPU::V_MUL_HI_I32_e64_dpp_gfx1250:
+ case AMDGPU::V_MAD_U32_e64:
+ case AMDGPU::V_MAD_U32_e64_dpp:
+ case AMDGPU::V_MAD_U32_e64_dpp_gfx1250:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) {
+ if (!ST.hasFeature(AMDGPU::FeatureDPALU_DPP))
+ return false;
+
+ if (isDPALU_DPP32BitOpc(OpDesc.getOpcode()))
+ return ST.hasFeature(AMDGPU::FeatureGFX1250Insts);
+
return hasAny64BitVGPROperands(OpDesc);
}
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
- // Currently this is 128 for all subtargets
- return 128;
+ return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256
+ : 128;
}
bool isPackedFP32Inst(unsigned Opc) {
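hasValueInRangeLikeMetadata above walks pairs of ConstantInt operands that encode half-open [Lo, Hi) intervals, where Lo > Hi denotes a wrapping interval. The snippet below is a plain-integer illustration of that convention as described in the comment; it is a self-contained sketch, not the LLVM helper itself.

#include <cassert>
#include <cstdint>

// Membership in one possibly wrapping [Lo, Hi) interval, unsigned compare:
//   Lo < Hi : ordinary interval, e.g. [4, 5) contains only 4.
//   Lo > Hi : wrapping interval, e.g. [5, 4) contains everything except 4.
static bool inHalfOpenRange(uint64_t Lo, uint64_t Hi, uint64_t Val) {
  if (Lo < Hi)
    return Lo <= Val && Val < Hi;
  return Val >= Lo || Val < Hi; // wraps through the maximum value
}

int main() {
  assert(inHalfOpenRange(4, 5, 4));  // [4, 5) contains 4
  assert(!inHalfOpenRange(4, 5, 5)); // ...and nothing else
  assert(!inHalfOpenRange(5, 4, 4)); // [5, 4) excludes exactly 4
  assert(inHalfOpenRange(5, 4, 0));  // but wraps around to include 0
  return 0;
}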
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1bcd36c..70dfb63 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -35,6 +35,7 @@ class MCInstrInfo;
class MCRegisterClass;
class MCRegisterInfo;
class MCSubtargetInfo;
+class MDNode;
class StringRef;
class Triple;
class raw_ostream;
@@ -1064,6 +1065,9 @@ SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
std::optional<SmallVector<unsigned>>
getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
+/// Checks if \p Val is inside \p MD, a !range-like metadata.
+bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
+
/// Represents the counter values to wait for in an s_waitcnt instruction.
///
/// Large values (including the maximum possible integer) can be used to
@@ -1549,6 +1553,7 @@ bool isGFX11Plus(const MCSubtargetInfo &STI);
bool isGFX12(const MCSubtargetInfo &STI);
bool isGFX12Plus(const MCSubtargetInfo &STI);
bool isGFX1250(const MCSubtargetInfo &STI);
+bool supportsWGP(const MCSubtargetInfo &STI);
bool isNotGFX12Plus(const MCSubtargetInfo &STI);
bool isNotGFX11Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
@@ -1750,15 +1755,22 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST);
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
LLVM_READNONE
-inline bool isLegalDPALU_DPPControl(unsigned DC) {
- return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
+inline bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC) {
+ if (isGFX12(ST))
+ return DC >= DPP::ROW_SHARE_FIRST && DC <= DPP::ROW_SHARE_LAST;
+ if (isGFX90A(ST))
+ return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
+ return false;
}
/// \returns true if an instruction may have a 64-bit VGPR operand.
bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc);
+/// \returns true if an instruction is a DP ALU DPP without any 64-bit operands.
+bool isDPALU_DPP32BitOpc(unsigned Opc);
+
/// \returns true if an instruction is a DP ALU DPP.
-bool isDPALU_DPP(const MCInstrDesc &OpDesc);
+bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST);
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index b128207..11c7275 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -706,7 +706,6 @@ def V_CVT_F16_F8_Fake16_Profile : VOP3_Profile_Fake16<V_CVT_F16_F8_Profile>;
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
- // FIXME: This differs from downstream due to changes that haven't been upstreamed yet.
let SubtargetPredicate = isGFX12PlusNot12_50 in
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
let SubtargetPredicate = isGFX125xOnly in
@@ -731,7 +730,6 @@ class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst, bit HasOpSe
>;
let OtherPredicates = [HasFP8ConversionInsts] in {
- // FIXME: This differs from downstream due to changes that haven't been upstreamed yet.
let SubtargetPredicate = isGFX12PlusNot12_50 in
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>;
let SubtargetPredicate = isGFX125xOnly in {
@@ -740,7 +738,6 @@ let OtherPredicates = [HasFP8ConversionInsts] in {
def : GCNPat<(int_amdgcn_cvt_f32_fp8_e5m3 i32:$src0, timm:$byte_sel),
(V_CVT_F32_FP8_gfx1250_e64 $src0, DSTCLAMP.ENABLE, (as_i32timm $byte_sel))>;
}
- // FIXME: This differs from downstream due to changes that haven't been upstreamed yet.
let SubtargetPredicate = isGFX12Plus in
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>;
}
@@ -1058,11 +1055,6 @@ multiclass VOP1Only_Real_gfx11_gfx12<bits<9> op> :
multiclass VOP1_Real_FULL_gfx11_gfx12<bits<9> op> :
VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>;
-multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op,
- string opName, string asmName> :
- VOP1_Real_e32_with_name<Gen, op, opName, asmName>,
- VOP3_Real_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>;
-
multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250<
bits<9> op, string asmName = !tolower(NAME), string opName = NAME> {
defm opName#"_t16" :
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index f4b6af6..329d003 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -2084,6 +2084,9 @@ multiclass VOP3_Realtriple_gfx11_gfx12<bits<10> op> :
multiclass VOP3_Real_Base_gfx11_gfx12<bits<10> op> :
VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Gen, op>;
+multiclass VOP3_Real_Base_gfx11_gfx12_not_gfx1250<bits<10> op> :
+ VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Not12_50Gen, op>;
+
multiclass VOP3_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
string asmName> :
VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
@@ -2211,9 +2214,9 @@ defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>;
defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>;
defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>;
defm V_LDEXP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32b>;
-defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12<0x32c>;
-defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12<0x32d>;
-defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12<0x32e>;
+defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32c>;
+defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32d>;
+defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32e>;
defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32f>;
defm V_LSHLREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x338, "v_lshlrev_b16">;
defm V_LSHRREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x339, "v_lshrrev_b16">;
@@ -2242,6 +2245,10 @@ let AssemblerPredicate = isGFX11Plus in {
}
// These instructions differ from GFX12 variant by supporting DPP:
+defm V_MUL_LO_U32 : VOP3Only_Realtriple_gfx1250<0x32c>;
+defm V_MUL_HI_U32 : VOP3Only_Realtriple_gfx1250<0x32d>;
+defm V_MUL_HI_I32 : VOP3Only_Realtriple_gfx1250<0x32e>;
+
defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>;
defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>;
defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>;
diff --git a/llvm/lib/Target/ARM/ARMCallingConv.h b/llvm/lib/Target/ARM/ARMCallingConv.h
index 7c692f0..b6b2d59 100644
--- a/llvm/lib/Target/ARM/ARMCallingConv.h
+++ b/llvm/lib/Target/ARM/ARMCallingConv.h
@@ -19,34 +19,35 @@ namespace llvm {
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
} // namespace llvm
diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index ef69083..c53e215 100644
--- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -109,7 +109,7 @@ namespace {
/// NewWaterList - The subset of WaterList that was created since the
/// previous iteration by inserting unconditional branches.
- SmallSet<MachineBasicBlock*, 4> NewWaterList;
+ SmallPtrSet<MachineBasicBlock *, 4> NewWaterList;
using water_iterator = std::vector<MachineBasicBlock *>::iterator;
diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp
index 7ba2487..14e1160 100644
--- a/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -1943,8 +1943,11 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
unsigned &NumBytes,
bool isVarArg) {
SmallVector<CCValAssign, 16> ArgLocs;
+ SmallVector<Type *, 16> OrigTys;
+ for (Value *Arg : Args)
+ OrigTys.push_back(Arg->getType());
CCState CCInfo(CC, isVarArg, *FuncInfo.MF, ArgLocs, *Context);
- CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags,
+ CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, OrigTys,
CCAssignFnForCall(CC, false, isVarArg));
// Check that we can handle all of the arguments. If we can't, then bail out
@@ -2093,7 +2096,8 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs,
if (RetVT != MVT::isVoid) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context);
- CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
+ CCInfo.AnalyzeCallResult(RetVT, I->getType(),
+ CCAssignFnForCall(CC, true, isVarArg));
// Copy all of the result registers out of their specified physreg.
if (RVLocs.size() == 2 && RetVT == MVT::f64) {
@@ -2278,7 +2282,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
if (RetVT != MVT::isVoid && RetVT != MVT::i32) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
- CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, false));
+ CCInfo.AnalyzeCallResult(RetVT, RetTy, CCAssignFnForCall(CC, true, false));
if (RVLocs.size() >= 2 && RetVT != MVT::f64)
return false;
}
@@ -2389,7 +2393,8 @@ bool ARMFastISel::SelectCall(const Instruction *I,
RetVT != MVT::i16 && RetVT != MVT::i32) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context);
- CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
+ CCInfo.AnalyzeCallResult(RetVT, RetTy,
+ CCAssignFnForCall(CC, true, isVarArg));
if (RVLocs.size() >= 2 && RetVT != MVT::f64)
return false;
}
@@ -2499,6 +2504,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// Set all unused physreg defs as dead.
static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+ diagnoseDontCall(*CI);
return true;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index ea99cc4..8301563 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -587,167 +587,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
}
}
- // RTLIB
- if (TM.isAAPCS_ABI() && (TT.isTargetAEABI() || TT.isTargetGNUAEABI() ||
- TT.isTargetMuslAEABI() || TT.isAndroid())) {
- // FIXME: This does not depend on the subtarget and should go directly into
- // RuntimeLibcalls. This is only here because of missing support for setting
- // the calling convention of an implementation.
- // clang-format off
- static const struct {
- const RTLIB::Libcall Op;
- const RTLIB::LibcallImpl Impl;
- } LibraryCalls[] = {
- // Double-precision floating-point arithmetic helper functions
- // RTABI chapter 4.1.2, Table 2
- { RTLIB::ADD_F64, RTLIB::__aeabi_dadd },
- { RTLIB::DIV_F64, RTLIB::__aeabi_ddiv },
- { RTLIB::MUL_F64, RTLIB::__aeabi_dmul },
- { RTLIB::SUB_F64, RTLIB::__aeabi_dsub },
-
- // Double-precision floating-point comparison helper functions
- // RTABI chapter 4.1.2, Table 3
- { RTLIB::OEQ_F64, RTLIB::__aeabi_dcmpeq__oeq },
- { RTLIB::UNE_F64, RTLIB::__aeabi_dcmpeq__une },
- { RTLIB::OLT_F64, RTLIB::__aeabi_dcmplt },
- { RTLIB::OLE_F64, RTLIB::__aeabi_dcmple },
- { RTLIB::OGE_F64, RTLIB::__aeabi_dcmpge },
- { RTLIB::OGT_F64, RTLIB::__aeabi_dcmpgt },
- { RTLIB::UO_F64, RTLIB::__aeabi_dcmpun },
-
- // Single-precision floating-point arithmetic helper functions
- // RTABI chapter 4.1.2, Table 4
- { RTLIB::ADD_F32, RTLIB::__aeabi_fadd },
- { RTLIB::DIV_F32, RTLIB::__aeabi_fdiv },
- { RTLIB::MUL_F32, RTLIB::__aeabi_fmul },
- { RTLIB::SUB_F32, RTLIB::__aeabi_fsub },
-
- // Single-precision floating-point comparison helper functions
- // RTABI chapter 4.1.2, Table 5
- { RTLIB::OEQ_F32, RTLIB::__aeabi_fcmpeq__oeq },
- { RTLIB::UNE_F32, RTLIB::__aeabi_fcmpeq__une },
- { RTLIB::OLT_F32, RTLIB::__aeabi_fcmplt},
- { RTLIB::OLE_F32, RTLIB::__aeabi_fcmple },
- { RTLIB::OGE_F32, RTLIB::__aeabi_fcmpge },
- { RTLIB::OGT_F32, RTLIB::__aeabi_fcmpgt },
- { RTLIB::UO_F32, RTLIB::__aeabi_fcmpun },
-
- // Floating-point to integer conversions.
- // RTABI chapter 4.1.2, Table 6
- { RTLIB::FPTOSINT_F64_I32, RTLIB::__aeabi_d2iz },
- { RTLIB::FPTOUINT_F64_I32, RTLIB::__aeabi_d2uiz },
- { RTLIB::FPTOSINT_F64_I64, RTLIB::__aeabi_d2lz },
- { RTLIB::FPTOUINT_F64_I64, RTLIB::__aeabi_d2ulz },
- { RTLIB::FPTOSINT_F32_I32, RTLIB::__aeabi_f2iz },
- { RTLIB::FPTOUINT_F32_I32, RTLIB::__aeabi_f2uiz },
- { RTLIB::FPTOSINT_F32_I64, RTLIB::__aeabi_f2lz },
- { RTLIB::FPTOUINT_F32_I64, RTLIB::__aeabi_f2ulz },
-
- // Conversions between floating types.
- // RTABI chapter 4.1.2, Table 7
- { RTLIB::FPROUND_F64_F32, RTLIB::__aeabi_d2f },
- { RTLIB::FPROUND_F64_F16, RTLIB::__aeabi_d2h },
- { RTLIB::FPEXT_F32_F64, RTLIB::__aeabi_f2d },
-
- // Integer to floating-point conversions.
- // RTABI chapter 4.1.2, Table 8
- { RTLIB::SINTTOFP_I32_F64, RTLIB::__aeabi_i2d },
- { RTLIB::UINTTOFP_I32_F64, RTLIB::__aeabi_ui2d },
- { RTLIB::SINTTOFP_I64_F64, RTLIB::__aeabi_l2d },
- { RTLIB::UINTTOFP_I64_F64, RTLIB::__aeabi_ul2d },
- { RTLIB::SINTTOFP_I32_F32, RTLIB::__aeabi_i2f },
- { RTLIB::UINTTOFP_I32_F32, RTLIB::__aeabi_ui2f },
- { RTLIB::SINTTOFP_I64_F32, RTLIB::__aeabi_l2f },
- { RTLIB::UINTTOFP_I64_F32, RTLIB::__aeabi_ul2f },
-
- // Long long helper functions
- // RTABI chapter 4.2, Table 9
- { RTLIB::MUL_I64, RTLIB::__aeabi_lmul },
- { RTLIB::SHL_I64, RTLIB::__aeabi_llsl },
- { RTLIB::SRL_I64, RTLIB::__aeabi_llsr },
- { RTLIB::SRA_I64, RTLIB::__aeabi_lasr },
-
- // Integer division functions
- // RTABI chapter 4.3.1
- { RTLIB::SDIV_I32, RTLIB::__aeabi_idiv },
- { RTLIB::SDIV_I64, RTLIB::__aeabi_ldivmod },
- { RTLIB::UDIV_I32, RTLIB::__aeabi_uidiv },
- { RTLIB::UDIV_I64, RTLIB::__aeabi_uldivmod },
- };
- // clang-format on
-
- for (const auto &LC : LibraryCalls)
- setLibcallImpl(LC.Op, LC.Impl);
-
- // EABI dependent RTLIB
- if (TM.Options.EABIVersion == EABI::EABI4 ||
- TM.Options.EABIVersion == EABI::EABI5) {
- static const struct {
- const RTLIB::Libcall Op;
- const RTLIB::LibcallImpl Impl;
- } MemOpsLibraryCalls[] = {
- // Memory operations
- // RTABI chapter 4.3.4
- {RTLIB::MEMCPY, RTLIB::__aeabi_memcpy},
- {RTLIB::MEMMOVE, RTLIB::__aeabi_memmove},
- {RTLIB::MEMSET, RTLIB::__aeabi_memset},
- {RTLIB::AEABI_MEMCPY4, RTLIB::__aeabi_memcpy4},
- {RTLIB::AEABI_MEMCPY8, RTLIB::__aeabi_memcpy8},
- {RTLIB::AEABI_MEMMOVE4, RTLIB::__aeabi_memmove4},
- {RTLIB::AEABI_MEMMOVE8, RTLIB::__aeabi_memmove8},
- {RTLIB::AEABI_MEMSET4, RTLIB::__aeabi_memset4},
- {RTLIB::AEABI_MEMSET8, RTLIB::__aeabi_memset8},
- {RTLIB::AEABI_MEMCLR, RTLIB::__aeabi_memclr},
- {RTLIB::AEABI_MEMCLR4, RTLIB::__aeabi_memclr4},
- {RTLIB::AEABI_MEMCLR8, RTLIB::__aeabi_memclr8},
- };
-
- for (const auto &LC : MemOpsLibraryCalls)
- setLibcallImpl(LC.Op, LC.Impl);
- }
- }
-
- // The half <-> float conversion functions are always soft-float on
- // non-watchos platforms, but are needed for some targets which use a
- // hard-float calling convention by default.
- if (!TT.isWatchABI()) {
- if (TM.isAAPCS_ABI()) {
- setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_AAPCS);
- setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_AAPCS);
- setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_AAPCS);
- setLibcallImplCallingConv(RTLIB::__gnu_h2f_ieee, CallingConv::ARM_AAPCS);
- setLibcallImplCallingConv(RTLIB::__gnu_f2h_ieee, CallingConv::ARM_AAPCS);
- } else {
- setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_APCS);
- setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_APCS);
- setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_APCS);
- setLibcallImplCallingConv(RTLIB::__gnu_h2f_ieee, CallingConv::ARM_APCS);
- setLibcallImplCallingConv(RTLIB::__gnu_f2h_ieee, CallingConv::ARM_APCS);
- }
- }
-
- // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
- // a __gnu_ prefix (which is the default).
- if (TT.isTargetAEABI()) {
- // FIXME: This does not depend on the subtarget and should go directly into
- // RuntimeLibcalls. This is only here because of missing support for setting
- // the calling convention of an implementation.
- static const struct {
- const RTLIB::Libcall Op;
- const RTLIB::LibcallImpl Impl;
- } LibraryCalls[] = {
- {RTLIB::FPROUND_F32_F16, RTLIB::__aeabi_f2h},
- {RTLIB::FPEXT_F16_F32, RTLIB::__aeabi_h2f},
- };
-
- for (const auto &LC : LibraryCalls) {
- setLibcallImpl(LC.Op, LC.Impl);
- }
- } else if (!TT.isOSBinFormatMachO()) {
- setLibcallImpl(RTLIB::FPROUND_F32_F16, RTLIB::__gnu_f2h_ieee);
- setLibcallImpl(RTLIB::FPEXT_F16_F32, RTLIB::__gnu_h2f_ieee);
- }
-
if (Subtarget->isThumb1Only())
addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
else
@@ -802,6 +641,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::BSWAP, VT, Expand);
}
+ if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
+ setOperationAction(ISD::SCMP, MVT::i32, Custom);
+
+ if (!Subtarget->hasV8_1MMainlineOps())
+ setOperationAction(ISD::UCMP, MVT::i32, Custom);
+
setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
@@ -1634,6 +1479,10 @@ bool ARMTargetLowering::useSoftFloat() const {
return Subtarget->useSoftFloat();
}
+bool ARMTargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
+ return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
+}
+
// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
@@ -3769,10 +3618,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
// call __tls_get_addr.
ArgListTy Args;
- ArgListEntry Entry;
- Entry.Node = Argument;
- Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
- Args.push_back(Entry);
+ Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
// FIXME: is there useful debug info available here?
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -7396,7 +7242,7 @@ static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
return false;
unsigned NumElts = VT.getVectorNumElements();
- if (M.size() != NumElts && M.size() != NumElts*2)
+ if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
return false;
// If the mask is twice as long as the input vector then we need to check the
@@ -7428,7 +7274,7 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
return false;
unsigned NumElts = VT.getVectorNumElements();
- if (M.size() != NumElts && M.size() != NumElts*2)
+ if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
return false;
for (unsigned i = 0; i < M.size(); i += NumElts) {
@@ -7531,7 +7377,7 @@ static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
return false;
unsigned NumElts = VT.getVectorNumElements();
- if (M.size() != NumElts && M.size() != NumElts*2)
+ if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
return false;
for (unsigned i = 0; i < M.size(); i += NumElts) {
@@ -7564,7 +7410,7 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
return false;
unsigned NumElts = VT.getVectorNumElements();
- if (M.size() != NumElts && M.size() != NumElts*2)
+ if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
return false;
for (unsigned i = 0; i < M.size(); i += NumElts) {
@@ -9991,9 +9837,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
- ArgListEntry Entry;
- Entry.Node = SRet;
- Entry.Ty = PointerType::getUnqual(RetTy->getContext());
+ ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext()));
Entry.IsSExt = false;
Entry.IsZExt = false;
Entry.IsSRet = true;
@@ -10001,12 +9845,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
RetTy = Type::getVoidTy(*DAG.getContext());
}
- ArgListEntry Entry;
- Entry.Node = Arg;
- Entry.Ty = ArgTy;
- Entry.IsSExt = false;
- Entry.IsZExt = false;
- Args.push_back(Entry);
+ Args.emplace_back(Arg, ArgTy);
RTLIB::Libcall LC =
(ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
@@ -10059,10 +9898,9 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
ARMTargetLowering::ArgListTy Args;
for (auto AI : {1, 0}) {
- ArgListEntry Arg;
- Arg.Node = Op.getOperand(AI);
- Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
- Args.push_back(Arg);
+ SDValue Operand = Op.getOperand(AI);
+ Args.emplace_back(Operand,
+ Operand.getValueType().getTypeForEVT(*DAG.getContext()));
}
CallLoweringInfo CLI(DAG);
@@ -10612,6 +10450,133 @@ SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
return DAG.getBitcast(MVT::i32, Res);
}
+SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
+ // Determine if this is signed or unsigned comparison
+ bool IsSigned = (Op.getOpcode() == ISD::SCMP);
+
+ // Special case for Thumb1 UCMP only
+ if (!IsSigned && Subtarget->isThumb1Only()) {
+ // For Thumb unsigned comparison, use this sequence:
+ // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
+ // sbc r2, r2 ; r2 = r2 - r2 - !carry
+ // cmp r1, r0 ; compare RHS with LHS
+ // sbc r1, r1 ; r1 = r1 - r1 - !carry
+ // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
+
+ // First subtraction: LHS - RHS
+ SDValue Sub1WithFlags = DAG.getNode(
+ ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
+ SDValue Sub1Result = Sub1WithFlags.getValue(0);
+ SDValue Flags1 = Sub1WithFlags.getValue(1);
+
+ // SUBE: Sub1Result - Sub1Result - !carry
+ // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
+ SDValue Sbc1 =
+ DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
+ Sub1Result, Sub1Result, Flags1);
+ SDValue Sbc1Result = Sbc1.getValue(0);
+
+ // Second comparison: RHS vs LHS (reverse comparison)
+ SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
+
+ // SUBE: RHS - RHS - !carry
+ // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
+ SDValue Sbc2 = DAG.getNode(
+ ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
+ SDValue Sbc2Result = Sbc2.getValue(0);
+
+ // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
+ SDValue Result =
+ DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
+ if (Op.getValueType() != MVT::i32)
+ Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
+
+ return Result;
+ }
+
+ // For the ARM assembly pattern:
+ //   subs  r0, r0, r1   ; subtract RHS from LHS and set flags
+ //   movgt r0, #1       ; if LHS > RHS, set result to 1 (GT for signed, HI for unsigned)
+ //   mvnlt r0, #0       ; if LHS < RHS, set result to -1 (LT for signed, LO for unsigned)
+ //                      ; if LHS == RHS, result remains 0 from the subs
+
+ // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
+ unsigned Opcode = ARMISD::SUBC;
+
+ // Check if RHS is a subtraction against 0: (0 - X)
+ if (RHS.getOpcode() == ISD::SUB) {
+ SDValue SubLHS = RHS.getOperand(0);
+ SDValue SubRHS = RHS.getOperand(1);
+
+ // Check if it's 0 - X
+ if (isNullConstant(SubLHS)) {
+ bool CanUseAdd = false;
+ if (IsSigned) {
+ // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
+ if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
+ .getSignedMinValue()
+ .isMinSignedValue()) {
+ CanUseAdd = true;
+ }
+ } else {
+ // For UCMP: only if X is known to never be zero
+ if (DAG.isKnownNeverZero(SubRHS)) {
+ CanUseAdd = true;
+ }
+ }
+
+ if (CanUseAdd) {
+ Opcode = ARMISD::ADDC;
+ RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
+ // LHS - (0 - X)
+ }
+ }
+ }
+
+ // Generate the operation with flags
+ SDValue OpWithFlags;
+ if (Opcode == ARMISD::ADDC) {
+ // Use ADDC: LHS + RHS (where RHS was 0 - X, now X)
+ OpWithFlags = DAG.getNode(ARMISD::ADDC, dl,
+ DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
+ } else {
+ // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags)
+ OpWithFlags = DAG.getNode(ARMISD::SUBC, dl,
+ DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
+ }
+
+ SDValue OpResult = OpWithFlags.getValue(0); // The operation result
+ SDValue Flags = OpWithFlags.getValue(1); // The flags
+
+ // Constants for conditional moves
+ SDValue One = DAG.getConstant(1, dl, MVT::i32);
+ SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
+
+ // Select condition codes based on signed vs unsigned
+ ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
+ ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
+
+ // First conditional move: if greater than, set to 1
+ SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
+ SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
+ GTCondValue, Flags);
+
+ // Second conditional move: if less than, set to -1
+ SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
+ SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
+ LTCondValue, Flags);
+
+ if (Op.getValueType() != MVT::i32)
+ Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
+
+ return Result2;
+}
+
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
switch (Op.getOpcode()) {
@@ -10740,6 +10705,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_TO_BF16:
return LowerFP_TO_BF16(Op, DAG);
case ARMISD::WIN__DBZCHK: return SDValue();
+ case ISD::UCMP:
+ case ISD::SCMP:
+ return LowerCMP(Op, DAG);
}
}
@@ -20627,12 +20595,10 @@ static TargetLowering::ArgListTy getDivRemArgList(
bool isSigned = N->getOpcode() == ISD::SDIVREM ||
N->getOpcode() == ISD::SREM;
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
EVT ArgVT = N->getOperand(i).getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*Context);
- Entry.Node = N->getOperand(i);
- Entry.Ty = ArgTy;
+ TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
Entry.IsSExt = isSigned;
Entry.IsZExt = !isSigned;
Args.push_back(Entry);
@@ -21605,7 +21571,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices, unsigned Factor) const {
+ ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
@@ -21615,7 +21581,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
auto *LI = dyn_cast<LoadInst>(Load);
if (!LI)
return false;
- assert(!Mask && "Unexpected mask on a load");
+ assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
Type *EltTy = VecTy->getElementType();
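LowerCMP above lowers ISD::SCMP and ISD::UCMP to a flag-setting subtract followed by two conditional moves, with a borrow-based sequence for Thumb1 unsigned compares. The snippet below is a scalar model of the values those sequences produce, not the DAG code itself; it exists only to make the -1/0/+1 semantics and the sbc trick concrete.

#include <cassert>
#include <cstdint>

// Three-way compare: -1 if L < R, 0 if equal, +1 if L > R.
static int32_t scmpModel(int32_t L, int32_t R) {
  return (L > R) - (L < R); // the "subs; movgt #1; mvnlt #0" pattern
}

// Thumb1 unsigned variant: each "sbc rN, rN" materializes 0 or -1 from the
// borrow of a preceding compare, and the final subtract combines the two.
static int32_t ucmpBorrowModel(uint32_t L, uint32_t R) {
  int32_t BorrowLR = (L < R) ? -1 : 0; // subs r2, r0, r1 ; sbc r2, r2
  int32_t BorrowRL = (R < L) ? -1 : 0; // cmp  r1, r0     ; sbc r1, r1
  return BorrowLR - BorrowRL;          // subs r0, r2, r1
}

int main() {
  assert(scmpModel(-3, 7) == -1 && scmpModel(7, -3) == 1 && scmpModel(5, 5) == 0);
  assert(ucmpBorrowModel(1, 2) == -1 && ucmpBorrowModel(2, 1) == 1 &&
         ucmpBorrowModel(9, 9) == 0);
  return 0;
}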
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 825145d..778595e 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -607,6 +607,8 @@ class VectorType;
bool preferZeroCompareBranch() const override { return true; }
+ bool shouldExpandCmpUsingSelects(EVT VT) const override;
+
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
bool hasAndNotCompare(SDValue V) const override {
@@ -683,8 +685,8 @@ class VectorType;
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices,
- unsigned Factor) const override;
+ ArrayRef<unsigned> Indices, unsigned Factor,
+ const APInt &GapMask) const override;
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
unsigned Factor) const override;
@@ -904,6 +906,7 @@ class VectorType;
void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index b4677a8..ebfa593 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -89,19 +89,15 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
AlignVariant = ALIGN1;
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
- Entry.Node = Dst;
- Args.push_back(Entry);
+ Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ Args.emplace_back(Dst, IntPtrTy);
if (AEABILibcall == AEABI_MEMCLR) {
- Entry.Node = Size;
- Args.push_back(Entry);
+ Args.emplace_back(Size, IntPtrTy);
} else if (AEABILibcall == AEABI_MEMSET) {
// Adjust parameters for memset, EABI uses format (ptr, size, value),
// GNU library uses (ptr, value, size)
// See RTABI section 4.3.4
- Entry.Node = Size;
- Args.push_back(Entry);
+ Args.emplace_back(Size, IntPtrTy);
// Extend or truncate the argument to be an i32 value for the call.
if (Src.getValueType().bitsGT(MVT::i32))
@@ -109,16 +105,13 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
else if (Src.getValueType().bitsLT(MVT::i32))
Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
- Entry.Node = Src;
- Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+ TargetLowering::ArgListEntry Entry(Src,
+ Type::getInt32Ty(*DAG.getContext()));
Entry.IsSExt = false;
Args.push_back(Entry);
} else {
- Entry.Node = Src;
- Args.push_back(Entry);
-
- Entry.Node = Size;
- Args.push_back(Entry);
+ Args.emplace_back(Src, IntPtrTy);
+ Args.emplace_back(Size, IntPtrTy);
}
static const RTLIB::Libcall FunctionImpls[4][3] = {
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 6f37eca..6b28541 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1084,9 +1084,10 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost(
CostKind, Op1Info, Op2Info, I);
}
-InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
- ScalarEvolution *SE,
- const SCEV *Ptr) const {
+InstructionCost
+ARMTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+ const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -1095,7 +1096,7 @@ InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
int MaxMergeDistance = 64;
if (ST->hasNEON()) {
- if (Ty->isVectorTy() && SE &&
+ if (PtrTy->isVectorTy() && SE &&
!BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
return NumVectorInstToHideOverhead;
@@ -1103,7 +1104,7 @@ InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
// addressing mode.
return 1;
}
- return BaseT::getAddressComputationCost(Ty, SE, Ptr);
+ return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
}
bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
@@ -1335,6 +1336,39 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
if (!Mask.empty()) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
+ // Check for LD2/LD4 instructions, which are represented in llvm IR as
+ // deinterleaving-shuffle(load). The shuffle cost could potentially be
+ // free, but we model it with a cost of LT.first so that LD2/LD4 have a
+ // higher cost than just the load.
+ if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
+ (LT.second.getScalarSizeInBits() == 8 ||
+ LT.second.getScalarSizeInBits() == 16 ||
+ LT.second.getScalarSizeInBits() == 32) &&
+ LT.second.getSizeInBits() == 128 &&
+ ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
+ ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 2)) ||
+ (TLI->getMaxSupportedInterleaveFactor() == 4 &&
+ ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4))))
+ return ST->getMVEVectorCostFactor(CostKind) *
+ std::max<InstructionCost>(1, LT.first / 4);
+
+ // Check for ST2/ST4 instructions, which are represented in LLVM IR as
+ // store(interleaving-shuffle). The shuffle cost could potentially be
+ // free, but we model it with a cost of LT.first so that ST2/ST4 have a
+ // higher cost than just the store.
+ if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
+ (LT.second.getScalarSizeInBits() == 8 ||
+ LT.second.getScalarSizeInBits() == 16 ||
+ LT.second.getScalarSizeInBits() == 32) &&
+ LT.second.getSizeInBits() == 128 &&
+ ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
+ ShuffleVectorInst::isInterleaveMask(
+ Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) ||
+ (TLI->getMaxSupportedInterleaveFactor() == 4 &&
+ ShuffleVectorInst::isInterleaveMask(
+ Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2))))
+ return ST->getMVEVectorCostFactor(CostKind) * LT.first;
+
if (LT.second.isVector() &&
Mask.size() <= LT.second.getVectorNumElements() &&
(isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 522c235..cdd8bcb 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -257,8 +257,9 @@ public:
unsigned Index, const Value *Op0,
const Value *Op1) const override;
- InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE,
- const SCEV *Ptr) const override;
+ InstructionCost
+ getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const override;
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index ece6c10..0e97483 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -3373,12 +3373,12 @@ public:
void addMSRMaskOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(unsigned(getMSRMask())));
+ Inst.addOperand(MCOperand::createImm(getMSRMask()));
}
void addBankedRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(unsigned(getBankedReg())));
+ Inst.addOperand(MCOperand::createImm(getBankedReg()));
}
void addProcIFlagsOperands(MCInst &Inst, unsigned N) const {
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 8ee3a2d..a5266a9 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -20,7 +20,6 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSymbolMachO.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
index 0b4e7df..5eeb4fe 100644
--- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -922,7 +922,7 @@ bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
// the function.
unsigned LastVPTImm = 0;
Register LastVPTReg = 0;
- SmallSet<MachineInstr *, 4> DeadInstructions;
+ SmallPtrSet<MachineInstr *, 4> DeadInstructions;
for (MachineInstr &Instr : MBB.instrs()) {
// Look for predicated MVE instructions.
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 25ad9ec..545bc3a 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -505,10 +505,9 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
SDValue InChain = DAG.getEntryNode();
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
for (SDValue const &Value : Op->op_values()) {
- Entry.Node = Value;
- Entry.Ty = Value.getValueType().getTypeForEVT(*DAG.getContext());
+ TargetLowering::ArgListEntry Entry(
+ Value, Value.getValueType().getTypeForEVT(*DAG.getContext()));
Entry.IsSExt = IsSigned;
Entry.IsZExt = !IsSigned;
Args.push_back(Entry);
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.h b/llvm/lib/Target/AVR/AVRISelLowering.h
index 2ae22b2..301ce9c 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.h
+++ b/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -94,6 +94,8 @@ public:
return ShiftLegalizationStrategy::LowerToLibcall;
}
+ bool softPromoteHalfType() const override { return true; }
+
private:
SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc,
SelectionDAG &DAG, SDLoc dl) const;
diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
index b75417a..fbd1484 100644
--- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -20,6 +20,7 @@
#include "AVR.h"
#include "AVRMachineFunctionInfo.h"
#include "AVRTargetObjectFile.h"
+#include "AVRTargetTransformInfo.h"
#include "MCTargetDesc/AVRMCTargetDesc.h"
#include "TargetInfo/AVRTargetInfo.h"
@@ -28,7 +29,7 @@
namespace llvm {
static const char *AVRDataLayout =
- "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8";
+ "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8:16-a:8";
/// Processes a CPU name.
static StringRef getCPU(StringRef CPU) {
@@ -62,7 +63,9 @@ namespace {
class AVRPassConfig : public TargetPassConfig {
public:
AVRPassConfig(AVRTargetMachine &TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ EnableLoopTermFold = true;
+ }
AVRTargetMachine &getAVRTargetMachine() const {
return getTM<AVRTargetMachine>();
@@ -107,6 +110,11 @@ const AVRSubtarget *AVRTargetMachine::getSubtargetImpl(const Function &) const {
return &SubTarget;
}
+TargetTransformInfo
+AVRTargetMachine::getTargetTransformInfo(const Function &F) const {
+ return TargetTransformInfo(std::make_unique<AVRTTIImpl>(this, F));
+}
+
MachineFunctionInfo *AVRTargetMachine::createMachineFunctionInfo(
BumpPtrAllocator &Allocator, const Function &F,
const TargetSubtargetInfo *STI) const {
diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.h b/llvm/lib/Target/AVR/AVRTargetMachine.h
index 167d007..9452b3d 100644
--- a/llvm/lib/Target/AVR/AVRTargetMachine.h
+++ b/llvm/lib/Target/AVR/AVRTargetMachine.h
@@ -48,6 +48,8 @@ public:
createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
const TargetSubtargetInfo *STI) const override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
+
bool isNoopAddrSpaceCast(unsigned SrcAs, unsigned DestAs) const override {
// While AVR has different address spaces, they are all represented by
// 16-bit pointers that can be freely casted between (of course, a pointer
diff --git a/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp b/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp
new file mode 100644
index 0000000..b1ef380
--- /dev/null
+++ b/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp
@@ -0,0 +1,24 @@
+//===-- AVRTargetTransformInfo.cpp - AVR specific TTI ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRTargetTransformInfo.h"
+
+using namespace llvm;
+
+bool AVRTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+ const TargetTransformInfo::LSRCost &C2) const {
+ // AVR-specific heuristic: instruction count has first priority.
+ // If we need to emit adds inside the loop to add up base registers, then
+ // we need at least one extra temporary register.
+ unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
+ unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
+ return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost, C1.NumIVMuls,
+ C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+ std::tie(C2.Insns, C2NumRegs, C2.AddRecCost, C2.NumIVMuls,
+ C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+}
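
A standalone sketch (not part of the patch) of what the comparison above does: instruction count decides first, and a candidate that needs base-register adds is charged one extra register before the register counts are compared. The struct is a stand-in for TargetTransformInfo::LSRCost.

#include <cassert>
#include <tuple>

struct LSRCostLite {
  unsigned Insns, NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
      ImmCost, SetupCost;
};

static bool avrStyleCostLess(const LSRCostLite &C1, const LSRCostLite &C2) {
  unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
  unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
  return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

int main() {
  LSRCostLite A{3, 4, 0, 0, 0, 0, 0, 0}; // fewer instructions wins...
  LSRCostLite B{4, 2, 0, 0, 0, 0, 0, 0}; // ...even with more registers in use
  assert(avrStyleCostLess(A, B) && !avrStyleCostLess(B, A));
  return 0;
}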
diff --git a/llvm/lib/Target/AVR/AVRTargetTransformInfo.h b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h
new file mode 100644
index 0000000..0daeeb8
--- /dev/null
+++ b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h
@@ -0,0 +1,51 @@
+//===- AVRTargetTransformInfo.h - AVR specific TTI --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a TargetTransformInfoImplBase conforming object specific
+/// to the AVR target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AVR_AVRTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_AVR_AVRTARGETTRANSFORMINFO_H
+
+#include "AVRSubtarget.h"
+#include "AVRTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/Function.h"
+#include <optional>
+
+namespace llvm {
+
+class AVRTTIImpl final : public BasicTTIImplBase<AVRTTIImpl> {
+ using BaseT = BasicTTIImplBase<AVRTTIImpl>;
+ using TTI = TargetTransformInfo;
+
+ friend BaseT;
+
+ const AVRSubtarget *ST;
+ const AVRTargetLowering *TLI;
+
+ const AVRSubtarget *getST() const { return ST; }
+ const AVRTargetLowering *getTLI() const { return TLI; }
+
+public:
+ explicit AVRTTIImpl(const AVRTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
+
+ bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+ const TargetTransformInfo::LSRCost &C2) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AVR_AVRTARGETTRANSFORMINFO_H
diff --git a/llvm/lib/Target/AVR/CMakeLists.txt b/llvm/lib/Target/AVR/CMakeLists.txt
index 781dac0..a31c545 100644
--- a/llvm/lib/Target/AVR/CMakeLists.txt
+++ b/llvm/lib/Target/AVR/CMakeLists.txt
@@ -29,11 +29,13 @@ add_llvm_target(AVRCodeGen
AVRSubtarget.cpp
AVRTargetMachine.cpp
AVRTargetObjectFile.cpp
+ AVRTargetTransformInfo.cpp
DEPENDS
intrinsics_gen
LINK_COMPONENTS
+ Analysis
AVRDesc
AVRInfo
AsmPrinter
@@ -44,6 +46,8 @@ add_llvm_target(AVRCodeGen
SelectionDAG
Support
Target
+ TargetParser
+ TransformUtils
ADD_TO_COMPONENT
AVR
diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
index e55d9b2..7885d93 100644
--- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
+++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
@@ -116,7 +116,7 @@ class CSKYConstantIslands : public MachineFunctionPass {
/// NewWaterList - The subset of WaterList that was created since the
/// previous iteration by inserting unconditional branches.
- SmallSet<MachineBasicBlock *, 4> NewWaterList;
+ SmallPtrSet<MachineBasicBlock *, 4> NewWaterList;
using water_iterator = std::vector<MachineBasicBlock *>::iterator;
diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
index 7070171..e5b4f6e 100644
--- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
+++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
@@ -1329,10 +1329,7 @@ SDValue CSKYTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N,
// Prepare argument list to generate call.
ArgListTy Args;
- ArgListEntry Entry;
- Entry.Node = Load;
- Entry.Ty = CallTy;
- Args.push_back(Entry);
+ Args.emplace_back(Load, CallTy);
// Setup call to __tls_get_addr.
TargetLowering::CallLoweringInfo CLI(DAG);
diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt
index c7c09ca..8100f94 100644
--- a/llvm/lib/Target/DirectX/CMakeLists.txt
+++ b/llvm/lib/Target/DirectX/CMakeLists.txt
@@ -49,6 +49,7 @@ add_llvm_target(DirectXCodeGen
DirectXInfo
DirectXPointerTypeAnalysis
FrontendHLSL
+ IPO
MC
ScalarOpts
SelectionDAG
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index 26a113d..a1ef257 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -162,8 +162,7 @@ void DXContainerGlobals::addRootSignature(Module &M,
auto &RSA = getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo();
const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry;
- const std::optional<mcdxbc::RootSignatureDesc> &RS =
- RSA.getDescForFunction(EntryFunction);
+ const mcdxbc::RootSignatureDesc *RS = RSA.getDescForFunction(EntryFunction);
if (!RS)
return;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 492e078..c65ead4 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -1108,11 +1108,11 @@ def RawBufferStore : DXILOp<140, rawBufferStore> {
def Dot2AddHalf : DXILOp<162, dot2AddHalf> {
let Doc = "2D half dot product with accumulate to float";
let intrinsics = [IntrinSelect<int_dx_dot2add>];
- let arguments = [FloatTy, HalfTy, HalfTy, HalfTy, HalfTy];
- let result = FloatTy;
- let overloads = [Overloads<DXIL1_0, []>];
- let stages = [Stages<DXIL1_0, [all_stages]>];
- let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+ let arguments = [OverloadTy, HalfTy, HalfTy, HalfTy, HalfTy];
+ let result = OverloadTy;
+ let overloads = [Overloads<DXIL1_4, [FloatTy]>];
+ let stages = [Stages<DXIL1_4, [all_stages]>];
+ let attributes = [Attributes<DXIL1_4, [ReadNone]>];
}
def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> {
diff --git a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
index 5f331db..13e3408 100644
--- a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
+++ b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
@@ -20,13 +20,13 @@ using namespace llvm;
static bool finalizeLinkage(Module &M) {
bool MadeChange = false;
- // Convert private global variables to internal linkage.
- for (GlobalVariable &GV : M.globals()) {
- if (GV.hasPrivateLinkage()) {
+ // Convert private globals and unused external globals to internal
+ // linkage.
+ for (GlobalVariable &GV : M.globals())
+ if (GV.hasPrivateLinkage() || (GV.hasExternalLinkage() && GV.use_empty())) {
GV.setLinkage(GlobalValue::InternalLinkage);
MadeChange = true;
}
- }
SmallVector<Function *> Funcs;
diff --git a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp
index 306db6a..695eacb 100644
--- a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp
+++ b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp
@@ -9,10 +9,13 @@
#include "DXILForwardHandleAccesses.h"
#include "DXILShaderFlags.h"
#include "DirectX.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/DXILResource.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsDirectX.h"
@@ -70,6 +73,7 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) {
DenseMap<GlobalVariable *, IntrinsicInst *> HandleMap;
SmallVector<LoadInst *> LoadsToProcess;
+ DenseMap<AllocaInst *, SmallVector<IntrinsicInst *>> LifeTimeIntrinsicMap;
for (BasicBlock &BB : F)
for (Instruction &Inst : BB)
if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) {
@@ -78,6 +82,14 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) {
case Intrinsic::dx_resource_handlefromimplicitbinding:
processHandle(II, HandleMap);
break;
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ if (II->arg_size() >= 1) {
+ Value *Ptr = II->getArgOperand(0);
+ if (auto *Alloca = dyn_cast<AllocaInst>(Ptr))
+ LifeTimeIntrinsicMap[Alloca].push_back(II);
+ }
+ break;
default:
continue;
}
@@ -98,8 +110,16 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) {
NestedLI, NestedLI->getParent(), BBI, 0, nullptr, nullptr);
GV = dyn_cast_or_null<GlobalVariable>(Loaded);
} else if (auto *NestedAlloca = dyn_cast<AllocaInst>(V)) {
- for (auto &Use : NestedAlloca->uses()) {
- auto *Store = dyn_cast<StoreInst>(Use.getUser());
+
+ if (auto It = LifeTimeIntrinsicMap.find(NestedAlloca);
+ It != LifeTimeIntrinsicMap.end()) {
+ llvm::for_each(It->second,
+ [](IntrinsicInst *II) { II->eraseFromParent(); });
+ LifeTimeIntrinsicMap.erase(It);
+ }
+
+ for (auto *User : NestedAlloca->users()) {
+ auto *Store = dyn_cast<StoreInst>(User);
if (!Store)
continue;
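
Condensed sketch of the new bookkeeping in this pass (helper names illustrative, types as in the diff): lifetime.start/end users are grouped per alloca up front, then erased once the alloca is known to only ever carry a resource handle.

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

using LifetimeMap = DenseMap<AllocaInst *, SmallVector<IntrinsicInst *>>;

static void recordLifetimeUser(IntrinsicInst *II, LifetimeMap &Map) {
  // In this form of the intrinsic the pointer is operand 0.
  if (II->arg_size() >= 1)
    if (auto *AI = dyn_cast<AllocaInst>(II->getArgOperand(0)))
      Map[AI].push_back(II);
}

static void dropLifetimeUsers(AllocaInst *AI, LifetimeMap &Map) {
  auto It = Map.find(AI);
  if (It == Map.end())
    return;
  for (IntrinsicInst *II : It->second)
    II->eraseFromParent();
  Map.erase(It);
}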
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 0ec15a6..bd421771 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -9,6 +9,7 @@
#include "DXILOpLowering.h"
#include "DXILConstants.h"
#include "DXILOpBuilder.h"
+#include "DXILRootSignature.h"
#include "DXILShaderFlags.h"
#include "DirectX.h"
#include "llvm/ADT/SmallVector.h"
@@ -746,7 +747,7 @@ public:
IRBuilder<> &IRB = OpBuilder.getIRB();
return replaceFunction(F, [&](CallInst *CI) -> Error {
IRB.SetInsertPoint(CI);
- Value *Ptr = CI->getArgOperand(1);
+ Value *Ptr = CI->getArgOperand(0);
assert(Ptr->getType()->isPointerTy() &&
"Expected operand of lifetime intrinsic to be a pointer");
@@ -918,6 +919,7 @@ PreservedAnalyses DXILOpLowering::run(Module &M, ModuleAnalysisManager &MAM) {
PA.preserve<DXILResourceAnalysis>();
PA.preserve<DXILMetadataAnalysis>();
PA.preserve<ShaderFlagsAnalysis>();
+ PA.preserve<RootSignatureAnalysis>();
return PA;
}
@@ -945,6 +947,7 @@ public:
AU.addPreserved<DXILResourceWrapperPass>();
AU.addPreserved<DXILMetadataAnalysisWrapperPass>();
AU.addPreserved<ShaderFlagsAnalysisWrapper>();
+ AU.addPreserved<RootSignatureAnalysisWrapper>();
}
};
char DXILOpLoweringLegacy::ID = 0;
diff --git a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp
index 398dcbb..be2c7d1 100644
--- a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp
+++ b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "DXILPostOptimizationValidation.h"
+#include "DXILRootSignature.h"
#include "DXILShaderFlags.h"
#include "DirectX.h"
#include "llvm/ADT/SmallString.h"
@@ -17,13 +18,44 @@
#include "llvm/IR/IntrinsicsDirectX.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Support/DXILABI.h"
#define DEBUG_TYPE "dxil-post-optimization-validation"
using namespace llvm;
using namespace llvm::dxil;
-namespace {
+static ResourceClass toResourceClass(dxbc::DescriptorRangeType RangeType) {
+ using namespace dxbc;
+ switch (RangeType) {
+ case DescriptorRangeType::SRV:
+ return ResourceClass::SRV;
+ case DescriptorRangeType::UAV:
+ return ResourceClass::UAV;
+ case DescriptorRangeType::CBV:
+ return ResourceClass::CBuffer;
+ case DescriptorRangeType::Sampler:
+ return ResourceClass::Sampler;
+ }
+ llvm_unreachable("Unknown DescriptorRangeType");
+}
+
+static ResourceClass toResourceClass(dxbc::RootParameterType Type) {
+ using namespace dxbc;
+ switch (Type) {
+ case RootParameterType::Constants32Bit:
+ return ResourceClass::CBuffer;
+ case RootParameterType::SRV:
+ return ResourceClass::SRV;
+ case RootParameterType::UAV:
+ return ResourceClass::UAV;
+ case RootParameterType::CBV:
+ return ResourceClass::CBuffer;
+ case dxbc::RootParameterType::DescriptorTable:
+ llvm_unreachable("DescriptorTable is not convertible to ResourceClass");
+ }
+ llvm_unreachable("Unknown RootParameterType");
+}
static void reportInvalidDirection(Module &M, DXILResourceMap &DRM) {
for (const auto &UAV : DRM.uavs()) {
@@ -63,9 +95,7 @@ static void reportOverlappingError(Module &M, ResourceInfo R1,
}
static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) {
- if (DRM.empty())
- return;
-
+ bool ErrorFound = false;
for (const auto &ResList :
{DRM.srvs(), DRM.uavs(), DRM.cbuffers(), DRM.samplers()}) {
if (ResList.empty())
@@ -77,15 +107,136 @@ static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) {
while (RI != ResList.end() &&
PrevRI->getBinding().overlapsWith(RI->getBinding())) {
reportOverlappingError(M, *PrevRI, *RI);
+ ErrorFound = true;
RI++;
}
PrevRI = CurrentRI;
}
}
+ assert(ErrorFound && "this function should be called only when "
+ "DXILResourceBindingInfo::hasOverlappingBinding() is "
+ "true, yet no overlapping binding was found");
+}
+
+static void
+reportOverlappingRegisters(Module &M,
+ const llvm::hlsl::BindingInfoBuilder::Binding &R1,
+ const llvm::hlsl::BindingInfoBuilder::Binding &R2) {
+ SmallString<128> Message;
+
+ raw_svector_ostream OS(Message);
+ OS << "resource " << getResourceClassName(R1.RC) << " (space=" << R1.Space
+ << ", registers=[" << R1.LowerBound << ", " << R1.UpperBound
+ << "]) overlaps with resource " << getResourceClassName(R2.RC)
+ << " (space=" << R2.Space << ", registers=[" << R2.LowerBound << ", "
+ << R2.UpperBound << "])";
+ M.getContext().diagnose(DiagnosticInfoGeneric(Message));
+}
+
+static dxbc::ShaderVisibility
+tripleToVisibility(llvm::Triple::EnvironmentType ET) {
+ switch (ET) {
+ case Triple::Pixel:
+ return dxbc::ShaderVisibility::Pixel;
+ case Triple::Vertex:
+ return dxbc::ShaderVisibility::Vertex;
+ case Triple::Geometry:
+ return dxbc::ShaderVisibility::Geometry;
+ case Triple::Hull:
+ return dxbc::ShaderVisibility::Hull;
+ case Triple::Domain:
+ return dxbc::ShaderVisibility::Domain;
+ case Triple::Mesh:
+ return dxbc::ShaderVisibility::Mesh;
+ case Triple::Compute:
+ return dxbc::ShaderVisibility::All;
+ default:
+ llvm_unreachable("Invalid triple to shader stage conversion");
+ }
+}
+
+static void validateRootSignature(Module &M,
+ const mcdxbc::RootSignatureDesc &RSD,
+ dxil::ModuleMetadataInfo &MMI) {
+
+ hlsl::BindingInfoBuilder Builder;
+ dxbc::ShaderVisibility Visibility = tripleToVisibility(MMI.ShaderProfile);
+
+ for (const mcdxbc::RootParameterInfo &ParamInfo : RSD.ParametersContainer) {
+ dxbc::ShaderVisibility ParamVisibility =
+ static_cast<dxbc::ShaderVisibility>(ParamInfo.Header.ShaderVisibility);
+ if (ParamVisibility != dxbc::ShaderVisibility::All &&
+ ParamVisibility != Visibility)
+ continue;
+ dxbc::RootParameterType ParamType =
+ static_cast<dxbc::RootParameterType>(ParamInfo.Header.ParameterType);
+ switch (ParamType) {
+ case dxbc::RootParameterType::Constants32Bit: {
+ dxbc::RTS0::v1::RootConstants Const =
+ RSD.ParametersContainer.getConstant(ParamInfo.Location);
+ Builder.trackBinding(dxil::ResourceClass::CBuffer, Const.RegisterSpace,
+ Const.ShaderRegister, Const.ShaderRegister,
+ &ParamInfo);
+ break;
+ }
+
+ case dxbc::RootParameterType::SRV:
+ case dxbc::RootParameterType::UAV:
+ case dxbc::RootParameterType::CBV: {
+ dxbc::RTS0::v2::RootDescriptor Desc =
+ RSD.ParametersContainer.getRootDescriptor(ParamInfo.Location);
+ Builder.trackBinding(toResourceClass(static_cast<dxbc::RootParameterType>(
+ ParamInfo.Header.ParameterType)),
+ Desc.RegisterSpace, Desc.ShaderRegister,
+ Desc.ShaderRegister, &ParamInfo);
+
+ break;
+ }
+ case dxbc::RootParameterType::DescriptorTable: {
+ const mcdxbc::DescriptorTable &Table =
+ RSD.ParametersContainer.getDescriptorTable(ParamInfo.Location);
+
+ for (const dxbc::RTS0::v2::DescriptorRange &Range : Table.Ranges) {
+ uint32_t UpperBound =
+ Range.NumDescriptors == ~0U
+ ? Range.BaseShaderRegister
+ : Range.BaseShaderRegister + Range.NumDescriptors - 1;
+ Builder.trackBinding(
+ toResourceClass(
+ static_cast<dxbc::DescriptorRangeType>(Range.RangeType)),
+ Range.RegisterSpace, Range.BaseShaderRegister, UpperBound,
+ &ParamInfo);
+ }
+ break;
+ }
+ }
+ }
+
+ for (const dxbc::RTS0::v1::StaticSampler &S : RSD.StaticSamplers)
+ Builder.trackBinding(dxil::ResourceClass::Sampler, S.RegisterSpace,
+ S.ShaderRegister, S.ShaderRegister, &S);
+
+ Builder.calculateBindingInfo(
+ [&M](const llvm::hlsl::BindingInfoBuilder &Builder,
+ const llvm::hlsl::BindingInfoBuilder::Binding &ReportedBinding) {
+ const llvm::hlsl::BindingInfoBuilder::Binding &Overlapping =
+ Builder.findOverlapping(ReportedBinding);
+ reportOverlappingRegisters(M, ReportedBinding, Overlapping);
+ });
+}
+
+static mcdxbc::RootSignatureDesc *
+getRootSignature(RootSignatureBindingInfo &RSBI,
+ dxil::ModuleMetadataInfo &MMI) {
+ if (MMI.EntryPropertyVec.size() == 0)
+ return nullptr;
+ return RSBI.getDescForFunction(MMI.EntryPropertyVec[0].Entry);
}
static void reportErrors(Module &M, DXILResourceMap &DRM,
- DXILResourceBindingInfo &DRBI) {
+ DXILResourceBindingInfo &DRBI,
+ RootSignatureBindingInfo &RSBI,
+ dxil::ModuleMetadataInfo &MMI) {
if (DRM.hasInvalidCounterDirection())
reportInvalidDirection(M, DRM);
@@ -94,14 +245,19 @@ static void reportErrors(Module &M, DXILResourceMap &DRM,
assert(!DRBI.hasImplicitBinding() && "implicit bindings should be handled in "
"DXILResourceImplicitBinding pass");
+
+ if (mcdxbc::RootSignatureDesc *RSD = getRootSignature(RSBI, MMI))
+ validateRootSignature(M, *RSD, MMI);
}
-} // namespace
PreservedAnalyses
DXILPostOptimizationValidation::run(Module &M, ModuleAnalysisManager &MAM) {
DXILResourceMap &DRM = MAM.getResult<DXILResourceAnalysis>(M);
DXILResourceBindingInfo &DRBI = MAM.getResult<DXILResourceBindingAnalysis>(M);
- reportErrors(M, DRM, DRBI);
+ RootSignatureBindingInfo &RSBI = MAM.getResult<RootSignatureAnalysis>(M);
+ ModuleMetadataInfo &MMI = MAM.getResult<DXILMetadataAnalysis>(M);
+
+ reportErrors(M, DRM, DRBI, RSBI, MMI);
return PreservedAnalyses::all();
}
@@ -113,7 +269,12 @@ public:
getAnalysis<DXILResourceWrapperPass>().getResourceMap();
DXILResourceBindingInfo &DRBI =
getAnalysis<DXILResourceBindingWrapperPass>().getBindingInfo();
- reportErrors(M, DRM, DRBI);
+ RootSignatureBindingInfo &RSBI =
+ getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo();
+ dxil::ModuleMetadataInfo &MMI =
+ getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
+
+ reportErrors(M, DRM, DRBI, RSBI, MMI);
return false;
}
StringRef getPassName() const override {
@@ -125,10 +286,13 @@ public:
void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
AU.addRequired<DXILResourceWrapperPass>();
AU.addRequired<DXILResourceBindingWrapperPass>();
+ AU.addRequired<DXILMetadataAnalysisWrapperPass>();
+ AU.addRequired<RootSignatureAnalysisWrapper>();
AU.addPreserved<DXILResourceWrapperPass>();
AU.addPreserved<DXILResourceBindingWrapperPass>();
AU.addPreserved<DXILMetadataAnalysisWrapperPass>();
AU.addPreserved<ShaderFlagsAnalysisWrapper>();
+ AU.addPreserved<RootSignatureAnalysisWrapper>();
}
};
char DXILPostOptimizationValidationLegacy::ID = 0;
@@ -139,6 +303,8 @@ INITIALIZE_PASS_BEGIN(DXILPostOptimizationValidationLegacy, DEBUG_TYPE,
INITIALIZE_PASS_DEPENDENCY(DXILResourceBindingWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper)
INITIALIZE_PASS_END(DXILPostOptimizationValidationLegacy, DEBUG_TYPE,
"DXIL Post Optimization Validation", false, false)
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h
index 254b7ff..b990b6c 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -43,13 +43,11 @@ public:
iterator end() { return FuncToRsMap.end(); }
- std::optional<mcdxbc::RootSignatureDesc>
- getDescForFunction(const Function *F) {
+ mcdxbc::RootSignatureDesc *getDescForFunction(const Function *F) {
const auto FuncRs = find(F);
if (FuncRs == end())
- return std::nullopt;
-
- return FuncRs->second;
+ return nullptr;
+ return &FuncRs->second;
}
};
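
A short usage sketch under the new pointer-returning API (include path and namespaces assumed from the surrounding DirectX sources); callers now test with a plain null check and no RootSignatureDesc is copied per query:

#include "DXILRootSignature.h"
using namespace llvm;
using namespace llvm::dxil;

static bool entryHasRootSignature(RootSignatureBindingInfo &RSBI,
                                  const Function *Entry) {
  const mcdxbc::RootSignatureDesc *RS = RSBI.getDescForFunction(Entry);
  return RS != nullptr; // previously: getDescForFunction(F).has_value()
}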
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp
index 1bd5dd7..1eb03bf 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp
@@ -13,11 +13,15 @@
#include "DXILWriterPass.h"
#include "DXILBitcodeWriter.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
@@ -54,49 +58,81 @@ public:
};
static void legalizeLifetimeIntrinsics(Module &M) {
- for (Function &F : M) {
- Intrinsic::ID IID = F.getIntrinsicID();
- if (IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end)
+ LLVMContext &Ctx = M.getContext();
+ Type *I64Ty = IntegerType::get(Ctx, 64);
+ Type *PtrTy = PointerType::get(Ctx, 0);
+ Intrinsic::ID LifetimeIIDs[2] = {Intrinsic::lifetime_start,
+ Intrinsic::lifetime_end};
+ for (Intrinsic::ID &IID : LifetimeIIDs) {
+ Function *F = M.getFunction(Intrinsic::getName(IID, {PtrTy}, &M));
+ if (!F)
continue;
- // Lifetime intrinsics in LLVM 3.7 do not have the memory FnAttr
- F.removeFnAttr(Attribute::Memory);
-
- // Lifetime intrinsics in LLVM 3.7 do not have mangled names
- F.setName(Intrinsic::getBaseName(IID));
-
- // LLVM 3.7 Lifetime intrinics require an i8* operand, so we insert bitcasts
- // to ensure that is the case
- for (auto *User : make_early_inc_range(F.users())) {
- CallInst *CI = dyn_cast<CallInst>(User);
- assert(CI && "Expected user of a lifetime intrinsic function to be a "
- "lifetime intrinsic call");
- Value *PtrOperand = CI->getArgOperand(1);
- PointerType *PtrTy = cast<PointerType>(PtrOperand->getType());
+ // Get or insert an LLVM 3.7-compliant lifetime intrinsic function of the
+ // form `void @llvm.lifetime.[start/end](i64, ptr)` with the NoUnwind
+ // attribute
+ AttributeList Attr;
+ Attr = Attr.addFnAttribute(Ctx, Attribute::NoUnwind);
+ FunctionCallee LifetimeCallee = M.getOrInsertFunction(
+ Intrinsic::getBaseName(IID), Attr, Type::getVoidTy(Ctx), I64Ty, PtrTy);
+
+ // Replace all calls to lifetime intrinsics with calls to the
+ // LLVM 3.7-compliant version of the lifetime intrinsic
+ for (User *U : make_early_inc_range(F->users())) {
+ CallInst *CI = dyn_cast<CallInst>(U);
+ assert(CI &&
+ "Expected user of a lifetime intrinsic function to be a CallInst");
+
+ // LLVM 3.7 lifetime intrinsics require an i8* operand, so we insert
+ // a bitcast to ensure that is the case
+ Value *PtrOperand = CI->getArgOperand(0);
+ PointerType *PtrOpPtrTy = cast<PointerType>(PtrOperand->getType());
Value *NoOpBitCast = CastInst::Create(Instruction::BitCast, PtrOperand,
- PtrTy, "", CI->getIterator());
- CI->setArgOperand(1, NoOpBitCast);
+ PtrOpPtrTy, "", CI->getIterator());
+
+ // LLVM 3.7 lifetime intrinsics have an explicit size operand, whose value
+ // we can obtain from the pointer operand which must be an AllocaInst (as
+ // of https://github.com/llvm/llvm-project/pull/149310)
+ AllocaInst *AI = dyn_cast<AllocaInst>(PtrOperand);
+ assert(AI &&
+ "The pointer operand of a lifetime intrinsic call must be an "
+ "AllocaInst");
+ std::optional<TypeSize> AllocSize =
+ AI->getAllocationSize(CI->getDataLayout());
+ assert(AllocSize.has_value() &&
+ "Expected the allocation size of AllocaInst to be known");
+ CallInst *NewCI = CallInst::Create(
+ LifetimeCallee,
+ {ConstantInt::get(I64Ty, AllocSize.value().getFixedValue()),
+ NoOpBitCast},
+ "", CI->getIterator());
+ for (Attribute ParamAttr : CI->getParamAttributes(0))
+ NewCI->addParamAttr(1, ParamAttr);
+
+ CI->eraseFromParent();
}
+
+ F->eraseFromParent();
}
}
static void removeLifetimeIntrinsics(Module &M) {
- for (Function &F : make_early_inc_range(M)) {
- if (Intrinsic::ID IID = F.getIntrinsicID();
- IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end)
+ Intrinsic::ID LifetimeIIDs[2] = {Intrinsic::lifetime_start,
+ Intrinsic::lifetime_end};
+ for (Intrinsic::ID &IID : LifetimeIIDs) {
+ Function *F = M.getFunction(Intrinsic::getBaseName(IID));
+ if (!F)
continue;
- for (User *U : make_early_inc_range(F.users())) {
- LifetimeIntrinsic *LI = dyn_cast<LifetimeIntrinsic>(U);
- assert(LI && "Expected user of lifetime intrinsic function to be "
- "a LifetimeIntrinsic instruction");
- BitCastInst *BCI = dyn_cast<BitCastInst>(LI->getArgOperand(1));
- assert(BCI && "Expected pointer operand of LifetimeIntrinsic to be a "
- "BitCastInst");
- LI->eraseFromParent();
+ for (User *U : make_early_inc_range(F->users())) {
+ CallInst *CI = dyn_cast<CallInst>(U);
+ assert(CI && "Expected user of lifetime function to be a CallInst");
+ BitCastInst *BCI = dyn_cast<BitCastInst>(CI->getArgOperand(1));
+ assert(BCI && "Expected pointer operand of CallInst to be a BitCastInst");
+ CI->eraseFromParent();
BCI->eraseFromParent();
}
- F.eraseFromParent();
+ F->eraseFromParent();
}
}
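
Condensed sketch of the per-call rewrite performed above (helper name illustrative; LegacyCallee stands for the getOrInsertFunction result shown in the patch): the explicit i64 size is recovered from the alloca and an LLVM 3.7-style void @llvm.lifetime.start/end(i64, i8*) call replaces the size-less one.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static void rewriteOneLifetimeCall(Module &M, CallInst *CI,
                                   FunctionCallee LegacyCallee) {
  // The pointer is operand 0 and must be an alloca, so its size is known.
  auto *AI = cast<AllocaInst>(CI->getArgOperand(0));
  uint64_t Size = AI->getAllocationSize(M.getDataLayout())->getFixedValue();
  // Keep an explicit (no-op) bitcast so the DXIL writer sees an i8*-shaped
  // operand, mirroring the code above.
  Value *Cast = CastInst::Create(Instruction::BitCast, AI, AI->getType(), "",
                                 CI->getIterator());
  CallInst::Create(LegacyCallee,
                   {ConstantInt::get(Type::getInt64Ty(M.getContext()), Size),
                    Cast},
                   "", CI->getIterator());
  CI->eraseFromParent();
}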
diff --git a/llvm/lib/Target/DirectX/DirectXPassRegistry.def b/llvm/lib/Target/DirectX/DirectXPassRegistry.def
index d506954..b4b48a16 100644
--- a/llvm/lib/Target/DirectX/DirectXPassRegistry.def
+++ b/llvm/lib/Target/DirectX/DirectXPassRegistry.def
@@ -24,6 +24,7 @@ MODULE_ANALYSIS("dxil-root-signature-analysis", dxil::RootSignatureAnalysis())
#define MODULE_PASS(NAME, CREATE_PASS)
#endif
MODULE_PASS("dxil-cbuffer-access", DXILCBufferAccess())
+MODULE_PASS("dxil-finalize-linkage", DXILFinalizeLinkage())
MODULE_PASS("dxil-data-scalarization", DXILDataScalarization())
MODULE_PASS("dxil-flatten-arrays", DXILFlattenArrays())
MODULE_PASS("dxil-intrinsic-expansion", DXILIntrinsicExpansion())
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index 84751d2..f5d5a73 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -14,6 +14,7 @@
#include "DirectXTargetMachine.h"
#include "DXILCBufferAccess.h"
#include "DXILDataScalarization.h"
+#include "DXILFinalizeLinkage.h"
#include "DXILFlattenArrays.h"
#include "DXILForwardHandleAccesses.h"
#include "DXILIntrinsicExpansion.h"
@@ -45,6 +46,8 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/Scalarizer.h"
#include <optional>
@@ -62,6 +65,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
initializeEmbedDXILPassPass(*PR);
initializeWriteDXILPassPass(*PR);
initializeDXContainerGlobalsPass(*PR);
+ initializeGlobalDCELegacyPassPass(*PR);
initializeDXILOpLoweringLegacyPass(*PR);
initializeDXILResourceAccessLegacyPass(*PR);
initializeDXILResourceImplicitBindingLegacyPass(*PR);
@@ -72,6 +76,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
initializeDXILFinalizeLinkageLegacyPass(*PR);
initializeDXILPrettyPrinterLegacyPass(*PR);
initializeDXILForwardHandleAccessesLegacyPass(*PR);
+ initializeDSELegacyPassPass(*PR);
initializeDXILCBufferAccessLegacyPass(*PR);
}
@@ -103,6 +108,7 @@ public:
FunctionPass *createTargetRegisterAllocator(bool) override { return nullptr; }
void addCodeGenPrepare() override {
addPass(createDXILFinalizeLinkageLegacyPass());
+ addPass(createGlobalDCEPass());
addPass(createDXILResourceAccessLegacyPass());
addPass(createDXILIntrinsicExpansionLegacyPass());
addPass(createDXILCBufferAccessLegacyPass());
@@ -112,6 +118,7 @@ public:
addPass(createScalarizerPass(DxilScalarOptions));
addPass(createDXILFlattenArraysLegacyPass());
addPass(createDXILForwardHandleAccessesLegacyPass());
+ addPass(createDeadStoreEliminationPass());
addPass(createDXILLegalizeLegacyPass());
addPass(createDXILResourceImplicitBindingLegacyPass());
addPass(createDXILTranslateMetadataLegacyPass());
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 22cff7c..bcddb54 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -526,6 +526,9 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
MI.insert(MI.begin() + 1,
MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
break;
+ case Hexagon::Y4_crswap10:
+ MI.addOperand(MCOperand::createReg(Hexagon::SGP1_0));
+ break;
default:
break;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/llvm/lib/Target/Hexagon/HexagonCallingConv.td
index e0302b8..fd6d873 100644
--- a/llvm/lib/Target/Hexagon/HexagonCallingConv.td
+++ b/llvm/lib/Target/Hexagon/HexagonCallingConv.td
@@ -6,11 +6,6 @@
//
//===----------------------------------------------------------------------===//
-class CCIfArgIsVarArg<CCAction A>
- : CCIf<"State.isVarArg() && "
- "ValNo >= static_cast<HexagonCCState&>(State)"
- ".getNumNamedVarArgParams()", A>;
-
def CC_HexagonStack: CallingConv<[
CCIfType<[i32,v2i16,v4i8],
CCAssignToStack<4,4>>,
@@ -28,7 +23,7 @@ def CC_Hexagon_Legacy: CallingConv<[
CCIfByVal<
CCPassByVal<8,8>>,
- CCIfArgIsVarArg<
+ CCIfArgVarArg<
CCDelegateTo<CC_HexagonStack>>,
// Pass split values in pairs, allocate odd register if necessary.
@@ -58,7 +53,7 @@ def CC_Hexagon: CallingConv<[
CCIfByVal<
CCPassByVal<8,1>>,
- CCIfArgIsVarArg<
+ CCIfArgVarArg<
CCDelegateTo<CC_HexagonStack>>,
// Pass split values in pairs, allocate odd register if necessary.
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index a920146..b2218ab 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -1273,7 +1273,7 @@ void HexagonGenInsert::selectCandidates() {
for (unsigned R = AllRMs.find_first(); R; R = AllRMs.find_next(R)) {
using use_iterator = MachineRegisterInfo::use_nodbg_iterator;
- using InstrSet = SmallSet<const MachineInstr *, 16>;
+ using InstrSet = SmallPtrSet<const MachineInstr *, 16>;
InstrSet UIs;
// Count as the number of instructions in which R is used, not the
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index facea64..c54b67c 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -116,23 +116,6 @@ static cl::opt<bool>
cl::desc("Disable minimum alignment of 1 for "
"arguments passed by value on stack"));
-namespace {
-
- class HexagonCCState : public CCState {
- unsigned NumNamedVarArgParams = 0;
-
- public:
- HexagonCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF,
- SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
- unsigned NumNamedArgs)
- : CCState(CC, IsVarArg, MF, locs, C),
- NumNamedVarArgParams(NumNamedArgs) {}
- unsigned getNumNamedVarArgParams() const { return NumNamedVarArgParams; }
- };
-
-} // end anonymous namespace
-
-
// Implement calling convention for Hexagon.
static bool CC_SkipOdd(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
@@ -497,7 +480,6 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachineFrameInfo &MFI = MF.getFrameInfo();
auto PtrVT = getPointerTy(MF.getDataLayout());
- unsigned NumParams = CLI.CB ? CLI.CB->getFunctionType()->getNumParams() : 0;
if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee))
Callee = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, MVT::i32);
@@ -506,8 +488,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- HexagonCCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, *DAG.getContext(),
- NumParams);
+ CCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, *DAG.getContext());
if (Subtarget.useHVXOps())
CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_HVX);
@@ -880,9 +861,7 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- HexagonCCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs,
- *DAG.getContext(),
- MF.getFunction().getFunctionType()->getNumParams());
+ CCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, *DAG.getContext());
if (Subtarget.useHVXOps())
CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon_HVX);
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index c34eecd..a3717bb 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -2289,7 +2289,7 @@ CleanupAndExit:
// the instructions in Insts are removed.
bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
SmallVectorImpl<Instruction*> &Insts) const {
- SmallSet<BasicBlock*,8> LoopBlocks;
+ SmallPtrSet<BasicBlock *, 8> LoopBlocks;
LoopBlocks.insert_range(L->blocks());
SetVector<Instruction *> Worklist(llvm::from_range, Insts);
diff --git a/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index 610a81f..33aa6e4 100644
--- a/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -32,14 +32,10 @@ SDValue HexagonSelectionDAGInfo::EmitTargetCodeForMemcpy(
//
const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering();
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
- Entry.Node = Dst;
- Args.push_back(Entry);
- Entry.Node = Src;
- Args.push_back(Entry);
- Entry.Node = Size;
- Args.push_back(Entry);
+ Type *ArgTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ Args.emplace_back(Dst, ArgTy);
+ Args.emplace_back(Src, ArgTy);
+ Args.emplace_back(Size, ArgTy);
const char *SpecialMemcpyName = TLI.getLibcallName(
RTLIB::HEXAGON_MEMCPY_LIKELY_ALIGNED_MIN32BYTES_MULT8BYTES);
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index ecc1b5d..6a05b5a 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -445,8 +445,8 @@ void HexagonSubtarget::adjustSchedDependency(
const HexagonInstrInfo *QII = getInstrInfo();
// Instructions with .new operands have zero latency.
- SmallSet<SUnit *, 4> ExclSrc;
- SmallSet<SUnit *, 4> ExclDst;
+ SmallPtrSet<SUnit *, 4> ExclSrc;
+ SmallPtrSet<SUnit *, 4> ExclDst;
if (QII->canExecuteInBundle(*SrcInst, *DstInst) &&
isBestZeroLatency(Src, Dst, QII, ExclSrc, ExclDst)) {
Dep.setLatency(0);
@@ -630,9 +630,9 @@ static SUnit *getZeroLatency(SUnit *N, SmallVector<SDep, 4> &Deps) {
// together with a zero latency. Only one dependence should have a zero
// latency. If there are multiple choices, choose the best, and change
// the others, if needed.
-bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst,
- const HexagonInstrInfo *TII, SmallSet<SUnit*, 4> &ExclSrc,
- SmallSet<SUnit*, 4> &ExclDst) const {
+bool HexagonSubtarget::isBestZeroLatency(
+ SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII,
+ SmallPtrSet<SUnit *, 4> &ExclSrc, SmallPtrSet<SUnit *, 4> &ExclDst) const {
MachineInstr &SrcInst = *Src->getInstr();
MachineInstr &DstInst = *Dst->getInstr();
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index 41555db..b111471 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -366,7 +366,8 @@ private:
void restoreLatency(SUnit *Src, SUnit *Dst) const;
void changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat) const;
bool isBestZeroLatency(SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII,
- SmallSet<SUnit*, 4> &ExclSrc, SmallSet<SUnit*, 4> &ExclDst) const;
+ SmallPtrSet<SUnit *, 4> &ExclSrc,
+ SmallPtrSet<SUnit *, 4> &ExclDst) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 9fb7d47..171e294 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -156,9 +156,10 @@ HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
-InstructionCost HexagonTTIImpl::getAddressComputationCost(Type *Tp,
- ScalarEvolution *SE,
- const SCEV *S) const {
+InstructionCost
+HexagonTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+ const SCEV *S,
+ TTI::TargetCostKind CostKind) const {
return 0;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index af8dede7..dbf16c9 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -111,8 +111,9 @@ public:
InstructionCost
getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) const override;
- InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
- const SCEV *S) const override;
+ InstructionCost
+ getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *S,
+ TTI::TargetCostKind CostKind) const override;
InstructionCost getMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index 3de6df5..87d052b 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -1677,9 +1677,9 @@ auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {
return m_CombineOr(m_LShr(V, S), m_AShr(V, S));
};
- const APInt *Qn = nullptr;
- if (Value * T; match(Exp, m_Shr(m_Value(T), m_APInt(Qn)))) {
- Op.Frac = Qn->getZExtValue();
+ uint64_t Qn = 0;
+ if (Value *T; match(Exp, m_Shr(m_Value(T), m_ConstantInt(Qn)))) {
+ Op.Frac = Qn;
Exp = T;
} else {
Op.Frac = 0;
@@ -1689,9 +1689,9 @@ auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {
return std::nullopt;
// Check if there is rounding added.
- const APInt *C = nullptr;
- if (Value * T; Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_APInt(C)))) {
- uint64_t CV = C->getZExtValue();
+ uint64_t CV;
+ if (Value *T;
+ Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_ConstantInt(CV)))) {
if (CV != 0 && !isPowerOf2_64(CV))
return std::nullopt;
if (CV != 0)
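
The matcher change swaps APInt capture for the 64-bit convenience binder. A standalone sketch of m_ConstantInt(uint64_t &) outside the Hexagon-specific helpers (function name illustrative):

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

static bool matchShrByConst(Value *V, Value *&Base, uint64_t &ShiftAmt) {
  // Matches (lshr X, C) or (ashr X, C) and binds C directly to an integer;
  // ConstantInts wider than 64 bits are rejected by the binder itself.
  return match(V, m_Shr(m_Value(Base), m_ConstantInt(ShiftAmt)));
}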
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index 039ef4f..6b8d7f1 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -32,7 +32,6 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/HexagonAttributes.h"
diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index d23c5f4..7a0a510 100644
--- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -356,12 +356,13 @@ void LanaiTargetLowering::LowerAsmOperandForConstraint(
static unsigned NumFixedArgs;
static bool CC_Lanai32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State) {
// Handle fixed arguments with default CC.
// Note: Both the default and fast CC handle VarArg the same and hence the
// calling convention of the function is not considered here.
if (ValNo < NumFixedArgs) {
- return CC_Lanai32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
+ return CC_Lanai32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, OrigTy, State);
}
// Promote i8/i16 args to i32
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 6583a0f..5b2d185 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RuntimeLibcallUtil.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IRBuilder.h"
@@ -2786,7 +2787,7 @@ SDValue LoongArchTargetLowering::lowerUINT_TO_FP(SDValue Op,
EVT RetVT = Op.getValueType();
RTLIB::Libcall LC = RTLIB::getUINTTOFP(OpVT, RetVT);
MakeLibCallOptions CallOptions;
- CallOptions.setTypeListBeforeSoften(OpVT, RetVT, true);
+ CallOptions.setTypeListBeforeSoften(OpVT, RetVT);
SDValue Chain = SDValue();
SDValue Result;
std::tie(Result, Chain) =
@@ -2811,7 +2812,7 @@ SDValue LoongArchTargetLowering::lowerSINT_TO_FP(SDValue Op,
EVT RetVT = Op.getValueType();
RTLIB::Libcall LC = RTLIB::getSINTTOFP(OpVT, RetVT);
MakeLibCallOptions CallOptions;
- CallOptions.setTypeListBeforeSoften(OpVT, RetVT, true);
+ CallOptions.setTypeListBeforeSoften(OpVT, RetVT);
SDValue Chain = SDValue();
SDValue Result;
std::tie(Result, Chain) =
@@ -3037,10 +3038,7 @@ SDValue LoongArchTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N,
// Prepare argument list to generate call.
ArgListTy Args;
- ArgListEntry Entry;
- Entry.Node = Load;
- Entry.Ty = CallTy;
- Args.push_back(Entry);
+ Args.emplace_back(Load, CallTy);
// Setup call to __tls_get_addr.
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -4107,7 +4105,7 @@ void LoongArchTargetLowering::ReplaceNodeResults(
LC = RTLIB::getFPTOSINT(Src.getValueType(), VT);
MakeLibCallOptions CallOptions;
EVT OpVT = Src.getValueType();
- CallOptions.setTypeListBeforeSoften(OpVT, VT, true);
+ CallOptions.setTypeListBeforeSoften(OpVT, VT);
SDValue Chain = SDValue();
SDValue Result;
std::tie(Result, Chain) =
@@ -4360,7 +4358,7 @@ void LoongArchTargetLowering::ReplaceNodeResults(
RTLIB::Libcall LC =
OpVT == MVT::f64 ? RTLIB::LROUND_F64 : RTLIB::LROUND_F32;
MakeLibCallOptions CallOptions;
- CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64, true);
+ CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64);
SDValue Result = makeLibCall(DAG, LC, MVT::i64, Op0, CallOptions, DL).first;
Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Result);
Results.push_back(Result);
@@ -6042,17 +6040,20 @@ static MachineBasicBlock *
emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB,
const LoongArchSubtarget &Subtarget) {
unsigned InsOp;
+ unsigned BroadcastOp;
unsigned HalfSize;
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected opcode");
case LoongArch::PseudoXVINSGR2VR_B:
HalfSize = 16;
- InsOp = LoongArch::VINSGR2VR_B;
+ BroadcastOp = LoongArch::XVREPLGR2VR_B;
+ InsOp = LoongArch::XVEXTRINS_B;
break;
case LoongArch::PseudoXVINSGR2VR_H:
HalfSize = 8;
- InsOp = LoongArch::VINSGR2VR_H;
+ BroadcastOp = LoongArch::XVREPLGR2VR_H;
+ InsOp = LoongArch::XVEXTRINS_H;
break;
}
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
@@ -6066,37 +6067,41 @@ emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB,
Register Elt = MI.getOperand(2).getReg();
unsigned Idx = MI.getOperand(3).getImm();
- Register ScratchReg1 = XSrc;
- if (Idx >= HalfSize) {
- ScratchReg1 = MRI.createVirtualRegister(RC);
- BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_D), ScratchReg1)
- .addReg(XSrc)
- .addImm(14);
- }
+ if (XSrc.isVirtual() && MRI.getVRegDef(XSrc)->isImplicitDef() &&
+ Idx < HalfSize) {
+ Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC);
+ Register ScratchSubReg2 = MRI.createVirtualRegister(SubRC);
- Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC);
- Register ScratchSubReg2 = MRI.createVirtualRegister(SubRC);
- BuildMI(*BB, MI, DL, TII->get(LoongArch::COPY), ScratchSubReg1)
- .addReg(ScratchReg1, 0, LoongArch::sub_128);
- BuildMI(*BB, MI, DL, TII->get(InsOp), ScratchSubReg2)
- .addReg(ScratchSubReg1)
- .addReg(Elt)
- .addImm(Idx >= HalfSize ? Idx - HalfSize : Idx);
+ BuildMI(*BB, MI, DL, TII->get(LoongArch::COPY), ScratchSubReg1)
+ .addReg(XSrc, 0, LoongArch::sub_128);
+ BuildMI(*BB, MI, DL,
+ TII->get(HalfSize == 8 ? LoongArch::VINSGR2VR_H
+ : LoongArch::VINSGR2VR_B),
+ ScratchSubReg2)
+ .addReg(ScratchSubReg1)
+ .addReg(Elt)
+ .addImm(Idx);
+
+ BuildMI(*BB, MI, DL, TII->get(LoongArch::SUBREG_TO_REG), XDst)
+ .addImm(0)
+ .addReg(ScratchSubReg2)
+ .addImm(LoongArch::sub_128);
+ } else {
+ Register ScratchReg1 = MRI.createVirtualRegister(RC);
+ Register ScratchReg2 = MRI.createVirtualRegister(RC);
- Register ScratchReg2 = XDst;
- if (Idx >= HalfSize)
- ScratchReg2 = MRI.createVirtualRegister(RC);
+ BuildMI(*BB, MI, DL, TII->get(BroadcastOp), ScratchReg1).addReg(Elt);
- BuildMI(*BB, MI, DL, TII->get(LoongArch::SUBREG_TO_REG), ScratchReg2)
- .addImm(0)
- .addReg(ScratchSubReg2)
- .addImm(LoongArch::sub_128);
+ BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), ScratchReg2)
+ .addReg(ScratchReg1)
+ .addReg(XSrc)
+ .addImm(Idx >= HalfSize ? 48 : 18);
- if (Idx >= HalfSize)
- BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), XDst)
+ BuildMI(*BB, MI, DL, TII->get(InsOp), XDst)
.addReg(XSrc)
.addReg(ScratchReg2)
- .addImm(2);
+ .addImm((Idx >= HalfSize ? Idx - HalfSize : Idx) * 17);
+ }
MI.eraseFromParent();
return BB;
@@ -7073,7 +7078,8 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val,
static bool CC_LoongArch_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State) {
if (LocVT == MVT::i32 || LocVT == MVT::i64) {
// Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, SpLim
// s0 s1 s2 s3 s4 s5 s6 s7 s8
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index d8bb16f..0696b11 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1640,6 +1640,24 @@ defm : PairInsertExtractPatV8<v8f32, f32>;
defm : PairInsertExtractPatV4<v4i64, GRLenVT>;
defm : PairInsertExtractPatV4<v4f64, f64>;
+def : Pat<(vector_insert v8i32:$xd, (GRLenVT(vector_extract v8i32:$xj, 0)),
+ uimm3:$imm),
+ (XVINSVE0_W v8i32:$xd, v8i32:$xj, uimm3:$imm)>;
+
+def : Pat<(vector_insert v4i64:$xd, (GRLenVT(vector_extract v4i64:$xj, 0)),
+ uimm2:$imm),
+ (XVINSVE0_D v4i64:$xd, v4i64:$xj, uimm2:$imm)>;
+
+def : Pat<(vector_insert v8i32:$xd,
+ (GRLenVT(vector_extract v8i32:$xj, uimm3:$imm1)), uimm3:$imm2),
+ (XVINSVE0_W v8i32:$xd, (XVPICKVE_W v8i32:$xj, uimm3:$imm1),
+ uimm3:$imm2)>;
+
+def : Pat<(vector_insert v4i64:$xd,
+ (GRLenVT(vector_extract v4i64:$xj, uimm2:$imm1)), uimm2:$imm2),
+ (XVINSVE0_D v4i64:$xd, (XVPICKVE_D v4i64:$xj, uimm2:$imm1),
+ uimm2:$imm2)>;
+
// PseudoXVINSGR2VR_{B/H}
def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm),
(PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index ca5d27d..3b38ac9 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -143,8 +143,6 @@ static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, uint8_t *Data,
void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
const MCValue &Target, uint8_t *Data,
uint64_t Value, bool IsResolved) {
- if (IsResolved && shouldForceRelocation(Fixup, Target))
- IsResolved = false;
IsResolved = addReloc(F, Fixup, Target, Value, IsResolved);
if (!Value)
return; // Doesn't change encoding.
@@ -176,20 +174,6 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
}
}
-bool LoongArchAsmBackend::shouldForceRelocation(const MCFixup &Fixup,
- const MCValue &Target) {
- switch (Fixup.getKind()) {
- default:
- return STI.hasFeature(LoongArch::FeatureRelax);
- case FK_Data_1:
- case FK_Data_2:
- case FK_Data_4:
- case FK_Data_8:
- case FK_Data_leb128:
- return !Target.isAbsolute();
- }
-}
-
static inline std::pair<MCFixupKind, MCFixupKind>
getRelocPairForSize(unsigned Size) {
switch (Size) {
@@ -216,10 +200,19 @@ getRelocPairForSize(unsigned Size) {
// size, the fixup encodes MaxBytesToEmit in the higher bits and references a
// per-section marker symbol.
bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
+ // Alignments before the first linker-relaxable instruction have fixed sizes
+ // and do not require relocations. Alignments after a linker-relaxable
+ // instruction require a relocation, even if the STI specifies norelax.
+ //
+ // firstLinkerRelaxable is the layout order within the subsection, which may
+ // be smaller than the section's order. Therefore, alignments in a
+ // lower-numbered subsection may be unnecessarily treated as linker-relaxable.
+ auto *Sec = F.getParent();
+ if (F.getLayoutOrder() <= Sec->firstLinkerRelaxable())
+ return false;
+
// Use default handling unless linker relaxation is enabled and the
// MaxBytesToEmit >= the nop size.
- if (!F.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax))
- return false;
const unsigned MinNopLen = 4;
unsigned MaxBytesToEmit = F.getAlignMaxBytesToEmit();
if (MaxBytesToEmit < MinNopLen)
@@ -254,8 +247,6 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN);
F.setVarFixups({Fixup});
F.setLinkerRelaxable();
- if (!F.getParent()->isLinkerRelaxable())
- F.getParent()->setFirstLinkerRelaxable(F.getLayoutOrder());
return true;
}
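A minimal stand-alone model of the new gate at the top of relaxAlign(): alignment fragments laid out at or before the section's first linker-relaxable fragment keep the default fixed-size handling, and alignments whose MaxBytesToEmit cannot fit a nop also opt out. The struct and the "needs R_LARCH_ALIGN" framing are my own simplification, not the MC API.

#include <cstdio>

struct AlignFragModel {
  unsigned LayoutOrder;     // layout order within the (sub)section
  unsigned MaxBytesToEmit;  // maximum padding the align may emit
};

// Mirrors the checks above: only alignments that follow a linker-relaxable
// instruction, and that can be padded with at least one 4-byte nop, are
// turned into relaxable fragments carrying an alignment relocation.
bool keepsRelaxableAlign(const AlignFragModel &F,
                         unsigned FirstLinkerRelaxable) {
  if (F.LayoutOrder <= FirstLinkerRelaxable)
    return false;                 // fixed size, no relocation needed
  const unsigned MinNopLen = 4;   // LoongArch instructions are 4 bytes
  return F.MaxBytesToEmit >= MinNopLen;
}

int main() {
  std::printf("%d\n", keepsRelaxableAlign({5, 8}, 3)); // 1: after relaxable insn
  std::printf("%d\n", keepsRelaxableAlign({2, 8}, 3)); // 0: before it
}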
@@ -448,10 +439,10 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
isPCRelFixupResolved(Target.getSubSym(), F))
return Fallback();
- // In SecA == SecB case. If the linker relaxation is disabled, the
+ // In SecA == SecB case. If the section is not linker-relaxable, the
// FixedValue has already been calculated out in evaluateFixup,
// return true and avoid record relocations.
- if (&SecA == &SecB && !STI.hasFeature(LoongArch::FeatureRelax))
+ if (&SecA == &SecB && !SecA.isLinkerRelaxable())
return true;
}
@@ -484,9 +475,16 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
return false;
}
- IsResolved = Fallback();
// If linker relaxation is enabled and supported by the current relocation,
- // append a RELAX relocation.
+ // generate a relocation and then append a RELAX relocation.
+ if (Fixup.isLinkerRelaxable())
+ IsResolved = false;
+ if (IsResolved && Fixup.isPCRel())
+ IsResolved = isPCRelFixupResolved(Target.getAddSym(), F);
+
+ if (!IsResolved)
+ Asm->getWriter().recordRelocation(F, Fixup, Target, FixedValue);
+
if (Fixup.isLinkerRelaxable()) {
auto FA = MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_LARCH_RELAX);
Asm->getWriter().recordRelocation(F, FA, MCValue::get(nullptr),
@@ -498,8 +496,7 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
std::unique_ptr<MCObjectTargetWriter>
LoongArchAsmBackend::createObjectTargetWriter() const {
- return createLoongArchELFObjectWriter(
- OSABI, Is64Bit, STI.hasFeature(LoongArch::FeatureRelax));
+ return createLoongArchELFObjectWriter(OSABI, Is64Bit);
}
MCAsmBackend *llvm::createLoongArchAsmBackend(const Target &T,
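The reworked addReloc() path above records the ordinary relocation itself whenever the fixup stays unresolved, and for linker-relaxable fixups it also records a RELAX relocation at the same offset. A hedged stand-alone sketch of that pairing follows; the Reloc struct and the numeric relocation types are placeholders, not LLVM's MC types.

#include <cstdint>
#include <cstdio>
#include <vector>

struct RelocModel { uint64_t Offset; unsigned Type; };

// Placeholder ELF relocation numbers for the sketch only.
constexpr unsigned kSomeLoongArchReloc = 66;
constexpr unsigned kRelaxMarker = 100;

// A linker-relaxable fixup is never treated as resolved: it always produces
// its own relocation plus a RELAX marker at the same offset.
void recordFixup(std::vector<RelocModel> &Out, uint64_t Offset, unsigned Type,
                 bool LinkerRelaxable, bool Resolved) {
  if (LinkerRelaxable)
    Resolved = false;
  if (!Resolved)
    Out.push_back({Offset, Type});
  if (LinkerRelaxable)
    Out.push_back({Offset, kRelaxMarker});
}

int main() {
  std::vector<RelocModel> Relocs;
  recordFixup(Relocs, 0x10, kSomeLoongArchReloc, /*LinkerRelaxable=*/true,
              /*Resolved=*/true);
  for (const RelocModel &R : Relocs)  // prints two entries at offset 0x10
    std::printf("offset=%#llx type=%u\n", (unsigned long long)R.Offset, R.Type);
}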
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
index 1f13601..f79d3aa 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
@@ -44,8 +44,6 @@ public:
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
uint8_t *Data, uint64_t Value, bool IsResolved) override;
- bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target);
-
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
index 7e021e4..7d54565 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
@@ -21,26 +21,23 @@ using namespace llvm;
namespace {
class LoongArchELFObjectWriter : public MCELFObjectTargetWriter {
public:
- LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool EnableRelax);
+ LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit);
~LoongArchELFObjectWriter() override;
bool needsRelocateWithSymbol(const MCValue &, unsigned Type) const override {
- return EnableRelax;
+ return true;
}
protected:
unsigned getRelocType(const MCFixup &, const MCValue &,
bool IsPCRel) const override;
- bool EnableRelax;
};
} // end namespace
-LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit,
- bool EnableRelax)
+LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit)
: MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_LOONGARCH,
- /*HasRelocationAddend=*/true),
- EnableRelax(EnableRelax) {}
+ /*HasRelocationAddend=*/true) {}
LoongArchELFObjectWriter::~LoongArchELFObjectWriter() {}
@@ -103,6 +100,6 @@ unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup,
}
std::unique_ptr<MCObjectTargetWriter>
-llvm::createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool Relax) {
- return std::make_unique<LoongArchELFObjectWriter>(OSABI, Is64Bit, Relax);
+llvm::createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) {
+ return std::make_unique<LoongArchELFObjectWriter>(OSABI, Is64Bit);
}
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h
index bb05baa..ab35a00 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h
@@ -36,7 +36,7 @@ MCAsmBackend *createLoongArchAsmBackend(const Target &T,
const MCTargetOptions &Options);
std::unique_ptr<MCObjectTargetWriter>
-createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool Relax);
+createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit);
} // end namespace llvm
diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp
index 594ea9f..12c6e1e 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp
@@ -51,7 +51,9 @@ M68kTargetLowering::M68kTargetLowering(const M68kTargetMachine &TM,
MVT PtrVT = MVT::i32;
- setBooleanContents(ZeroOrOneBooleanContent);
+ // This is based on M68k SetCC (scc) setting the destination byte to all 1s.
+ // See also getSetCCResultType().
+ setBooleanContents(ZeroOrNegativeOneBooleanContent);
auto *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
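The comment above motivates the switch to ZeroOrNegativeOneBooleanContent: the Scc family writes 0xFF into the destination byte when the condition holds. A tiny illustration of why such a byte reads back as -1 (hypothetical helper, not backend code):

#include <cstdint>
#include <cstdio>

// Scc-style result: all ones on "true", all zeros on "false".
int8_t sccResult(bool Cond) { return Cond ? int8_t(0xFF) : int8_t(0); }

int main() {
  std::printf("%d %d\n", sccResult(true), sccResult(false)); // -1 0
}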
@@ -1454,10 +1456,7 @@ SDValue M68kTargetLowering::getTLSGetAddr(GlobalAddressSDNode *GA,
PointerType *PtrTy = PointerType::get(*DAG.getContext(), 0);
ArgListTy Args;
- ArgListEntry Entry;
- Entry.Node = Arg;
- Entry.Ty = PtrTy;
- Args.push_back(Entry);
+ Args.emplace_back(Arg, PtrTy);
return LowerExternalSymbolCall(DAG, SDLoc(GA), "__tls_get_addr",
std::move(Args));
}
diff --git a/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/llvm/lib/Target/M68k/M68kInstrArithmetic.td
index e2d4e49..56b71db 100644
--- a/llvm/lib/Target/M68k/M68kInstrArithmetic.td
+++ b/llvm/lib/Target/M68k/M68kInstrArithmetic.td
@@ -835,7 +835,7 @@ def : Pat<(MxSub 0, i8 :$src), (NEG8d MxDRD8 :$src)>;
def : Pat<(MxSub 0, i16:$src), (NEG16d MxDRD16:$src)>;
def : Pat<(MxSub 0, i32:$src), (NEG32d MxDRD32:$src)>;
// SExt of i1 values.
-// Although we specify `ZeroOrOneBooleanContent` for boolean content,
+// Although we specify `ZeroOrNegativeOneBooleanContent` for boolean content,
// we're still adding an AND here as we don't know the origin of the i1 value.
def : Pat<(sext_inreg i8:$src, i1), (NEG8d (AND8di MxDRD8:$src, 1))>;
def : Pat<(sext_inreg i16:$src, i1), (NEG16d (AND16di MxDRD16:$src, 1))>;
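As the comment notes, the origin of the i1 is unknown, so the pattern masks to bit 0 before negating. A quick scalar check that neg(x & 1) really is the i1 sign-extension (plain C++, not backend code):

#include <cstdint>
#include <cstdio>

int8_t sextInregI1(uint8_t X) {
  return static_cast<int8_t>(-(X & 1)); // 0 -> 0x00, 1 -> 0xFF
}

int main() {
  std::printf("%d %d\n", sextInregI1(0x7E), sextInregI1(0x7F)); // 0 -1
}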
diff --git a/llvm/lib/Target/M68k/M68kInstrData.td b/llvm/lib/Target/M68k/M68kInstrData.td
index f4ed627..c5b7ae3 100644
--- a/llvm/lib/Target/M68k/M68kInstrData.td
+++ b/llvm/lib/Target/M68k/M68kInstrData.td
@@ -701,18 +701,22 @@ def: Pat<(MxExtLoadi16i8 MxCP_ARID:$src),
(EXTRACT_SUBREG (MOVZXd32p8 MxARID8:$src), MxSubRegIndex16Lo)>;
def: Pat<(MxExtLoadi16i8 MxCP_ARII:$src),
(EXTRACT_SUBREG (MOVZXd32f8 MxARII8:$src), MxSubRegIndex16Lo)>;
+def: Pat<(MxExtLoadi16i8 MxCP_PCD:$src),
+ (EXTRACT_SUBREG (MOVZXd32q8 MxPCD8:$src), MxSubRegIndex16Lo)>;
// i32 <- anyext i8
def: Pat<(i32 (anyext i8:$src)), (MOVZXd32d8 MxDRD8:$src)>;
def: Pat<(MxExtLoadi32i8 MxCP_ARI :$src), (MOVZXd32j8 MxARI8 :$src)>;
def: Pat<(MxExtLoadi32i8 MxCP_ARID:$src), (MOVZXd32p8 MxARID8:$src)>;
def: Pat<(MxExtLoadi32i8 MxCP_ARII:$src), (MOVZXd32f8 MxARII8:$src)>;
+def: Pat<(MxExtLoadi32i8 MxCP_PCD:$src), (MOVZXd32q8 MxPCD8:$src)>;
// i32 <- anyext i16
def: Pat<(i32 (anyext i16:$src)), (MOVZXd32d16 MxDRD16:$src)>;
def: Pat<(MxExtLoadi32i16 MxCP_ARI :$src), (MOVZXd32j16 MxARI16 :$src)>;
def: Pat<(MxExtLoadi32i16 MxCP_ARID:$src), (MOVZXd32p16 MxARID16:$src)>;
def: Pat<(MxExtLoadi32i16 MxCP_ARII:$src), (MOVZXd32f16 MxARII16:$src)>;
+def: Pat<(MxExtLoadi32i16 MxCP_PCD:$src), (MOVZXd32q16 MxPCD16:$src)>;
// trunc patterns
def : Pat<(i16 (trunc i32:$src)),
diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index d23504c..6da5e66 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -377,6 +377,7 @@ static void AnalyzeArguments(CCState &State,
for (unsigned i = 0, e = ArgsParts.size(); i != e; i++) {
MVT ArgVT = Args[ValNo].VT;
ISD::ArgFlagsTy ArgFlags = Args[ValNo].Flags;
+ Type *OrigTy = Args[ValNo].OrigTy;
MVT LocVT = ArgVT;
CCValAssign::LocInfo LocInfo = CCValAssign::Full;
@@ -411,7 +412,8 @@ static void AnalyzeArguments(CCState &State,
RegsLeft -= 1;
UsedStack = true;
- CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
+ CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, OrigTy,
+ State);
} else if (Parts <= RegsLeft) {
for (unsigned j = 0; j < Parts; j++) {
MCRegister Reg = State.AllocateReg(RegList);
@@ -421,7 +423,8 @@ static void AnalyzeArguments(CCState &State,
} else {
UsedStack = true;
for (unsigned j = 0; j < Parts; j++)
- CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
+ CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, OrigTy,
+ State);
}
}
}
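The MSP430 change belongs to the same series: calling-convention assign hooks now receive the argument's original IR type (OrigTy) next to the lowered MVTs. A hedged sketch of that hook shape with stand-in types (none of these structs are the real LLVM classes); returning false means "handled", following the CCAssignFn convention.

#include <cstdio>

struct MVTModel { int SimpleTy; };
struct ArgFlagsModel { bool IsSRet; };
struct TypeModel { bool IsFP128; bool IsFloat; };
struct CCStateModel { unsigned StackSize = 0; };

// New-style hook: OrigTy exposes the pre-legalization type, so a target can
// special-case e.g. an f128 that was lowered to integer pieces without extra
// bookkeeping on a CCState subclass.
bool assignStackExample(unsigned ValNo, MVTModel ValVT, MVTModel LocVT,
                        ArgFlagsModel Flags, TypeModel *OrigTy,
                        CCStateModel &State) {
  (void)ValNo; (void)ValVT; (void)LocVT; (void)Flags; // unused in this sketch
  State.StackSize += (OrigTy && OrigTy->IsFP128) ? 16u : 8u;
  return false;
}

int main() {
  CCStateModel State;
  TypeModel F128{true, false};
  assignStackExample(0, {0}, {0}, {false}, &F128, State);
  std::printf("stack=%u\n", State.StackSize); // 16
}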
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 16247bd..680d279 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -17,7 +17,6 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index feeadc5e..a8b7c9e 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -18,7 +18,6 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/Casting.h"
using namespace llvm;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 5df70c4..1e1b970 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -26,7 +26,6 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 4530fc6..ae91c97 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -51,7 +51,6 @@
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
diff --git a/llvm/lib/Target/Mips/MipsCCState.cpp b/llvm/lib/Target/Mips/MipsCCState.cpp
index 13237c5..86bb3e6 100644
--- a/llvm/lib/Target/Mips/MipsCCState.cpp
+++ b/llvm/lib/Target/Mips/MipsCCState.cpp
@@ -12,59 +12,6 @@
using namespace llvm;
-bool MipsCCState::isF128SoftLibCall(const char *CallSym) {
- const char *const LibCalls[] = {
- "__addtf3", "__divtf3", "__eqtf2", "__extenddftf2",
- "__extendsftf2", "__fixtfdi", "__fixtfsi", "__fixtfti",
- "__fixunstfdi", "__fixunstfsi", "__fixunstfti", "__floatditf",
- "__floatsitf", "__floattitf", "__floatunditf", "__floatunsitf",
- "__floatuntitf", "__getf2", "__gttf2", "__letf2",
- "__lttf2", "__multf3", "__netf2", "__powitf2",
- "__subtf3", "__trunctfdf2", "__trunctfsf2", "__unordtf2",
- "ceill", "copysignl", "cosl", "exp2l",
- "expl", "floorl", "fmal", "fmaxl",
- "fmodl", "log10l", "log2l", "logl",
- "nearbyintl", "powl", "rintl", "roundl",
- "sinl", "sqrtl", "truncl"};
-
- // Check that LibCalls is sorted alphabetically.
- auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; };
- assert(llvm::is_sorted(LibCalls, Comp));
- return llvm::binary_search(LibCalls, CallSym, Comp);
-}
-
-/// This function returns true if Ty is fp128, {f128} or i128 which was
-/// originally a fp128.
-bool MipsCCState::originalTypeIsF128(const Type *Ty, const char *Func) {
- if (Ty->isFP128Ty())
- return true;
-
- if (Ty->isStructTy() && Ty->getStructNumElements() == 1 &&
- Ty->getStructElementType(0)->isFP128Ty())
- return true;
-
- // If the Ty is i128 and the function being called is a long double emulation
- // routine, then the original type is f128.
- // FIXME: This is unsound because these functions could be indirectly called
- return (Func && Ty->isIntegerTy(128) && isF128SoftLibCall(Func));
-}
-
-/// Return true if the original type was vXfXX.
-bool MipsCCState::originalEVTTypeIsVectorFloat(EVT Ty) {
- if (Ty.isVector() && Ty.getVectorElementType().isFloatingPoint())
- return true;
-
- return false;
-}
-
-/// Return true if the original type was vXfXX / vXfXX.
-bool MipsCCState::originalTypeIsVectorFloat(const Type *Ty) {
- if (Ty->isVectorTy() && Ty->isFPOrFPVectorTy())
- return true;
-
- return false;
-}
-
MipsCCState::SpecialCallingConvType
MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee,
const MipsSubtarget &Subtarget) {
@@ -81,123 +28,3 @@ MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee,
}
return SpecialCallingConv;
}
-
-void MipsCCState::PreAnalyzeCallResultForF128(
- const SmallVectorImpl<ISD::InputArg> &Ins,
- const Type *RetTy, const char *Call) {
- for (unsigned i = 0; i < Ins.size(); ++i) {
- OriginalArgWasF128.push_back(
- originalTypeIsF128(RetTy, Call));
- OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy());
- }
-}
-
-/// Identify lowered values that originated from f128 or float arguments and
-/// record this for use by RetCC_MipsN.
-void MipsCCState::PreAnalyzeCallReturnForF128(
- const SmallVectorImpl<ISD::OutputArg> &Outs, const Type *RetTy) {
- for (unsigned i = 0; i < Outs.size(); ++i) {
- OriginalArgWasF128.push_back(
- originalTypeIsF128(RetTy, nullptr));
- OriginalArgWasFloat.push_back(
- RetTy->isFloatingPointTy());
- }
-}
-
-/// Identify lower values that originated from vXfXX and record
-/// this.
-void MipsCCState::PreAnalyzeCallResultForVectorFloat(
- const SmallVectorImpl<ISD::InputArg> &Ins, const Type *RetTy) {
- for (unsigned i = 0; i < Ins.size(); ++i) {
- OriginalRetWasFloatVector.push_back(originalTypeIsVectorFloat(RetTy));
- }
-}
-
-/// Identify lowered values that originated from vXfXX arguments and record
-/// this.
-void MipsCCState::PreAnalyzeReturnForVectorFloat(
- const SmallVectorImpl<ISD::OutputArg> &Outs) {
- for (unsigned i = 0; i < Outs.size(); ++i) {
- ISD::OutputArg Out = Outs[i];
- OriginalRetWasFloatVector.push_back(
- originalEVTTypeIsVectorFloat(Out.ArgVT));
- }
-}
-
-void MipsCCState::PreAnalyzeReturnValue(EVT ArgVT) {
- OriginalRetWasFloatVector.push_back(originalEVTTypeIsVectorFloat(ArgVT));
-}
-
-void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, const char *Func) {
- OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, Func));
- OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy());
- OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy());
-}
-
-/// Identify lowered values that originated from f128, float and sret to vXfXX
-/// arguments and record this.
-void MipsCCState::PreAnalyzeCallOperands(
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- std::vector<TargetLowering::ArgListEntry> &FuncArgs,
- const char *Func) {
- for (unsigned i = 0; i < Outs.size(); ++i) {
- TargetLowering::ArgListEntry FuncArg = FuncArgs[Outs[i].OrigArgIndex];
-
- OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.Ty, Func));
- OriginalArgWasFloat.push_back(FuncArg.Ty->isFloatingPointTy());
- OriginalArgWasFloatVector.push_back(FuncArg.Ty->isVectorTy());
- }
-}
-
-void MipsCCState::PreAnalyzeFormalArgument(const Type *ArgTy,
- ISD::ArgFlagsTy Flags) {
- // SRet arguments cannot originate from f128 or {f128} returns so we just
- // push false. We have to handle this specially since SRet arguments
- // aren't mapped to an original argument.
- if (Flags.isSRet()) {
- OriginalArgWasF128.push_back(false);
- OriginalArgWasFloat.push_back(false);
- OriginalArgWasFloatVector.push_back(false);
- return;
- }
-
- OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, nullptr));
- OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy());
-
- // The MIPS vector ABI exhibits a corner case of sorts or quirk; if the
- // first argument is actually an SRet pointer to a vector, then the next
- // argument slot is $a2.
- OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy());
-}
-
-/// Identify lowered values that originated from f128, float and vXfXX arguments
-/// and record this.
-void MipsCCState::PreAnalyzeFormalArgumentsForF128(
- const SmallVectorImpl<ISD::InputArg> &Ins) {
- const MachineFunction &MF = getMachineFunction();
- for (unsigned i = 0; i < Ins.size(); ++i) {
- Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
-
- // SRet arguments cannot originate from f128 or {f128} returns so we just
- // push false. We have to handle this specially since SRet arguments
- // aren't mapped to an original argument.
- if (Ins[i].Flags.isSRet()) {
- OriginalArgWasF128.push_back(false);
- OriginalArgWasFloat.push_back(false);
- OriginalArgWasFloatVector.push_back(false);
- continue;
- }
-
- assert(Ins[i].getOrigArgIndex() < MF.getFunction().arg_size());
- std::advance(FuncArg, Ins[i].getOrigArgIndex());
-
- OriginalArgWasF128.push_back(
- originalTypeIsF128(FuncArg->getType(), nullptr));
- OriginalArgWasFloat.push_back(FuncArg->getType()->isFloatingPointTy());
-
- // The MIPS vector ABI exhibits a corner case of sorts or quirk; if the
- // first argument is actually an SRet pointer to a vector, then the next
- // argument slot is $a2.
- OriginalArgWasFloatVector.push_back(FuncArg->getType()->isVectorTy());
- }
-}
diff --git a/llvm/lib/Target/Mips/MipsCCState.h b/llvm/lib/Target/Mips/MipsCCState.h
index 30b68e8..4c36d42 100644
--- a/llvm/lib/Target/Mips/MipsCCState.h
+++ b/llvm/lib/Target/Mips/MipsCCState.h
@@ -26,66 +26,7 @@ public:
getSpecialCallingConvForCallee(const SDNode *Callee,
const MipsSubtarget &Subtarget);
- /// This function returns true if CallSym is a long double emulation routine.
- ///
- /// FIXME: Changing the ABI based on the callee name is unsound. The lib func
- /// address could be captured.
- static bool isF128SoftLibCall(const char *CallSym);
-
- static bool originalTypeIsF128(const Type *Ty, const char *Func);
- static bool originalEVTTypeIsVectorFloat(EVT Ty);
- static bool originalTypeIsVectorFloat(const Type *Ty);
-
- void PreAnalyzeCallOperand(const Type *ArgTy, const char *Func);
-
- void PreAnalyzeFormalArgument(const Type *ArgTy, ISD::ArgFlagsTy Flags);
- void PreAnalyzeReturnValue(EVT ArgVT);
-
private:
- /// Identify lowered values that originated from f128 arguments and record
- /// this for use by RetCC_MipsN.
- void PreAnalyzeCallResultForF128(const SmallVectorImpl<ISD::InputArg> &Ins,
- const Type *RetTy, const char * Func);
-
- /// Identify lowered values that originated from f128 arguments and record
- /// this for use by RetCC_MipsN.
- void PreAnalyzeCallReturnForF128(const SmallVectorImpl<ISD::OutputArg> &Outs, const Type *RetTy);
-
- /// Identify lowered values that originated from f128 arguments and record
- /// this.
- void
- PreAnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
- std::vector<TargetLowering::ArgListEntry> &FuncArgs,
- const char *Func);
-
- /// Identify lowered values that originated from f128 arguments and record
- /// this for use by RetCC_MipsN.
- void
- PreAnalyzeFormalArgumentsForF128(const SmallVectorImpl<ISD::InputArg> &Ins);
-
- void
- PreAnalyzeCallResultForVectorFloat(const SmallVectorImpl<ISD::InputArg> &Ins,
- const Type *RetTy);
-
- void PreAnalyzeFormalArgumentsForVectorFloat(
- const SmallVectorImpl<ISD::InputArg> &Ins);
-
- void
- PreAnalyzeReturnForVectorFloat(const SmallVectorImpl<ISD::OutputArg> &Outs);
-
- /// Records whether the value has been lowered from an f128.
- SmallVector<bool, 4> OriginalArgWasF128;
-
- /// Records whether the value has been lowered from float.
- SmallVector<bool, 4> OriginalArgWasFloat;
-
- /// Records whether the value has been lowered from a floating point vector.
- SmallVector<bool, 4> OriginalArgWasFloatVector;
-
- /// Records whether the return value has been lowered from a floating point
- /// vector.
- SmallVector<bool, 4> OriginalRetWasFloatVector;
-
// Used to handle MIPS16-specific calling convention tweaks.
// FIXME: This should probably be a fully fledged calling convention.
SpecialCallingConvType SpecialCallingConv;
@@ -96,118 +37,6 @@ public:
SpecialCallingConvType SpecialCC = NoSpecialCallingConv)
: CCState(CC, isVarArg, MF, locs, C), SpecialCallingConv(SpecialCC) {}
- void PreAnalyzeCallOperands(
- const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn,
- std::vector<TargetLowering::ArgListEntry> &FuncArgs, const char *Func) {
- OriginalArgWasF128.clear();
- OriginalArgWasFloat.clear();
- OriginalArgWasFloatVector.clear();
- PreAnalyzeCallOperands(Outs, FuncArgs, Func);
- }
-
- void
- AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
- CCAssignFn Fn,
- std::vector<TargetLowering::ArgListEntry> &FuncArgs,
- const char *Func) {
- PreAnalyzeCallOperands(Outs, Fn, FuncArgs, Func);
- CCState::AnalyzeCallOperands(Outs, Fn);
- }
-
- // The AnalyzeCallOperands in the base class is not usable since we must
- // provide a means of accessing ArgListEntry::IsFixed. Delete them from this
- // class. This doesn't stop them being used via the base class though.
- void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
- CCAssignFn Fn) = delete;
- void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
- SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
- CCAssignFn Fn) = delete;
-
- void PreAnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
- CCAssignFn Fn) {
- OriginalArgWasFloat.clear();
- OriginalArgWasF128.clear();
- OriginalArgWasFloatVector.clear();
- PreAnalyzeFormalArgumentsForF128(Ins);
- }
-
- void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
- CCAssignFn Fn) {
- PreAnalyzeFormalArguments(Ins, Fn);
- CCState::AnalyzeFormalArguments(Ins, Fn);
- }
-
- void PreAnalyzeCallResult(const Type *RetTy, const char *Func) {
- OriginalArgWasF128.push_back(originalTypeIsF128(RetTy, Func));
- OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy());
- OriginalRetWasFloatVector.push_back(originalTypeIsVectorFloat(RetTy));
- }
-
- void PreAnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
- CCAssignFn Fn, const Type *RetTy,
- const char *Func) {
- OriginalArgWasFloat.clear();
- OriginalArgWasF128.clear();
- OriginalArgWasFloatVector.clear();
- PreAnalyzeCallResultForF128(Ins, RetTy, Func);
- PreAnalyzeCallResultForVectorFloat(Ins, RetTy);
- }
-
- void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
- CCAssignFn Fn, const Type *RetTy,
- const char *Func) {
- PreAnalyzeCallResult(Ins, Fn, RetTy, Func);
- CCState::AnalyzeCallResult(Ins, Fn);
- }
-
- void PreAnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
- CCAssignFn Fn) {
- const MachineFunction &MF = getMachineFunction();
- OriginalArgWasFloat.clear();
- OriginalArgWasF128.clear();
- OriginalArgWasFloatVector.clear();
- PreAnalyzeCallReturnForF128(Outs, MF.getFunction().getReturnType());
- PreAnalyzeReturnForVectorFloat(Outs);
- }
-
- void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
- CCAssignFn Fn) {
- PreAnalyzeReturn(Outs, Fn);
- CCState::AnalyzeReturn(Outs, Fn);
- }
-
- bool CheckReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
- CCAssignFn Fn) {
- const MachineFunction &MF = getMachineFunction();
- PreAnalyzeCallReturnForF128(ArgsFlags, MF.getFunction().getReturnType());
- PreAnalyzeReturnForVectorFloat(ArgsFlags);
- bool Return = CCState::CheckReturn(ArgsFlags, Fn);
- OriginalArgWasFloat.clear();
- OriginalArgWasF128.clear();
- OriginalArgWasFloatVector.clear();
- return Return;
- }
-
- bool CheckCallReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
- CCAssignFn Fn, const Type *RetTy) {
- PreAnalyzeCallReturnForF128(ArgsFlags, RetTy);
- PreAnalyzeReturnForVectorFloat(ArgsFlags);
- bool Return = CCState::CheckReturn(ArgsFlags, Fn);
- OriginalArgWasFloat.clear();
- OriginalArgWasF128.clear();
- OriginalArgWasFloatVector.clear();
- return Return;
- }
- bool WasOriginalArgF128(unsigned ValNo) { return OriginalArgWasF128[ValNo]; }
- bool WasOriginalArgFloat(unsigned ValNo) {
- return OriginalArgWasFloat[ValNo];
- }
- bool WasOriginalArgVectorFloat(unsigned ValNo) const {
- return OriginalArgWasFloatVector[ValNo];
- }
- bool WasOriginalRetVectorFloat(unsigned ValNo) const {
- return OriginalRetWasFloatVector[ValNo];
- }
SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; }
};
}
diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp
index fa49108..35194e7 100644
--- a/llvm/lib/Target/Mips/MipsCallLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp
@@ -26,62 +26,6 @@ MipsCallLowering::MipsCallLowering(const MipsTargetLowering &TLI)
: CallLowering(&TLI) {}
namespace {
-struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner {
- /// This is the name of the function being called
- /// FIXME: Relying on this is unsound
- const char *Func = nullptr;
-
- /// Is this a return value, or an outgoing call operand.
- bool IsReturn;
-
- MipsOutgoingValueAssigner(CCAssignFn *AssignFn_, const char *Func,
- bool IsReturn)
- : OutgoingValueAssigner(AssignFn_), Func(Func), IsReturn(IsReturn) {}
-
- bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
- CCState &State_) override {
- MipsCCState &State = static_cast<MipsCCState &>(State_);
-
- if (IsReturn)
- State.PreAnalyzeReturnValue(EVT::getEVT(Info.Ty));
- else
- State.PreAnalyzeCallOperand(Info.Ty, Func);
-
- return CallLowering::OutgoingValueAssigner::assignArg(
- ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State);
- }
-};
-
-struct MipsIncomingValueAssigner : public CallLowering::IncomingValueAssigner {
- /// This is the name of the function being called
- /// FIXME: Relying on this is unsound
- const char *Func = nullptr;
-
- /// Is this a call return value, or an incoming function argument.
- bool IsReturn;
-
- MipsIncomingValueAssigner(CCAssignFn *AssignFn_, const char *Func,
- bool IsReturn)
- : IncomingValueAssigner(AssignFn_), Func(Func), IsReturn(IsReturn) {}
-
- bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
- CCState &State_) override {
- MipsCCState &State = static_cast<MipsCCState &>(State_);
-
- if (IsReturn)
- State.PreAnalyzeCallResult(Info.Ty, Func);
- else
- State.PreAnalyzeFormalArgument(Info.Ty, Flags);
-
- return CallLowering::IncomingValueAssigner::assignArg(
- ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State);
- }
-};
-
class MipsIncomingValueHandler : public CallLowering::IncomingValueHandler {
const MipsSubtarget &STI;
@@ -339,9 +283,7 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
F.getContext());
MipsOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret);
- std::string FuncName = F.getName().str();
- MipsOutgoingValueAssigner Assigner(TLI.CCAssignFnForReturn(),
- FuncName.c_str(), /*IsReturn*/ true);
+ OutgoingValueAssigner Assigner(TLI.CCAssignFnForReturn());
if (!determineAssignments(Assigner, RetInfos, CCInfo))
return false;
@@ -392,9 +334,7 @@ bool MipsCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(F.getCallingConv()),
Align(1));
- const std::string FuncName = F.getName().str();
- MipsIncomingValueAssigner Assigner(TLI.CCAssignFnForCall(), FuncName.c_str(),
- /*IsReturn*/ false);
+ IncomingValueAssigner Assigner(TLI.CCAssignFnForCall());
if (!determineAssignments(Assigner, ArgInfos, CCInfo))
return false;
@@ -510,11 +450,7 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(Info.CallConv),
Align(1));
- const char *Call =
- Info.Callee.isSymbol() ? Info.Callee.getSymbolName() : nullptr;
-
- MipsOutgoingValueAssigner Assigner(TLI.CCAssignFnForCall(), Call,
- /*IsReturn*/ false);
+ OutgoingValueAssigner Assigner(TLI.CCAssignFnForCall());
if (!determineAssignments(Assigner, ArgInfos, CCInfo))
return false;
@@ -550,11 +486,8 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLowering::splitToValueTypes(Info.OrigRet, ArgInfos, DL,
F.getCallingConv());
- const std::string FuncName = F.getName().str();
SmallVector<CCValAssign, 8> ArgLocs;
- MipsIncomingValueAssigner Assigner(TLI.CCAssignFnForReturn(),
- FuncName.c_str(),
- /*IsReturn*/ true);
+ IncomingValueAssigner Assigner(TLI.CCAssignFnForReturn());
CallReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), MIB);
MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td
index 0e5c16c..3501f9fb 100644
--- a/llvm/lib/Target/Mips/MipsCallingConv.td
+++ b/llvm/lib/Target/Mips/MipsCallingConv.td
@@ -20,19 +20,15 @@ class CCIfSubtargetNot<string F, CCAction A> : CCIfSubtarget<F, A, "!">;
/// Match if the original argument (before lowering) was a float.
/// For example, this is true for i32's that were lowered from soft-float.
-class CCIfOrigArgWasFloat<CCAction A>
- : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgFloat(ValNo)",
- A>;
+class CCIfOrigArgWasFloat<CCAction A> : CCIf<"OrigTy->isFloatingPointTy()", A>;
/// Match if the original argument (before lowering) was a 128-bit float (i.e.
/// long double).
-class CCIfOrigArgWasF128<CCAction A>
- : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)", A>;
+class CCIfOrigArgWasF128<CCAction A> : CCIf<"OrigTy->isFP128Ty()", A>;
-/// Match if the return was a floating point vector.
+/// Match if the return was not a floating point vector.
class CCIfOrigArgWasNotVectorFloat<CCAction A>
- : CCIf<"!static_cast<MipsCCState *>(&State)"
- "->WasOriginalRetVectorFloat(ValNo)", A>;
+ : CCIf<"!OrigTy->isVectorTy() || !OrigTy->isFPOrFPVectorTy()", A>;
/// Match if the special calling conv is the specified value.
class CCIfSpecialCallingConv<string CC, CCAction A>
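With OrigTy threaded into the generated calling-convention code, the CCIf predicates above can query the original type directly instead of the per-value flags MipsCCState used to record. A small model of the three predicates with a stand-in type descriptor (not llvm::Type):

#include <cstdio>

struct TypeModel { bool IsFloat, IsFP128, IsVector, IsFPVector; };

bool origArgWasFloat(const TypeModel &T) { return T.IsFloat; }
bool origArgWasF128(const TypeModel &T) { return T.IsFP128; }
// "Not a floating-point vector": anything that is not a vector of FP elements.
bool origRetWasNotVectorFloat(const TypeModel &T) {
  return !T.IsVector || !T.IsFPVector;
}

int main() {
  TypeModel V4F32{false, false, true, true};
  TypeModel V4I32{false, false, true, false};
  TypeModel F128{false, true, false, false};
  std::printf("%d %d %d\n", origRetWasNotVectorFloat(V4F32),
              origRetWasNotVectorFloat(V4I32), origArgWasF128(F128)); // 0 1 1
}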
diff --git a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
index 8067dbc..2a2ccf7 100644
--- a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -232,7 +232,7 @@ namespace {
/// NewWaterList - The subset of WaterList that was created since the
/// previous iteration by inserting unconditional branches.
- SmallSet<MachineBasicBlock*, 4> NewWaterList;
+ SmallPtrSet<MachineBasicBlock *, 4> NewWaterList;
using water_iterator = std::vector<MachineBasicBlock *>::iterator;
diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp
index f3812d1..1ce8d7e3 100644
--- a/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -266,17 +266,19 @@ public:
static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State) LLVM_ATTRIBUTE_UNUSED;
+ Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED;
static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State) {
llvm_unreachable("should not be called");
}
static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State) {
llvm_unreachable("should not be called");
}
@@ -1144,8 +1146,12 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
unsigned &NumBytes) {
CallingConv::ID CC = CLI.CallConv;
SmallVector<CCValAssign, 16> ArgLocs;
+ SmallVector<Type *, 16> ArgTys;
+ for (const ArgListEntry &Arg : CLI.Args)
+ ArgTys.push_back(Arg.Val->getType());
CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context);
- CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC));
+ CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, ArgTys,
+ CCAssignFnForCall(CC));
// Get a count of how many bytes are to be pushed on the stack.
NumBytes = CCInfo.getStackSize();
// This is the minimum argument area used for A0-A3.
@@ -1287,9 +1293,7 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
SmallVector<CCValAssign, 16> RVLocs;
MipsCCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
- CCInfo.AnalyzeCallResult(CLI.Ins, RetCC_Mips, CLI.RetTy,
- CLI.Symbol ? CLI.Symbol->getName().data()
- : nullptr);
+ CCInfo.AnalyzeCallResult(CLI.Ins, RetCC_Mips);
// Only handle a single return value.
if (RVLocs.size() != 1)
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 881ba8e..1491300 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -2325,10 +2325,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
SDValue TlsGetAddr = DAG.getExternalSymbol("__tls_get_addr", PtrVT);
ArgListTy Args;
- ArgListEntry Entry;
- Entry.Node = Argument;
- Entry.Ty = PtrTy;
- Args.push_back(Entry);
+ Args.emplace_back(Argument, PtrTy);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(DL)
@@ -3040,14 +3037,13 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State, ArrayRef<MCPhysReg> F64Regs) {
+ Type *OrigTy, CCState &State,
+ ArrayRef<MCPhysReg> F64Regs) {
const MipsSubtarget &Subtarget = static_cast<const MipsSubtarget &>(
State.getMachineFunction().getSubtarget());
static const MCPhysReg IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };
- const MipsCCState * MipsState = static_cast<MipsCCState *>(&State);
-
static const MCPhysReg F32Regs[] = { Mips::F12, Mips::F14 };
static const MCPhysReg FloatVectorIntRegs[] = { Mips::A0, Mips::A2 };
@@ -3089,7 +3085,7 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
State.getFirstUnallocated(F32Regs) != ValNo;
Align OrigAlign = ArgFlags.getNonZeroOrigAlign();
bool isI64 = (ValVT == MVT::i32 && OrigAlign == Align(8));
- bool isVectorFloat = MipsState->WasOriginalArgVectorFloat(ValNo);
+ bool isVectorFloat = OrigTy->isVectorTy() && OrigTy->isFPOrFPVectorTy();
// The MIPS vector ABI for floats passes them in a pair of registers
if (ValVT == MVT::i32 && isVectorFloat) {
@@ -3160,25 +3156,29 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
return false;
}
-static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
+static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State) {
static const MCPhysReg F64Regs[] = { Mips::D6, Mips::D7 };
- return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+ return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, OrigTy, State,
+ F64Regs);
}
-static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
+static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State) {
static const MCPhysReg F64Regs[] = { Mips::D12_64, Mips::D14_64 };
- return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+ return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, OrigTy, State,
+ F64Regs);
}
static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State) LLVM_ATTRIBUTE_UNUSED;
+ Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED;
#include "MipsGenCallingConv.inc"
@@ -3392,8 +3392,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MemcpyInByVal ? 0 : ABI.GetCalleeAllocdArgSizeInBytes(CallConv);
CCInfo.AllocateStack(ReservedArgArea, Align(1));
- CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(),
- ES ? ES->getSymbol() : nullptr);
+ CCInfo.AnalyzeCallOperands(Outs, CC_Mips);
// Get a count of how many bytes are to be pushed on the stack.
unsigned StackSize = CCInfo.getStackSize();
@@ -3688,10 +3687,7 @@ SDValue MipsTargetLowering::LowerCallResult(
MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- const ExternalSymbolSDNode *ES =
- dyn_cast_or_null<const ExternalSymbolSDNode>(CLI.Callee.getNode());
- CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI.RetTy,
- ES ? ES->getSymbol() : nullptr);
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Mips);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -3969,7 +3965,7 @@ MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
LLVMContext &Context, const Type *RetTy) const {
SmallVector<CCValAssign, 16> RVLocs;
MipsCCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
- return CCInfo.CheckCallReturn(Outs, RetCC_Mips, RetTy);
+ return CCInfo.CheckReturn(Outs, RetCC_Mips);
}
bool MipsTargetLowering::shouldSignExtendTypeInLibCall(Type *Ty,
@@ -4408,7 +4404,7 @@ void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'K': // unsigned 16 bit immediate
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
EVT Type = Op.getValueType();
- uint64_t Val = (uint64_t)C->getZExtValue();
+ uint64_t Val = C->getZExtValue();
if (isUInt<16>(Val)) {
Result = DAG.getTargetConstant(Val, DL, Type);
break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 38912a7..0c581dcc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1458,7 +1458,6 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
// Map the global virtual register number to a register class specific
// virtual register number starting from 1 with that class.
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- //unsigned numRegClasses = TRI->getNumRegClasses();
// Emit the Fake Stack Object
const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1479,13 +1478,12 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
// global virtual
// register number and the per class virtual register number.
// We use the per class virtual register number in the ptx output.
- unsigned int numVRs = MRI->getNumVirtRegs();
- for (unsigned i = 0; i < numVRs; i++) {
- Register vr = Register::index2VirtReg(i);
- const TargetRegisterClass *RC = MRI->getRegClass(vr);
- DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
- int n = regmap.size();
- regmap.insert(std::make_pair(vr, n + 1));
+ for (unsigned I : llvm::seq(MRI->getNumVirtRegs())) {
+ Register VR = Register::index2VirtReg(I);
+ if (MRI->use_empty(VR) && MRI->def_empty(VR))
+ continue;
+ auto &RCRegMap = VRegMapping[MRI->getRegClass(VR)];
+ RCRegMap[VR] = RCRegMap.size() + 1;
}
// Emit declaration of the virtual registers or 'physical' registers for
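The rewritten loop gives every live virtual register a 1-based index within its register class and now skips registers with no uses or defs, so dead vregs no longer consume PTX register names. A stand-alone sketch of that renumbering (the string class names and the helper are illustrative):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct VRegModel { unsigned Id; std::string RegClass; bool HasUseOrDef; };

std::map<std::string, std::map<unsigned, unsigned>>
renumberPerClass(const std::vector<VRegModel> &VRegs) {
  std::map<std::string, std::map<unsigned, unsigned>> Mapping;
  for (const VRegModel &V : VRegs) {
    if (!V.HasUseOrDef)
      continue;                  // skip dead virtual registers
    auto &ClassMap = Mapping[V.RegClass];
    unsigned Next = ClassMap.size() + 1;
    ClassMap[V.Id] = Next;       // 1-based index within the class
  }
  return Mapping;
}

int main() {
  auto M = renumberPerClass({{0, "B32", true},
                             {1, "B64", true},
                             {2, "B32", false},   // dead, gets no number
                             {3, "B32", true}});
  std::printf("%u %u %u\n", M["B32"][0], M["B64"][1], M["B32"][3]); // 1 1 2
}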
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 6068035..520ce4d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -70,7 +70,7 @@ NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
}
bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
- return Subtarget->getTargetLowering()->usePrecSqrtF32(*MF, N);
+ return Subtarget->getTargetLowering()->usePrecSqrtF32(N);
}
bool NVPTXDAGToDAGISel::useF32FTZ() const {
@@ -82,11 +82,6 @@ bool NVPTXDAGToDAGISel::allowFMA() const {
return TL->allowFMA(*MF, OptLevel);
}
-bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
- const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
- return TL->allowUnsafeFPMath(*MF);
-}
-
bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
/// Select - Select instructions not customized! Used for
@@ -1027,6 +1022,72 @@ pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
}
}
+static inline bool isAddLike(const SDValue V) {
+ return V.getOpcode() == ISD::ADD ||
+ (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
+}
+
+static SDValue stripAssertAlign(SDValue N) {
+ if (N.getOpcode() == ISD::AssertAlign)
+ N = N.getOperand(0);
+ return N;
+}
+
+// selectBaseADDR - Match a dag node which will serve as the base address for an
+// ADDR operand pair.
+static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
+ N = stripAssertAlign(N);
+ if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N))
+ return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N),
+ GA->getValueType(0), GA->getOffset(),
+ GA->getTargetFlags());
+ if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N))
+ return DAG->getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
+ ES->getTargetFlags());
+ if (const auto *FIN = dyn_cast<FrameIndexSDNode>(N))
+ return DAG->getTargetFrameIndex(FIN->getIndex(), FIN->getValueType(0));
+
+ return N;
+}
+
+static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
+ Addr = stripAssertAlign(Addr);
+ APInt AccumulatedOffset(64u, 0);
+ while (isAddLike(Addr)) {
+ const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+ if (!CN)
+ break;
+
+ const APInt CI = CN->getAPIntValue().sext(64);
+ if (!(CI + AccumulatedOffset).isSignedIntN(32))
+ break;
+
+ AccumulatedOffset += CI;
+ Addr = stripAssertAlign(Addr->getOperand(0));
+ }
+ return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL,
+ MVT::i32);
+}
+
+static std::pair<SDValue, SDValue> selectADDR(SDValue Addr, SelectionDAG *DAG) {
+ SDValue Offset = accumulateOffset(Addr, SDLoc(Addr), DAG);
+ SDValue Base = selectBaseADDR(Addr, DAG);
+ return {Base, Offset};
+}
+
+// Select a pair of operands which represent a valid PTX address, this could be
+// one of the following things:
+// - [var] - Offset is simply set to 0
+// - [reg] - Offset is simply set to 0
+// - [reg+immOff]
+// - [var+immOff]
+// Note that immOff must fit into a 32-bit signed integer.
+bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ std::tie(Base, Offset) = selectADDR(Addr, CurDAG);
+ return true;
+}
+
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
MemSDNode *LD = cast<MemSDNode>(N);
assert(LD->readMem() && "Expected load");
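selectADDR()/accumulateOffset() above peel constant additions off the address and fold them into a single signed 32-bit offset, stopping as soon as the running total would overflow; the real code also looks through AssertAlign and disjoint ORs, which this sketch omits. A simplified stand-alone model with a toy node type in place of SDValue:

#include <cstdint>
#include <cstdio>
#include <limits>

struct NodeModel {
  const NodeModel *Base = nullptr;  // non-null: this node is Base + Imm
  int64_t Imm = 0;
};

struct AddrModel { const NodeModel *Base; int32_t Offset; };

AddrModel selectAddrModel(const NodeModel *N) {
  int64_t Acc = 0;
  while (N->Base) {
    int64_t Next = Acc + N->Imm;
    if (Next > std::numeric_limits<int32_t>::max() ||
        Next < std::numeric_limits<int32_t>::min())
      break;                          // keep the offset within signed i32
    Acc = Next;
    N = N->Base;
  }
  return {N, static_cast<int32_t>(Acc)};
}

int main() {
  NodeModel Var;                      // plain base: [var]
  NodeModel A{&Var, 16};              // var + 16
  NodeModel B{&A, 32};                // (var + 16) + 32
  std::printf("offset=%d\n", selectAddrModel(&B).Offset); // offset=48
}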
@@ -1062,8 +1123,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
FromTypeWidth <= 128 && "Invalid width for load");
// Create the machine instruction DAG
- SDValue Offset, Base;
- SelectADDR(N->getOperand(1), Base, Offset);
+ const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
SDValue Ops[] = {getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
@@ -1144,8 +1204,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
- SDValue Offset, Base;
- SelectADDR(N->getOperand(1), Base, Offset);
+ const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
SDValue Ops[] = {getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
@@ -1213,8 +1272,7 @@ bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
- SDValue Base, Offset;
- SelectADDR(LD->getOperand(1), Base, Offset);
+ const auto [Base, Offset] = selectADDR(LD->getOperand(1), CurDAG);
SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base,
Offset, LD->getChain()};
@@ -1278,8 +1336,7 @@ bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
SDValue Addr =
LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
- SDValue Base, Offset;
- SelectADDR(Addr, Base, Offset);
+ const auto [Base, Offset] = selectADDR(Addr, CurDAG);
SDValue Ops[] = {getI32Imm(FromTypeWidth, DL), Base, Offset, LD->getChain()};
std::optional<unsigned> Opcode;
@@ -1339,9 +1396,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
"Invalid width for store");
- SDValue Offset, Base;
- SelectADDR(ST->getBasePtr(), Base, Offset);
-
+ const auto [Base, Offset] = selectADDR(ST->getBasePtr(), CurDAG);
SDValue Ops[] = {selectPossiblyImm(Value),
getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
@@ -1399,9 +1454,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
TotalWidth <= 256 && "Invalid width for store");
- SDValue Offset, Base;
- SelectADDR(Addr, Base, Offset);
-
+ const auto [Base, Offset] = selectADDR(Addr, CurDAG);
Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,
Offset, Chain});
@@ -1708,59 +1761,6 @@ bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
return true;
}
-static inline bool isAddLike(const SDValue V) {
- return V.getOpcode() == ISD::ADD ||
- (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
-}
-
-// selectBaseADDR - Match a dag node which will serve as the base address for an
-// ADDR operand pair.
-static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
- if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N))
- return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N),
- GA->getValueType(0), GA->getOffset(),
- GA->getTargetFlags());
- if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N))
- return DAG->getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
- ES->getTargetFlags());
- if (const auto *FIN = dyn_cast<FrameIndexSDNode>(N))
- return DAG->getTargetFrameIndex(FIN->getIndex(), FIN->getValueType(0));
-
- return N;
-}
-
-static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
- APInt AccumulatedOffset(64u, 0);
- while (isAddLike(Addr)) {
- const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
- if (!CN)
- break;
-
- const APInt CI = CN->getAPIntValue().sext(64);
- if (!(CI + AccumulatedOffset).isSignedIntN(32))
- break;
-
- AccumulatedOffset += CI;
- Addr = Addr->getOperand(0);
- }
- return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL,
- MVT::i32);
-}
-
-// Select a pair of operands which represent a valid PTX address, this could be
-// one of the following things:
-// - [var] - Offset is simply set to 0
-// - [reg] - Offset is simply set to 0
-// - [reg+immOff]
-// - [var+immOff]
-// Note that immOff must fit into a 32-bit signed integer.
-bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
- SDValue &Offset) {
- Offset = accumulateOffset(Addr, SDLoc(Addr), CurDAG);
- Base = selectBaseADDR(Addr, CurDAG);
- return true;
-}
-
SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
if (V.getOpcode() == ISD::BITCAST)
V = V.getOperand(0);
@@ -1774,37 +1774,20 @@ SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
return V;
}
-bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
- unsigned int spN) const {
- const Value *Src = nullptr;
- if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
- if (spN == 0 && mN->getMemOperand()->getPseudoValue())
- return true;
- Src = mN->getMemOperand()->getValue();
- }
- if (!Src)
- return false;
- if (auto *PT = dyn_cast<PointerType>(Src->getType()))
- return (PT->getAddressSpace() == spN);
- return false;
-}
-
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
std::vector<SDValue> &OutOps) {
- SDValue Op0, Op1;
switch (ConstraintID) {
default:
return true;
- case InlineAsm::ConstraintCode::m: // memory
- if (SelectADDR(Op, Op0, Op1)) {
- OutOps.push_back(Op0);
- OutOps.push_back(Op1);
- return false;
- }
- break;
+ case InlineAsm::ConstraintCode::m: { // memory
+ const auto [Base, Offset] = selectADDR(Op, CurDAG);
+ OutOps.push_back(Base);
+ OutOps.push_back(Offset);
+ return false;
+ }
}
return true;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 9e0f88e5..6573172 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -44,7 +44,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool usePrecSqrtF32(const SDNode *N) const;
bool useF32FTZ() const;
bool allowFMA() const;
- bool allowUnsafeFPMath() const;
bool doRsqrtOpt() const;
NVPTXScopes Scopes{};
@@ -102,8 +101,6 @@ private:
SDValue getPTXCmpMode(const CondCodeSDNode &CondCode);
SDValue selectPossiblyImm(SDValue V);
- bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
-
// Returns the Memory Order and Scope that the PTX memory instruction should
// use, and inserts appropriate fence instruction before the memory
// instruction, if needed to implement the instructions memory order. Required
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d4f0cc9..74e6c13 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -28,7 +28,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
@@ -125,10 +124,6 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
if (UsePrecDivF32.getNumOccurrences() > 0)
return UsePrecDivF32;
- // Otherwise, use div.approx if fast math is enabled
- if (allowUnsafeFPMath(MF))
- return NVPTX::DivPrecisionLevel::Approx;
-
const SDNodeFlags Flags = N.getFlags();
if (Flags.hasApproximateFuncs())
return NVPTX::DivPrecisionLevel::Approx;
@@ -136,16 +131,11 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
return NVPTX::DivPrecisionLevel::IEEE754;
}
-bool NVPTXTargetLowering::usePrecSqrtF32(const MachineFunction &MF,
- const SDNode *N) const {
+bool NVPTXTargetLowering::usePrecSqrtF32(const SDNode *N) const {
// If nvptx-prec-sqrtf32 is used on the command-line, always honor it
if (UsePrecSqrtF32.getNumOccurrences() > 0)
return UsePrecSqrtF32;
- // Otherwise, use sqrt.approx if fast math is enabled
- if (allowUnsafeFPMath(MF))
- return false;
-
if (N) {
const SDNodeFlags Flags = N->getFlags();
if (Flags.hasApproximateFuncs())
@@ -680,6 +670,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// No support for these operations with v2f32.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand);
+ // Need custom lowering in case the index is dynamic.
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
// Custom conversions to/from v2i8.
setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
@@ -1191,8 +1183,7 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
bool &UseOneConst,
bool Reciprocal) const {
if (!(Enabled == ReciprocalEstimate::Enabled ||
- (Enabled == ReciprocalEstimate::Unspecified &&
- !usePrecSqrtF32(DAG.getMachineFunction()))))
+ (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
return SDValue();
if (ExtraSteps == ReciprocalEstimate::Unspecified)
@@ -2849,8 +2840,7 @@ static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) {
SDLoc(Op), Opcode, DAG);
}
-static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG,
- bool AllowUnsafeFPMath) {
+static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG) {
// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
// the semantics of LLVM's frem.
@@ -2867,7 +2857,7 @@ static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG,
SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
Flags | SDNodeFlags::AllowContract);
- if (AllowUnsafeFPMath || Flags.hasNoInfs())
+ if (Flags.hasNoInfs())
return Sub;
// If Y is infinite, return X
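A scalar model of the FREM lowering: frem(x, y) becomes x - trunc(x / y) * y, and unless the node carries a no-infs flag the lowering additionally selects x whenever y is infinite, matching frem/fmod semantics for an infinite divisor (plain C++, not the DAG code):

#include <cmath>
#include <cstdio>

double loweredFrem(double X, double Y, bool HasNoInfsFlag) {
  double Sub = X - std::trunc(X / Y) * Y;
  if (HasNoInfsFlag)
    return Sub;                       // fast path: no infinity check
  return std::isinf(Y) ? X : Sub;     // frem(x, inf) == x for finite x
}

int main() {
  std::printf("%g\n", loweredFrem(5.5, 2.0, false));      // 1.5
  std::printf("%g\n", loweredFrem(5.5, INFINITY, false)); // 5.5
}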
@@ -3012,7 +3002,7 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTLZ:
return lowerCTLZCTPOP(Op, DAG);
case ISD::FREM:
- return lowerFREM(Op, DAG, allowUnsafeFPMath(DAG.getMachineFunction()));
+ return lowerFREM(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
@@ -4866,17 +4856,7 @@ bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
return true;
- return allowUnsafeFPMath(MF);
-}
-
-bool NVPTXTargetLowering::allowUnsafeFPMath(const MachineFunction &MF) const {
- // Honor TargetOptions flags that explicitly say unsafe math is okay.
- if (MF.getTarget().Options.UnsafeFPMath)
- return true;
-
- // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
- const Function &F = MF.getFunction();
- return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
+ return false;
}
static bool isConstZero(const SDValue &Operand) {
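Several hunks in this file remove the allowUnsafeFPMath() fallback, so approximate sqrt/div and FMA fusion are now driven by the explicit options and per-node fast-math flags rather than a global unsafe-FP switch. A condensed model of the usePrecSqrtF32() decision after the change; the flag name is a stand-in for the SDNodeFlags query, and the final default mirrors my reading of the code outside the hunk.

#include <cstdio>

struct NodeFlagsModel { bool ApproximateFuncs; };

// Precision decision: an explicit command-line setting always wins; otherwise
// the node's afn (approximate-functions) flag selects the approximate sqrt.
bool usePrecSqrt(bool OptionSet, bool OptionValue, const NodeFlagsModel *N) {
  if (OptionSet)
    return OptionValue;
  if (N && N->ApproximateFuncs)
    return false;                 // approximate sqrt is acceptable
  return true;                    // default to the precise sequence
}

int main() {
  NodeFlagsModel Afn{true};
  std::printf("%d %d\n", usePrecSqrt(false, false, &Afn),
              usePrecSqrt(false, false, nullptr)); // 0 1
}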
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 43e721a..27f099e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -206,8 +206,7 @@ public:
// Get whether we should use a precise or approximate 32-bit floating point
// sqrt instruction.
- bool usePrecSqrtF32(const MachineFunction &MF,
- const SDNode *N = nullptr) const;
+ bool usePrecSqrtF32(const SDNode *N = nullptr) const;
// Get whether we should use instructions that flush floating-point denormals
// to sign-preserving zero.
@@ -220,7 +219,6 @@ public:
unsigned combineRepeatedFPDivisors() const override { return 2; }
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const;
- bool allowUnsafeFPMath(const MachineFunction &MF) const;
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT) const override {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 1ab41bf..7b13509 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -148,13 +148,16 @@ class OneUse2<SDPatternOperator operator>
: PatFrag<(ops node:$A, node:$B), (operator node:$A, node:$B), [{ return N->hasOneUse(); }]>;
-class fpimm_pos_inf<ValueType vt>
- : FPImmLeaf<vt, [{ return Imm.isPosInfinity(); }]>;
-
class zeroinitializer<ValueType vt> :
PatLeaf<(vt (bitconvert (!cast<ValueType>("i" # vt.Size) 0)))>;
+def fpimm_pos_inf : FPImmLeaf<fAny, [{ return Imm.isPosInfinity(); }]>;
+def fpimm_0 : FPImmLeaf<fAny, [{ return Imm.isZero(); }]>;
+def fpimm_1 : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(1.0); }]>;
+def fpimm_neg_1 : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(-1.0); }]>;
+
+
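For reference, the new fAny leaves match on the APFloat value regardless of its float semantics; a minimal C++ sketch of the same checks (illustration only, outside the FPImmLeaf machinery):

    #include "llvm/ADT/APFloat.h"
    // What fpimm_1 / fpimm_0 / fpimm_pos_inf test on the immediate.
    static bool isOneImm(const llvm::APFloat &Imm)    { return Imm.isExactlyValue(1.0); }
    static bool isZeroImm(const llvm::APFloat &Imm)   { return Imm.isZero(); }
    static bool isPosInfImm(const llvm::APFloat &Imm) { return Imm.isPosInfinity(); }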
// Operands which can hold a Register or an Immediate.
//
// Unfortunately, since most register classes can hold multiple types, we must
@@ -268,7 +271,7 @@ multiclass I3Inst<string op_str, SDPatternOperator op_node, RegTyInfo t,
// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
multiclass I3<string op_str, SDPatternOperator op_node, bit commutative> {
foreach t = [I16RT, I32RT, I64RT] in
- defm t.Ty# : I3Inst<op_str # t.Size, op_node, t, commutative>;
+ defm t.Size# : I3Inst<op_str # t.Size, op_node, t, commutative>;
}
class I16x2<string OpcStr, SDNode OpNode> :
@@ -761,10 +764,10 @@ def fabs_oneuse : OneUse1<fabs>;
def TESTINF_f32r : BasicNVPTXInst<(outs B1:$p), (ins B32:$a),
"testp.infinite.f32",
- [(set i1:$p, (seteq (fabs_oneuse f32:$a), fpimm_pos_inf<f32>))]>;
+ [(set i1:$p, (seteq (fabs_oneuse f32:$a), fpimm_pos_inf))]>;
def TESTINF_f64r : BasicNVPTXInst<(outs B1:$p), (ins B64:$a),
"testp.infinite.f64",
- [(set i1:$p, (seteq (fabs_oneuse f64:$a), fpimm_pos_inf<f64>))]>;
+ [(set i1:$p, (seteq (fabs_oneuse f64:$a), fpimm_pos_inf))]>;
//-----------------------------------
// Integer Arithmetic
@@ -787,8 +790,8 @@ defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube, commutative = false>;
defm MULT : I3<"mul.lo.s", mul, commutative = true>;
-defm MULTHS : I3<"mul.hi.s", mulhs, commutative = true>;
-defm MULTHU : I3<"mul.hi.u", mulhu, commutative = true>;
+defm MUL_HI_S : I3<"mul.hi.s", mulhs, commutative = true>;
+defm MUL_HI_U : I3<"mul.hi.u", mulhu, commutative = true>;
defm SDIV : I3<"div.s", sdiv, commutative = false>;
defm UDIV : I3<"div.u", udiv, commutative = false>;
@@ -905,22 +908,6 @@ let Predicates = [hasOptEnabled] in {
// Floating Point Arithmetic
//-----------------------------------
-// Constant 1.0f
-def f32imm_1 : FPImmLeaf<f32, [{
- return &Imm.getSemantics() == &llvm::APFloat::IEEEsingle() &&
- Imm.convertToFloat() == 1.0f;
-}]>;
-// Constant 1.0 (double)
-def f64imm_1 : FPImmLeaf<f64, [{
- return &Imm.getSemantics() == &llvm::APFloat::IEEEdouble() &&
- Imm.convertToDouble() == 1.0;
-}]>;
-// Constant -1.0 (double)
-def f64imm_neg1 : FPImmLeaf<f64, [{
- return &Imm.getSemantics() == &llvm::APFloat::IEEEdouble() &&
- Imm.convertToDouble() == -1.0;
-}]>;
-
defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;
@@ -994,7 +981,7 @@ def FRCP64r :
BasicNVPTXInst<(outs B64:$dst),
(ins B64:$b),
"rcp.rn.f64",
- [(set f64:$dst, (fdiv f64imm_1, f64:$b))]>;
+ [(set f64:$dst, (fdiv fpimm_1, f64:$b))]>;
def FDIV64rr :
BasicNVPTXInst<(outs B64:$dst),
(ins B64:$a, B64:$b),
@@ -1008,7 +995,7 @@ def FDIV64ri :
// fdiv will be converted to rcp
// fneg (fdiv 1.0, X) => fneg (rcp.rn X)
-def : Pat<(fdiv f64imm_neg1, f64:$b),
+def : Pat<(fdiv fpimm_neg_1, f64:$b),
(FNEGf64 (FRCP64r $b))>;
//
@@ -1021,21 +1008,21 @@ def fdiv_approx : PatFrag<(ops node:$a, node:$b),
}]>;
-def FRCP32_approx_r :
+def RCP_APPROX_F32_r :
BasicFlagsNVPTXInst<(outs B32:$dst),
(ins B32:$b), (ins FTZFlag:$ftz),
"rcp.approx$ftz.f32",
- [(set f32:$dst, (fdiv_approx f32imm_1, f32:$b))]>;
+ [(set f32:$dst, (fdiv_approx fpimm_1, f32:$b))]>;
//
// F32 Approximate division
//
-def FDIV32_approx_rr :
+def DIV_APPROX_F32_rr :
BasicFlagsNVPTXInst<(outs B32:$dst),
(ins B32:$a, B32:$b), (ins FTZFlag:$ftz),
"div.approx$ftz.f32",
[(set f32:$dst, (fdiv_approx f32:$a, f32:$b))]>;
-def FDIV32_approx_ri :
+def DIV_APPROX_F32_ri :
BasicFlagsNVPTXInst<(outs B32:$dst),
(ins B32:$a, f32imm:$b), (ins FTZFlag:$ftz),
"div.approx$ftz.f32",
@@ -1052,8 +1039,8 @@ def fdiv_full : PatFrag<(ops node:$a, node:$b),
}]>;
-def : Pat<(fdiv_full f32imm_1, f32:$b),
- (FRCP32_approx_r $b)>;
+def : Pat<(fdiv_full fpimm_1, f32:$b),
+ (RCP_APPROX_F32_r $b)>;
//
// F32 Semi-accurate division
@@ -1081,7 +1068,7 @@ def FRCP32r_prec :
BasicFlagsNVPTXInst<(outs B32:$dst),
(ins B32:$b), (ins FTZFlag:$ftz),
"rcp.rn$ftz.f32",
- [(set f32:$dst, (fdiv_ftz f32imm_1, f32:$b))]>;
+ [(set f32:$dst, (fdiv_ftz fpimm_1, f32:$b))]>;
//
// F32 Accurate division
//
@@ -1096,7 +1083,7 @@ def FDIV32ri_prec :
"div.rn$ftz.f32",
[(set f32:$dst, (fdiv_ftz f32:$a, fpimm:$b))]>;
-def : Pat<(fdiv f32imm_1, f32:$b), (FRCP32r_prec $b, NoFTZ)>;
+def : Pat<(fdiv fpimm_1, f32:$b), (FRCP32r_prec $b, NoFTZ)>;
def : Pat<(fdiv f32:$a, f32:$b), (FDIV32rr_prec $a, $b, NoFTZ)>;
def : Pat<(fdiv f32:$a, fpimm:$b), (FDIV32ri_prec $a, fpimm:$b, NoFTZ)>;
@@ -1146,9 +1133,8 @@ defm FMA_F64 : FMA<F64RT, allow_ftz = false>;
// sin/cos/tanh
class UnaryOpAllowsApproxFn<SDPatternOperator operator>
- : PatFrag<(ops node:$A),
- (operator node:$A), [{
- return allowUnsafeFPMath() || N->getFlags().hasApproximateFuncs();
+ : PatFrag<(ops node:$A), (operator node:$A), [{
+ return N->getFlags().hasApproximateFuncs();
}]>;
def SIN_APPROX_f32 :
@@ -1519,23 +1505,28 @@ def MmaCode : Operand<i32> {
// Get pointer to local stack.
let hasSideEffects = false in {
def MOV_DEPOT_ADDR : NVPTXInst<(outs B32:$d), (ins i32imm:$num),
- "mov.b32 \t$d, __local_depot$num;", []>;
+ "mov.b32 \t$d, __local_depot$num;">;
def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs B64:$d), (ins i32imm:$num),
- "mov.b64 \t$d, __local_depot$num;", []>;
+ "mov.b64 \t$d, __local_depot$num;">;
}
-
-// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
-let hasSideEffects = false, isAsCheapAsAMove = true in {
- let isMoveReg = true in
+let hasSideEffects = false in {
+ let isMoveReg = true, isAsCheapAsAMove = true in
class MOVr<RegisterClass RC, string OpStr> :
BasicNVPTXInst<(outs RC:$dst), (ins RC:$src), "mov." # OpStr>;
- let isMoveImm = true in
+ let isMoveImm = true, isAsCheapAsAMove = true in
class MOVi<RegTyInfo t, string suffix> :
BasicNVPTXInst<(outs t.RC:$dst), (ins t.Imm:$src),
"mov." # suffix,
[(set t.Ty:$dst, t.ImmNode:$src)]>;
+
+ // We don't want to set isAsCheapAsAMove to true for these instructions as
+  // this would prevent CSE and result in regressions (see the discussion after
+ // PR-145581 in llvm-project).
+ class MovSymInst<RegTyInfo t> :
+ BasicNVPTXInst<(outs t.RC:$dst), (ins Operand<t.Ty>:$src),
+ "mov.b" # t.Size>;
}
def MOV_B1_r : MOVr<B1, "pred">;
@@ -1553,6 +1544,9 @@ def MOV_BF16_i : MOVi<BF16RT, "b16">;
def MOV_F32_i : MOVi<F32RT, "b32">;
def MOV_F64_i : MOVi<F64RT, "b64">;
+def MOV_B32_sym : MovSymInst<I32RT>;
+def MOV_B64_sym : MovSymInst<I64RT>;
+
def to_tglobaladdr : SDNodeXForm<globaladdr, [{
return CurDAG->getTargetGlobalAddress(N->getGlobal(), SDLoc(N),
@@ -1569,17 +1563,17 @@ def to_tframeindex : SDNodeXForm<frameindex, [{
return CurDAG->getTargetFrameIndex(N->getIndex(), N->getValueType(0));
}]>;
-def : Pat<(i32 globaladdr:$dst), (MOV_B32_i (to_tglobaladdr $dst))>;
-def : Pat<(i64 globaladdr:$dst), (MOV_B64_i (to_tglobaladdr $dst))>;
+def : Pat<(i32 globaladdr:$dst), (MOV_B32_sym (to_tglobaladdr $dst))>;
+def : Pat<(i64 globaladdr:$dst), (MOV_B64_sym (to_tglobaladdr $dst))>;
-def : Pat<(i32 externalsym:$dst), (MOV_B32_i (to_texternsym $dst))>;
-def : Pat<(i64 externalsym:$dst), (MOV_B64_i (to_texternsym $dst))>;
+def : Pat<(i32 externalsym:$dst), (MOV_B32_sym (to_texternsym $dst))>;
+def : Pat<(i64 externalsym:$dst), (MOV_B64_sym (to_texternsym $dst))>;
//---- Copy Frame Index ----
def LEA_ADDRi : NVPTXInst<(outs B32:$dst), (ins ADDR:$addr),
- "add.u32 \t$dst, ${addr:add};", []>;
+ "add.u32 \t$dst, ${addr:add};">;
def LEA_ADDRi64 : NVPTXInst<(outs B64:$dst), (ins ADDR:$addr),
- "add.u64 \t$dst, ${addr:add};", []>;
+ "add.u64 \t$dst, ${addr:add};">;
def : Pat<(i32 frameindex:$fi), (LEA_ADDRi (to_tframeindex $fi), 0)>;
def : Pat<(i64 frameindex:$fi), (LEA_ADDRi64 (to_tframeindex $fi), 0)>;
@@ -1644,12 +1638,12 @@ foreach is_convergent = [0, 1] in {
NVPTXInst<(outs),
(ins ADDR_base:$addr, CallOperand:$rets, CallOperand:$params,
i32imm:$proto),
- "call${rets:RetList} $addr, (${params:ParamList}), prototype_$proto;", []>;
+ "call${rets:RetList} $addr, (${params:ParamList}), prototype_$proto;">;
def CALL_UNI # convergent_suffix :
NVPTXInst<(outs),
(ins ADDR_base:$addr, CallOperand:$rets, CallOperand:$params),
- "call.uni${rets:RetList} $addr, (${params:ParamList});", []>;
+ "call.uni${rets:RetList} $addr, (${params:ParamList});">;
}
defvar call_inst = !cast<NVPTXInst>("CALL" # convergent_suffix);
@@ -1665,10 +1659,10 @@ foreach is_convergent = [0, 1] in {
def DECLARE_PARAM_array :
NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size),
- ".param .align $align .b8 \t$a[$size];", []>;
+ ".param .align $align .b8 \t$a[$size];">;
def DECLARE_PARAM_scalar :
NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
- ".param .b$size \t$a;", []>;
+ ".param .b$size \t$a;">;
def : Pat<(declare_array_param externalsym:$a, imm:$align, imm:$size),
(DECLARE_PARAM_array (to_texternsym $a), imm:$align, imm:$size)>;
@@ -1741,7 +1735,7 @@ class LD<NVPTXRegClass regclass>
(ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign,
i32imm:$fromWidth, ADDR:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr];", []>;
+ "\t$dst, [$addr];">;
let mayLoad=1, hasSideEffects=0 in {
def LD_i16 : LD<B16>;
@@ -1756,7 +1750,7 @@ class ST<DAGOperand O>
AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$toWidth,
ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}.b$toWidth"
- " \t[$addr], $src;", []>;
+ " \t[$addr], $src;">;
let mayStore=1, hasSideEffects=0 in {
def ST_i16 : ST<RI16>;
@@ -1773,13 +1767,13 @@ multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
(ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr];", []>;
+ "\t{{$dst1, $dst2}}, [$addr];">;
def _v4 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];">;
if support_v8 then
def _v8 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
@@ -1788,7 +1782,7 @@ multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
i32imm:$fromWidth, ADDR:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, "
- "[$addr];", []>;
+ "[$addr];">;
}
let mayLoad=1, hasSideEffects=0 in {
defm LDV_i16 : LD_VEC<B16>;
@@ -1803,14 +1797,14 @@ multiclass ST_VEC<DAGOperand O, bit support_v8 = false> {
AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth,
ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}.v2.b$fromWidth "
- "\t[$addr], {{$src1, $src2}};", []>;
+ "\t[$addr], {{$src1, $src2}};">;
def _v4 : NVPTXInst<
(outs),
(ins O:$src1, O:$src2, O:$src3, O:$src4,
AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth,
ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}.v4.b$fromWidth "
- "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};">;
if support_v8 then
def _v8 : NVPTXInst<
(outs),
@@ -1820,7 +1814,7 @@ multiclass ST_VEC<DAGOperand O, bit support_v8 = false> {
ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}.v8.b$fromWidth "
"\t[$addr], "
- "{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, $src8}};", []>;
+ "{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, $src8}};">;
}
let mayStore=1, hasSideEffects=0 in {
@@ -2015,60 +2009,52 @@ let hasSideEffects = false in {
def V4I16toI64 : NVPTXInst<(outs B64:$d),
(ins B16:$s1, B16:$s2,
B16:$s3, B16:$s4),
- "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
+ "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};">;
def V2I16toI32 : NVPTXInst<(outs B32:$d),
(ins B16:$s1, B16:$s2),
- "mov.b32 \t$d, {{$s1, $s2}};", []>;
+ "mov.b32 \t$d, {{$s1, $s2}};">;
def V2I32toI64 : NVPTXInst<(outs B64:$d),
(ins B32:$s1, B32:$s2),
- "mov.b64 \t$d, {{$s1, $s2}};", []>;
+ "mov.b64 \t$d, {{$s1, $s2}};">;
def V2I64toI128 : NVPTXInst<(outs B128:$d),
(ins B64:$s1, B64:$s2),
- "mov.b128 \t$d, {{$s1, $s2}};", []>;
+ "mov.b128 \t$d, {{$s1, $s2}};">;
// unpack a larger int register to a set of smaller int registers
def I64toV4I16 : NVPTXInst<(outs B16:$d1, B16:$d2,
B16:$d3, B16:$d4),
(ins B64:$s),
- "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
+ "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;">;
def I32toV2I16 : NVPTXInst<(outs B16:$d1, B16:$d2),
(ins B32:$s),
- "mov.b32 \t{{$d1, $d2}}, $s;", []>;
+ "mov.b32 \t{{$d1, $d2}}, $s;">;
def I64toV2I32 : NVPTXInst<(outs B32:$d1, B32:$d2),
(ins B64:$s),
- "mov.b64 \t{{$d1, $d2}}, $s;", []>;
+ "mov.b64 \t{{$d1, $d2}}, $s;">;
def I128toV2I64: NVPTXInst<(outs B64:$d1, B64:$d2),
(ins B128:$s),
- "mov.b128 \t{{$d1, $d2}}, $s;", []>;
+ "mov.b128 \t{{$d1, $d2}}, $s;">;
- def I32toI16H : NVPTXInst<(outs B16:$high),
- (ins B32:$s),
- "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}",
- []>;
- def I32toI16L : NVPTXInst<(outs B16:$low),
- (ins B32:$s),
- "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}",
- []>;
- def I64toI32H : NVPTXInst<(outs B32:$high),
- (ins B64:$s),
- "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
- []>;
- def I64toI32L : NVPTXInst<(outs B32:$low),
- (ins B64:$s),
- "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
- []>;
+ def I32toI16H : NVPTXInst<(outs B16:$high), (ins B32:$s),
+ "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}">;
+ def I32toI16L : NVPTXInst<(outs B16:$low), (ins B32:$s),
+ "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}">;
+ def I64toI32H : NVPTXInst<(outs B32:$high), (ins B64:$s),
+ "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}">;
+ def I64toI32L : NVPTXInst<(outs B32:$low), (ins B64:$s),
+ "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}">;
// PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the
// unused high/low part.
let Predicates = [hasPTX<71>] in {
def I32toI16H_Sink : NVPTXInst<(outs B16:$high), (ins B32:$s),
- "mov.b32 \t{{_, $high}}, $s;", []>;
+ "mov.b32 \t{{_, $high}}, $s;">;
def I32toI16L_Sink : NVPTXInst<(outs B16:$low), (ins B32:$s),
- "mov.b32 \t{{$low, _}}, $s;", []>;
+ "mov.b32 \t{{$low, _}}, $s;">;
def I64toI32H_Sink : NVPTXInst<(outs B32:$high), (ins B64:$s),
- "mov.b64 \t{{_, $high}}, $s;", []>;
+ "mov.b64 \t{{_, $high}}, $s;">;
def I64toI32L_Sink : NVPTXInst<(outs B32:$low), (ins B64:$s),
- "mov.b64 \t{{$low, _}}, $s;", []>;
+ "mov.b64 \t{{$low, _}}, $s;">;
}
}
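In scalar terms, the high/low extraction pseudo-instructions above simply split a wider register (a sketch, illustration only):

    #include <cstdint>
    // mov.b32 {lo, hi}, s: lo gets bits 0-15, hi gets bits 16-31.
    static uint16_t high16(uint32_t S) { return static_cast<uint16_t>(S >> 16); }
    static uint16_t low16(uint32_t S)  { return static_cast<uint16_t>(S); }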
@@ -2426,10 +2412,6 @@ foreach scope = ["sys", "gpu", "cluster", "cta"] in {
def atomic_thread_fence_release_#scope: NVPTXFenceInst<scope, "release", hasPTX<87>>;
}
-def fpimm_any_zero : FPImmLeaf<fAny, [{
- return Imm.isZero();
-}]>;
-
// Perform substitution if fma only has one use, and also if instruction has
// nnan instruction flag or if the TM has NoNaNsFPMath
def NVPTX_fma_oneuse_and_nnan : PatFrag<(ops node:$a, node:$b, node:$c),
@@ -2451,11 +2433,11 @@ class FMARELUInst<RegTyInfo t, bit allow_ftz, PatFrag zero_pat>
[(set t.Ty:$dst, (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan t.Ty:$a, t.Ty:$b, t.Ty:$c), zero_pat))]>;
let Predicates = [useFP16Math, hasPTX<70>, hasSM<80>] in {
- def FMARELU_F16 : FMARELUInst<F16RT, true, fpimm_any_zero>;
+ def FMARELU_F16 : FMARELUInst<F16RT, true, fpimm_0>;
def FMARELU_F16X2 : FMARELUInst<F16X2RT, true, zeroinitializer<v2f16>>;
}
let Predicates = [hasBF16Math, hasPTX<70>, hasSM<80>] in {
- def FMARELU_BF16 : FMARELUInst<BF16RT, false, fpimm_any_zero>;
+ def FMARELU_BF16 : FMARELUInst<BF16RT, false, fpimm_0>;
def FMARELU_BF16X2 : FMARELUInst<BF16X2RT, false, zeroinitializer<v2bf16>>;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index d4a0ca7..721afae 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -6,44 +6,24 @@
//
//===----------------------------------------------------------------------===//
-def immFloat0 : PatLeaf<(fpimm), [{
- float f = (float)N->getValueAPF().convertToFloat();
- return (f==0.0f);
-}]>;
-
-def immFloat1 : PatLeaf<(fpimm), [{
- float f = (float)N->getValueAPF().convertToFloat();
- return (f==1.0f);
-}]>;
-
-def immDouble0 : PatLeaf<(fpimm), [{
- double d = (double)N->getValueAPF().convertToDouble();
- return (d==0.0);
-}]>;
-
-def immDouble1 : PatLeaf<(fpimm), [{
- double d = (double)N->getValueAPF().convertToDouble();
- return (d==1.0);
-}]>;
-
def AS_match {
code generic = [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
+ return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC;
}];
code shared = [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
+ return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_SHARED;
}];
code shared_cluster = [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED_CLUSTER);
+ return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_SHARED_CLUSTER;
}];
code global = [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
+ return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL;
}];
code const = [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST);
+ return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_CONST;
}];
code param = [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM);
+ return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_PARAM;
}];
}
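The predicates above now inline what the ChkMemSDNodeAddressSpace helper previously called here did; roughly, in C++ (a sketch, assuming N is already known to be a MemSDNode):

    #include "llvm/CodeGen/SelectionDAGNodes.h"
    static bool isInAddressSpace(const llvm::SDNode *N, unsigned AS) {
      return llvm::cast<llvm::MemSDNode>(N)->getAddressSpace() == AS;
    }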
@@ -659,22 +639,22 @@ multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode>
def "" : NVPTXInst<(outs),
!con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)),
- !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";"), []>,
+ !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";")>,
Requires<[hasPTX<80>, hasSM<90>]>;
def _MC : NVPTXInst<(outs),
!con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
(ins B16:$mc, CTAGroupFlags:$cg)),
- !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;"), []>,
+ !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;")>,
Requires<[hasPTX<80>, hasSM<90>]>;
def _CH : NVPTXInst<(outs),
!con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
(ins B64:$ch, CTAGroupFlags:$cg)),
- !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;"), []>,
+ !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;")>,
Requires<[hasPTX<80>, hasSM<90>]>;
def _MC_CH : NVPTXInst<(outs),
!con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
(ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)),
- !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;"), []>,
+ !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;")>,
Requires<[hasPTX<80>, hasSM<90>]>;
}
@@ -876,11 +856,11 @@ multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, bit shared32, string mode>
def "" : NVPTXInst<(outs),
!con((ins rc:$src, B64:$tmap), dims_dag, (ins TMAReductionFlags:$red_op)),
- !strconcat(prefix, "${red_op}", suffix, asm_str, ";"), []>,
+ !strconcat(prefix, "${red_op}", suffix, asm_str, ";")>,
Requires<[hasPTX<80>, hasSM<90>]>;
def _CH : NVPTXInst<(outs),
!con((ins rc:$src, B64:$tmap), dims_dag, (ins B64:$ch, TMAReductionFlags:$red_op)),
- !strconcat(prefix, "${red_op}", suffix, ".L2::cache_hint", asm_str, ", $ch;"), []>,
+ !strconcat(prefix, "${red_op}", suffix, ".L2::cache_hint", asm_str, ", $ch;")>,
Requires<[hasPTX<80>, hasSM<90>]>;
}
@@ -1112,30 +1092,30 @@ let Predicates = [hasPTX<70>, hasSM<80>] in {
// max(0.0, min(x, 1.0)) is 1.0 while sat(x) is 0.
// Same story for fmax, fmin.
-def : Pat<(int_nvvm_fmin_f immFloat1,
- (int_nvvm_fmax_f immFloat0, f32:$a)),
+def : Pat<(int_nvvm_fmin_f fpimm_1,
+ (int_nvvm_fmax_f fpimm_0, f32:$a)),
(CVT_f32_f32 $a, CvtSAT)>;
-def : Pat<(int_nvvm_fmin_f immFloat1,
- (int_nvvm_fmax_f f32:$a, immFloat0)),
+def : Pat<(int_nvvm_fmin_f fpimm_1,
+ (int_nvvm_fmax_f f32:$a, fpimm_0)),
(CVT_f32_f32 $a, CvtSAT)>;
def : Pat<(int_nvvm_fmin_f
- (int_nvvm_fmax_f immFloat0, f32:$a), immFloat1),
+ (int_nvvm_fmax_f fpimm_0, f32:$a), fpimm_1),
(CVT_f32_f32 $a, CvtSAT)>;
def : Pat<(int_nvvm_fmin_f
- (int_nvvm_fmax_f f32:$a, immFloat0), immFloat1),
+ (int_nvvm_fmax_f f32:$a, fpimm_0), fpimm_1),
(CVT_f32_f32 $a, CvtSAT)>;
-def : Pat<(int_nvvm_fmin_d immDouble1,
- (int_nvvm_fmax_d immDouble0, f64:$a)),
+def : Pat<(int_nvvm_fmin_d fpimm_1,
+ (int_nvvm_fmax_d fpimm_0, f64:$a)),
(CVT_f64_f64 $a, CvtSAT)>;
-def : Pat<(int_nvvm_fmin_d immDouble1,
- (int_nvvm_fmax_d f64:$a, immDouble0)),
+def : Pat<(int_nvvm_fmin_d fpimm_1,
+ (int_nvvm_fmax_d f64:$a, fpimm_0)),
(CVT_f64_f64 $a, CvtSAT)>;
def : Pat<(int_nvvm_fmin_d
- (int_nvvm_fmax_d immDouble0, f64:$a), immDouble1),
+ (int_nvvm_fmax_d fpimm_0, f64:$a), fpimm_1),
(CVT_f64_f64 $a, CvtSAT)>;
def : Pat<(int_nvvm_fmin_d
- (int_nvvm_fmax_d f64:$a, immDouble0), immDouble1),
+ (int_nvvm_fmax_d f64:$a, fpimm_0), fpimm_1),
(CVT_f64_f64 $a, CvtSAT)>;
@@ -1329,12 +1309,12 @@ defm INT_NVVM_FMAN : MIN_MAX<"max">;
// Multiplication
//
-def INT_NVVM_MULHI_S : F_MATH_2<"mul.hi.s16", B16, B16, B16, int_nvvm_mulhi_s>;
-def INT_NVVM_MULHI_US : F_MATH_2<"mul.hi.u16", B16, B16, B16, int_nvvm_mulhi_us>;
-def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32", B32, B32, B32, int_nvvm_mulhi_i>;
-def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32", B32, B32, B32, int_nvvm_mulhi_ui>;
-def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64", B64, B64, B64, int_nvvm_mulhi_ll>;
-def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64", B64, B64, B64, int_nvvm_mulhi_ull>;
+def : Pat<(int_nvvm_mulhi_s i16:$a, i16:$b), (MUL_HI_S16rr $a, $b)>;
+def : Pat<(int_nvvm_mulhi_us i16:$a, i16:$b), (MUL_HI_U16rr $a, $b)>;
+def : Pat<(int_nvvm_mulhi_i i32:$a, i32:$b), (MUL_HI_S32rr $a, $b)>;
+def : Pat<(int_nvvm_mulhi_ui i32:$a, i32:$b), (MUL_HI_U32rr $a, $b)>;
+def : Pat<(int_nvvm_mulhi_ll i64:$a, i64:$b), (MUL_HI_S64rr $a, $b)>;
+def : Pat<(int_nvvm_mulhi_ull i64:$a, i64:$b), (MUL_HI_U64rr $a, $b)>;
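For reference, mul.hi returns the upper half of the double-width product; a minimal C++ sketch of what the matched intrinsics compute (illustration only, not part of the patch):

    #include <cstdint>
    // mul.hi.u32: high 32 bits of the full 64-bit product.
    static uint32_t mulhi_u32(uint32_t A, uint32_t B) {
      return static_cast<uint32_t>((static_cast<uint64_t>(A) * B) >> 32);
    }
    // mul.hi.s32: same idea, with sign-extended operands.
    static int32_t mulhi_s32(int32_t A, int32_t B) {
      return static_cast<int32_t>((static_cast<int64_t>(A) * B) >> 32);
    }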
def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32", B32, B32, B32, int_nvvm_mul_rn_ftz_f>;
def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32", B32, B32, B32, int_nvvm_mul_rn_f>;
@@ -1357,8 +1337,8 @@ def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32", B32, B32, B32, int_nvvm_mul24_u
// Div
//
-def INT_NVVM_DIV_APPROX_FTZ_F : F_MATH_2<"div.approx.ftz.f32", B32, B32, B32, int_nvvm_div_approx_ftz_f>;
-def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32", B32, B32, B32, int_nvvm_div_approx_f>;
+def : Pat<(int_nvvm_div_approx_ftz_f f32:$a, f32:$b), (DIV_APPROX_F32_rr $a, $b, FTZ)>;
+def : Pat<(int_nvvm_div_approx_f f32:$a, f32:$b), (DIV_APPROX_F32_rr $a, $b, NoFTZ)>;
def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32", B32, B32, B32, int_nvvm_div_rn_ftz_f>;
def INT_NVVM_DIV_RN_F : F_MATH_2<"div.rn.f32", B32, B32, B32, int_nvvm_div_rn_f>;
@@ -1663,13 +1643,13 @@ def : Pat<(int_nvvm_rsqrt_approx_d f64:$a), (RSQRT_APPROX_f64 $a, NoFTZ)>;
// 1.0f / sqrt_approx -> rsqrt_approx
let Predicates = [doRsqrtOpt] in {
- def : Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_f f32:$a)),
+ def : Pat<(fdiv fpimm_1, (int_nvvm_sqrt_approx_f f32:$a)),
(RSQRT_APPROX_f32 $a, NoFTZ)>;
- def : Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_ftz_f f32:$a)),
+ def : Pat<(fdiv fpimm_1, (int_nvvm_sqrt_approx_ftz_f f32:$a)),
(RSQRT_APPROX_f32 $a, FTZ)>;
// same for int_nvvm_sqrt_f when non-precision sqrt is requested
- def : Pat<(fdiv f32imm_1, (fsqrt_approx f32:$a)),
+ def : Pat<(fdiv fpimm_1, (fsqrt_approx f32:$a)),
(RSQRT_APPROX_f32 $a)>;
}
//
@@ -2231,7 +2211,7 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
class LDU_G<NVPTXRegClass regclass>
: NVPTXInst<(outs regclass:$result), (ins i32imm:$fromWidth, ADDR:$src),
- "ldu.global.b$fromWidth \t$result, [$src];", []>;
+ "ldu.global.b$fromWidth \t$result, [$src];">;
def LDU_GLOBAL_i16 : LDU_G<B16>;
def LDU_GLOBAL_i32 : LDU_G<B32>;
@@ -2243,13 +2223,13 @@ def LDU_GLOBAL_i64 : LDU_G<B64>;
class VLDU_G_ELE_V2<NVPTXRegClass regclass>
: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins i32imm:$fromWidth, ADDR:$src),
- "ldu.global.v2.b$fromWidth \t{{$dst1, $dst2}}, [$src];", []>;
+ "ldu.global.v2.b$fromWidth \t{{$dst1, $dst2}}, [$src];">;
class VLDU_G_ELE_V4<NVPTXRegClass regclass>
: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins i32imm:$fromWidth, ADDR:$src),
- "ldu.global.v4.b$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+ "ldu.global.v4.b$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];">;
def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<B16>;
@@ -2270,9 +2250,8 @@ def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<B32>;
class LDG_G<NVPTXRegClass regclass>
: NVPTXInst<(outs regclass:$result), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src),
- "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>;
+ "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];">;
-def LD_GLOBAL_NC_i8 : LDG_G<B16>;
def LD_GLOBAL_NC_i16 : LDG_G<B16>;
def LD_GLOBAL_NC_i32 : LDG_G<B32>;
def LD_GLOBAL_NC_i64 : LDG_G<B64>;
@@ -2283,19 +2262,19 @@ def LD_GLOBAL_NC_i64 : LDG_G<B64>;
class VLDG_G_ELE_V2<NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src),
- "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>;
+ "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];">;
class VLDG_G_ELE_V4<NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src),
- "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+ "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];">;
class VLDG_G_ELE_V8<NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
(ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src),
- "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
+ "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];">;
// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
def LD_GLOBAL_NC_v2i16 : VLDG_G_ELE_V2<B16>;
@@ -3540,20 +3519,13 @@ multiclass SULD_1D<string inst, NVPTXRegClass outtype> {
def _I : SULD_1D_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_1D_I8_CLAMP : SULD_1D<"suld.b.1d.b8.clamp", B16>;
-defm SULD_1D_I16_CLAMP : SULD_1D<"suld.b.1d.b16.clamp", B16>;
-defm SULD_1D_I32_CLAMP : SULD_1D<"suld.b.1d.b32.clamp", B32>;
-defm SULD_1D_I64_CLAMP : SULD_1D<"suld.b.1d.b64.clamp", B64>;
-
-defm SULD_1D_I8_TRAP : SULD_1D<"suld.b.1d.b8.trap", B16>;
-defm SULD_1D_I16_TRAP : SULD_1D<"suld.b.1d.b16.trap", B16>;
-defm SULD_1D_I32_TRAP : SULD_1D<"suld.b.1d.b32.trap", B32>;
-defm SULD_1D_I64_TRAP : SULD_1D<"suld.b.1d.b64.trap", B64>;
-
-defm SULD_1D_I8_ZERO : SULD_1D<"suld.b.1d.b8.zero", B16>;
-defm SULD_1D_I16_ZERO : SULD_1D<"suld.b.1d.b16.zero", B16>;
-defm SULD_1D_I32_ZERO : SULD_1D<"suld.b.1d.b32.zero", B32>;
-defm SULD_1D_I64_ZERO : SULD_1D<"suld.b.1d.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_1D_I8_ # op_upper : SULD_1D<"suld.b.1d.b8." # op, B16>;
+ defm SULD_1D_I16_ # op_upper : SULD_1D<"suld.b.1d.b16." # op, B16>;
+ defm SULD_1D_I32_ # op_upper : SULD_1D<"suld.b.1d.b32." # op, B32>;
+ defm SULD_1D_I64_ # op_upper : SULD_1D<"suld.b.1d.b64." # op, B64>;
+}
class SULD_1D_ARRAY_base<string inst, NVPTXRegClass outtype, dag surf,
list<dag> pattern = []>
@@ -3570,20 +3542,13 @@ multiclass SULD_1D_ARRAY<string inst, NVPTXRegClass outtype> {
def _I : SULD_1D_ARRAY_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_1D_ARRAY_I8_CLAMP : SULD_1D_ARRAY<"suld.b.a1d.b8.clamp", B16>;
-defm SULD_1D_ARRAY_I16_CLAMP : SULD_1D_ARRAY<"suld.b.a1d.b16.clamp", B16>;
-defm SULD_1D_ARRAY_I32_CLAMP : SULD_1D_ARRAY<"suld.b.a1d.b32.clamp", B32>;
-defm SULD_1D_ARRAY_I64_CLAMP : SULD_1D_ARRAY<"suld.b.a1d.b64.clamp", B64>;
-
-defm SULD_1D_ARRAY_I8_TRAP : SULD_1D_ARRAY<"suld.b.a1d.b8.trap", B16>;
-defm SULD_1D_ARRAY_I16_TRAP : SULD_1D_ARRAY<"suld.b.a1d.b16.trap", B16>;
-defm SULD_1D_ARRAY_I32_TRAP : SULD_1D_ARRAY<"suld.b.a1d.b32.trap", B32>;
-defm SULD_1D_ARRAY_I64_TRAP : SULD_1D_ARRAY<"suld.b.a1d.b64.trap", B64>;
-
-defm SULD_1D_ARRAY_I8_ZERO : SULD_1D_ARRAY<"suld.b.a1d.b8.zero", B16>;
-defm SULD_1D_ARRAY_I16_ZERO : SULD_1D_ARRAY<"suld.b.a1d.b16.zero", B16>;
-defm SULD_1D_ARRAY_I32_ZERO : SULD_1D_ARRAY<"suld.b.a1d.b32.zero", B32>;
-defm SULD_1D_ARRAY_I64_ZERO : SULD_1D_ARRAY<"suld.b.a1d.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_1D_ARRAY_I8_ # op_upper : SULD_1D_ARRAY<"suld.b.a1d.b8." # op, B16>;
+ defm SULD_1D_ARRAY_I16_ # op_upper : SULD_1D_ARRAY<"suld.b.a1d.b16." # op, B16>;
+ defm SULD_1D_ARRAY_I32_ # op_upper : SULD_1D_ARRAY<"suld.b.a1d.b32." # op, B32>;
+ defm SULD_1D_ARRAY_I64_ # op_upper : SULD_1D_ARRAY<"suld.b.a1d.b64." # op, B64>;
+}
class SULD_2D_base<string inst, NVPTXRegClass outtype, dag surf,
list<dag> pattern = []>
@@ -3599,20 +3564,13 @@ multiclass SULD_2D<string inst, NVPTXRegClass outtype> {
def _I : SULD_2D_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_2D_I8_CLAMP : SULD_2D<"suld.b.2d.b8.clamp", B16>;
-defm SULD_2D_I16_CLAMP : SULD_2D<"suld.b.2d.b16.clamp", B16>;
-defm SULD_2D_I32_CLAMP : SULD_2D<"suld.b.2d.b32.clamp", B32>;
-defm SULD_2D_I64_CLAMP : SULD_2D<"suld.b.2d.b64.clamp", B64>;
-
-defm SULD_2D_I8_TRAP : SULD_2D<"suld.b.2d.b8.trap", B16>;
-defm SULD_2D_I16_TRAP : SULD_2D<"suld.b.2d.b16.trap", B16>;
-defm SULD_2D_I32_TRAP : SULD_2D<"suld.b.2d.b32.trap", B32>;
-defm SULD_2D_I64_TRAP : SULD_2D<"suld.b.2d.b64.trap", B64>;
-
-defm SULD_2D_I8_ZERO : SULD_2D<"suld.b.2d.b8.zero", B16>;
-defm SULD_2D_I16_ZERO : SULD_2D<"suld.b.2d.b16.zero", B16>;
-defm SULD_2D_I32_ZERO : SULD_2D<"suld.b.2d.b32.zero", B32>;
-defm SULD_2D_I64_ZERO : SULD_2D<"suld.b.2d.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_2D_I8_ # op_upper : SULD_2D<"suld.b.2d.b8." # op, B16>;
+ defm SULD_2D_I16_ # op_upper : SULD_2D<"suld.b.2d.b16." # op, B16>;
+ defm SULD_2D_I32_ # op_upper : SULD_2D<"suld.b.2d.b32." # op, B32>;
+ defm SULD_2D_I64_ # op_upper : SULD_2D<"suld.b.2d.b64." # op, B64>;
+}
class SULD_2D_ARRAY_base<string inst, NVPTXRegClass outtype, dag surf,
list<dag> pattern = []>
@@ -3629,20 +3587,13 @@ multiclass SULD_2D_ARRAY<string inst, NVPTXRegClass outtype> {
def _I : SULD_2D_ARRAY_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_2D_ARRAY_I8_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b8.clamp", B16>;
-defm SULD_2D_ARRAY_I16_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b16.clamp", B16>;
-defm SULD_2D_ARRAY_I32_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b32.clamp", B32>;
-defm SULD_2D_ARRAY_I64_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b64.clamp", B64>;
-
-defm SULD_2D_ARRAY_I8_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b8.trap", B16>;
-defm SULD_2D_ARRAY_I16_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b16.trap", B16>;
-defm SULD_2D_ARRAY_I32_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b32.trap", B32>;
-defm SULD_2D_ARRAY_I64_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b64.trap", B64>;
-
-defm SULD_2D_ARRAY_I8_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b8.zero", B16>;
-defm SULD_2D_ARRAY_I16_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b16.zero", B16>;
-defm SULD_2D_ARRAY_I32_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b32.zero", B32>;
-defm SULD_2D_ARRAY_I64_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_2D_ARRAY_I8_ # op_upper : SULD_2D_ARRAY<"suld.b.a2d.b8." # op, B16>;
+ defm SULD_2D_ARRAY_I16_ # op_upper : SULD_2D_ARRAY<"suld.b.a2d.b16." # op, B16>;
+ defm SULD_2D_ARRAY_I32_ # op_upper : SULD_2D_ARRAY<"suld.b.a2d.b32." # op, B32>;
+ defm SULD_2D_ARRAY_I64_ # op_upper : SULD_2D_ARRAY<"suld.b.a2d.b64." # op, B64>;
+}
class SULD_3D_base<string inst, NVPTXRegClass outtype, dag surf,
list<dag> pattern = []>
@@ -3659,20 +3610,13 @@ multiclass SULD_3D<string inst, NVPTXRegClass outtype> {
def _I : SULD_3D_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_3D_I8_CLAMP : SULD_3D<"suld.b.3d.b8.clamp", B16>;
-defm SULD_3D_I16_CLAMP : SULD_3D<"suld.b.3d.b16.clamp", B16>;
-defm SULD_3D_I32_CLAMP : SULD_3D<"suld.b.3d.b32.clamp", B32>;
-defm SULD_3D_I64_CLAMP : SULD_3D<"suld.b.3d.b64.clamp", B64>;
-
-defm SULD_3D_I8_TRAP : SULD_3D<"suld.b.3d.b8.trap", B16>;
-defm SULD_3D_I16_TRAP : SULD_3D<"suld.b.3d.b16.trap", B16>;
-defm SULD_3D_I32_TRAP : SULD_3D<"suld.b.3d.b32.trap", B32>;
-defm SULD_3D_I64_TRAP : SULD_3D<"suld.b.3d.b64.trap", B64>;
-
-defm SULD_3D_I8_ZERO : SULD_3D<"suld.b.3d.b8.zero", B16>;
-defm SULD_3D_I16_ZERO : SULD_3D<"suld.b.3d.b16.zero", B16>;
-defm SULD_3D_I32_ZERO : SULD_3D<"suld.b.3d.b32.zero", B32>;
-defm SULD_3D_I64_ZERO : SULD_3D<"suld.b.3d.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_3D_I8_ # op_upper : SULD_3D<"suld.b.3d.b8." # op, B16>;
+ defm SULD_3D_I16_ # op_upper : SULD_3D<"suld.b.3d.b16." # op, B16>;
+ defm SULD_3D_I32_ # op_upper : SULD_3D<"suld.b.3d.b32." # op, B32>;
+ defm SULD_3D_I64_ # op_upper : SULD_3D<"suld.b.3d.b64." # op, B64>;
+}
}
let IsSuld = 2 in {
@@ -3692,20 +3636,13 @@ multiclass SULD_1D_V2<string inst, NVPTXRegClass outtype> {
def _I : SULD_1D_V2_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_1D_V2I8_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b8.clamp", B16>;
-defm SULD_1D_V2I16_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b16.clamp", B16>;
-defm SULD_1D_V2I32_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b32.clamp", B32>;
-defm SULD_1D_V2I64_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b64.clamp", B64>;
-
-defm SULD_1D_V2I8_TRAP : SULD_1D_V2<"suld.b.1d.v2.b8.trap", B16>;
-defm SULD_1D_V2I16_TRAP : SULD_1D_V2<"suld.b.1d.v2.b16.trap", B16>;
-defm SULD_1D_V2I32_TRAP : SULD_1D_V2<"suld.b.1d.v2.b32.trap", B32>;
-defm SULD_1D_V2I64_TRAP : SULD_1D_V2<"suld.b.1d.v2.b64.trap", B64>;
-
-defm SULD_1D_V2I8_ZERO : SULD_1D_V2<"suld.b.1d.v2.b8.zero", B16>;
-defm SULD_1D_V2I16_ZERO : SULD_1D_V2<"suld.b.1d.v2.b16.zero", B16>;
-defm SULD_1D_V2I32_ZERO : SULD_1D_V2<"suld.b.1d.v2.b32.zero", B32>;
-defm SULD_1D_V2I64_ZERO : SULD_1D_V2<"suld.b.1d.v2.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_1D_V2I8_ # op_upper : SULD_1D_V2<"suld.b.1d.v2.b8." # op, B16>;
+ defm SULD_1D_V2I16_ # op_upper : SULD_1D_V2<"suld.b.1d.v2.b16." # op, B16>;
+ defm SULD_1D_V2I32_ # op_upper : SULD_1D_V2<"suld.b.1d.v2.b32." # op, B32>;
+ defm SULD_1D_V2I64_ # op_upper : SULD_1D_V2<"suld.b.1d.v2.b64." # op, B64>;
+}
class SULD_1D_ARRAY_V2_base<string inst, NVPTXRegClass outtype, dag surf,
list<dag> pattern = []>
@@ -3722,20 +3659,13 @@ multiclass SULD_1D_ARRAY_V2<string inst, NVPTXRegClass outtype> {
def _I : SULD_1D_ARRAY_V2_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_1D_ARRAY_V2I8_CLAMP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.clamp", B16>;
-defm SULD_1D_ARRAY_V2I16_CLAMP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.clamp", B16>;
-defm SULD_1D_ARRAY_V2I32_CLAMP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.clamp", B32>;
-defm SULD_1D_ARRAY_V2I64_CLAMP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.clamp", B64>;
-
-defm SULD_1D_ARRAY_V2I8_TRAP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.trap", B16>;
-defm SULD_1D_ARRAY_V2I16_TRAP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.trap", B16>;
-defm SULD_1D_ARRAY_V2I32_TRAP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.trap", B32>;
-defm SULD_1D_ARRAY_V2I64_TRAP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.trap", B64>;
-
-defm SULD_1D_ARRAY_V2I8_ZERO : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.zero", B16>;
-defm SULD_1D_ARRAY_V2I16_ZERO : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.zero", B16>;
-defm SULD_1D_ARRAY_V2I32_ZERO : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.zero", B32>;
-defm SULD_1D_ARRAY_V2I64_ZERO : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_1D_ARRAY_V2I8_ # op_upper : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8." # op, B16>;
+ defm SULD_1D_ARRAY_V2I16_ # op_upper : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16." # op, B16>;
+ defm SULD_1D_ARRAY_V2I32_ # op_upper : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32." # op, B32>;
+ defm SULD_1D_ARRAY_V2I64_ # op_upper : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64." # op, B64>;
+}
class SULD_2D_V2_base<string inst, NVPTXRegClass outtype, dag surf,
list<dag> pattern = []>
@@ -3752,20 +3682,13 @@ multiclass SULD_2D_V2<string inst, NVPTXRegClass outtype> {
def _I : SULD_2D_V2_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_2D_V2I8_CLAMP : SULD_2D_V2<"suld.b.2d.v2.b8.clamp", B16>;
-defm SULD_2D_V2I16_CLAMP : SULD_2D_V2<"suld.b.2d.v2.b16.clamp", B16>;
-defm SULD_2D_V2I32_CLAMP : SULD_2D_V2<"suld.b.2d.v2.b32.clamp", B32>;
-defm SULD_2D_V2I64_CLAMP : SULD_2D_V2<"suld.b.2d.v2.b64.clamp", B64>;
-
-defm SULD_2D_V2I8_TRAP : SULD_2D_V2<"suld.b.2d.v2.b8.trap", B16>;
-defm SULD_2D_V2I16_TRAP : SULD_2D_V2<"suld.b.2d.v2.b16.trap", B16>;
-defm SULD_2D_V2I32_TRAP : SULD_2D_V2<"suld.b.2d.v2.b32.trap", B32>;
-defm SULD_2D_V2I64_TRAP : SULD_2D_V2<"suld.b.2d.v2.b64.trap", B64>;
-
-defm SULD_2D_V2I8_ZERO : SULD_2D_V2<"suld.b.2d.v2.b8.zero", B16>;
-defm SULD_2D_V2I16_ZERO : SULD_2D_V2<"suld.b.2d.v2.b16.zero", B16>;
-defm SULD_2D_V2I32_ZERO : SULD_2D_V2<"suld.b.2d.v2.b32.zero", B32>;
-defm SULD_2D_V2I64_ZERO : SULD_2D_V2<"suld.b.2d.v2.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_2D_V2I8_ # op_upper : SULD_2D_V2<"suld.b.2d.v2.b8." # op, B16>;
+ defm SULD_2D_V2I16_ # op_upper : SULD_2D_V2<"suld.b.2d.v2.b16." # op, B16>;
+ defm SULD_2D_V2I32_ # op_upper : SULD_2D_V2<"suld.b.2d.v2.b32." # op, B32>;
+ defm SULD_2D_V2I64_ # op_upper : SULD_2D_V2<"suld.b.2d.v2.b64." # op, B64>;
+}
class SULD_2D_ARRAY_V2_base<string inst, NVPTXRegClass outtype, dag surf,
list<dag> pattern = []>
@@ -3782,20 +3705,13 @@ multiclass SULD_2D_ARRAY_V2<string inst, NVPTXRegClass outtype> {
def _I : SULD_2D_ARRAY_V2_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_2D_ARRAY_V2I8_CLAMP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.clamp", B16>;
-defm SULD_2D_ARRAY_V2I16_CLAMP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.clamp", B16>;
-defm SULD_2D_ARRAY_V2I32_CLAMP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.clamp", B32>;
-defm SULD_2D_ARRAY_V2I64_CLAMP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.clamp", B64>;
-
-defm SULD_2D_ARRAY_V2I8_TRAP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.trap", B16>;
-defm SULD_2D_ARRAY_V2I16_TRAP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.trap", B16>;
-defm SULD_2D_ARRAY_V2I32_TRAP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.trap", B32>;
-defm SULD_2D_ARRAY_V2I64_TRAP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.trap", B64>;
-
-defm SULD_2D_ARRAY_V2I8_ZERO : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.zero", B16>;
-defm SULD_2D_ARRAY_V2I16_ZERO : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.zero", B16>;
-defm SULD_2D_ARRAY_V2I32_ZERO : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.zero", B32>;
-defm SULD_2D_ARRAY_V2I64_ZERO : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_2D_ARRAY_V2I8_ # op_upper : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8." # op, B16>;
+ defm SULD_2D_ARRAY_V2I16_ # op_upper : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16." # op, B16>;
+ defm SULD_2D_ARRAY_V2I32_ # op_upper : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32." # op, B32>;
+ defm SULD_2D_ARRAY_V2I64_ # op_upper : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64." # op, B64>;
+}
class SULD_3D_V2_base<string inst, NVPTXRegClass outtype, dag surf,
list<dag> pattern = []>
@@ -3812,20 +3728,13 @@ multiclass SULD_3D_V2<string inst, NVPTXRegClass outtype> {
def _I : SULD_3D_V2_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_3D_V2I8_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b8.clamp", B16>;
-defm SULD_3D_V2I16_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b16.clamp", B16>;
-defm SULD_3D_V2I32_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b32.clamp", B32>;
-defm SULD_3D_V2I64_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b64.clamp", B64>;
-
-defm SULD_3D_V2I8_TRAP : SULD_3D_V2<"suld.b.3d.v2.b8.trap", B16>;
-defm SULD_3D_V2I16_TRAP : SULD_3D_V2<"suld.b.3d.v2.b16.trap", B16>;
-defm SULD_3D_V2I32_TRAP : SULD_3D_V2<"suld.b.3d.v2.b32.trap", B32>;
-defm SULD_3D_V2I64_TRAP : SULD_3D_V2<"suld.b.3d.v2.b64.trap", B64>;
-
-defm SULD_3D_V2I8_ZERO : SULD_3D_V2<"suld.b.3d.v2.b8.zero", B16>;
-defm SULD_3D_V2I16_ZERO : SULD_3D_V2<"suld.b.3d.v2.b16.zero", B16>;
-defm SULD_3D_V2I32_ZERO : SULD_3D_V2<"suld.b.3d.v2.b32.zero", B32>;
-defm SULD_3D_V2I64_ZERO : SULD_3D_V2<"suld.b.3d.v2.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_3D_V2I8_ # op_upper : SULD_3D_V2<"suld.b.3d.v2.b8." # op, B16>;
+ defm SULD_3D_V2I16_ # op_upper : SULD_3D_V2<"suld.b.3d.v2.b16." # op, B16>;
+ defm SULD_3D_V2I32_ # op_upper : SULD_3D_V2<"suld.b.3d.v2.b32." # op, B32>;
+ defm SULD_3D_V2I64_ # op_upper : SULD_3D_V2<"suld.b.3d.v2.b64." # op, B64>;
+}
}
@@ -3846,17 +3755,12 @@ multiclass SULD_1D_V4<string inst, NVPTXRegClass outtype> {
def _I : SULD_1D_V4_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_1D_V4I8_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b8.clamp", B16>;
-defm SULD_1D_V4I16_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b16.clamp", B16>;
-defm SULD_1D_V4I32_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b32.clamp", B32>;
-
-defm SULD_1D_V4I8_TRAP : SULD_1D_V4<"suld.b.1d.v4.b8.trap", B16>;
-defm SULD_1D_V4I16_TRAP : SULD_1D_V4<"suld.b.1d.v4.b16.trap", B16>;
-defm SULD_1D_V4I32_TRAP : SULD_1D_V4<"suld.b.1d.v4.b32.trap", B32>;
-
-defm SULD_1D_V4I8_ZERO : SULD_1D_V4<"suld.b.1d.v4.b8.zero", B16>;
-defm SULD_1D_V4I16_ZERO : SULD_1D_V4<"suld.b.1d.v4.b16.zero", B16>;
-defm SULD_1D_V4I32_ZERO : SULD_1D_V4<"suld.b.1d.v4.b32.zero", B32>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_1D_V4I8_ # op_upper : SULD_1D_V4<"suld.b.1d.v4.b8." # op, B16>;
+ defm SULD_1D_V4I16_ # op_upper : SULD_1D_V4<"suld.b.1d.v4.b16." # op, B16>;
+ defm SULD_1D_V4I32_ # op_upper : SULD_1D_V4<"suld.b.1d.v4.b32." # op, B32>;
+}
class SULD_1D_ARRAY_V4_base<string inst, NVPTXRegClass outtype, dag surf,
list<dag> pattern = []>
@@ -3874,17 +3778,12 @@ multiclass SULD_1D_ARRAY_V4<string inst, NVPTXRegClass outtype> {
def _I : SULD_1D_ARRAY_V4_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_1D_ARRAY_V4I8_CLAMP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.clamp", B16>;
-defm SULD_1D_ARRAY_V4I16_CLAMP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.clamp", B16>;
-defm SULD_1D_ARRAY_V4I32_CLAMP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.clamp", B32>;
-
-defm SULD_1D_ARRAY_V4I8_TRAP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.trap", B16>;
-defm SULD_1D_ARRAY_V4I16_TRAP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.trap", B16>;
-defm SULD_1D_ARRAY_V4I32_TRAP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.trap", B32>;
-
-defm SULD_1D_ARRAY_V4I8_ZERO : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.zero", B16>;
-defm SULD_1D_ARRAY_V4I16_ZERO : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.zero", B16>;
-defm SULD_1D_ARRAY_V4I32_ZERO : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.zero", B32>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_1D_ARRAY_V4I8_ # op_upper : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8." # op, B16>;
+ defm SULD_1D_ARRAY_V4I16_ # op_upper : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16." # op, B16>;
+ defm SULD_1D_ARRAY_V4I32_ # op_upper : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32." # op, B32>;
+}
class SULD_2D_V4_base<string inst, NVPTXRegClass outtype, dag surf,
list<dag> pattern = []>
@@ -3901,17 +3800,12 @@ multiclass SULD_2D_V4<string inst, NVPTXRegClass outtype> {
def _I : SULD_2D_V4_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_2D_V4I8_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b8.clamp", B16>;
-defm SULD_2D_V4I16_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b16.clamp", B16>;
-defm SULD_2D_V4I32_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b32.clamp", B32>;
-
-defm SULD_2D_V4I8_TRAP : SULD_2D_V4<"suld.b.2d.v4.b8.trap", B16>;
-defm SULD_2D_V4I16_TRAP : SULD_2D_V4<"suld.b.2d.v4.b16.trap", B16>;
-defm SULD_2D_V4I32_TRAP : SULD_2D_V4<"suld.b.2d.v4.b32.trap", B32>;
-
-defm SULD_2D_V4I8_ZERO : SULD_2D_V4<"suld.b.2d.v4.b8.zero", B16>;
-defm SULD_2D_V4I16_ZERO : SULD_2D_V4<"suld.b.2d.v4.b16.zero", B16>;
-defm SULD_2D_V4I32_ZERO : SULD_2D_V4<"suld.b.2d.v4.b32.zero", B32>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_2D_V4I8_ # op_upper : SULD_2D_V4<"suld.b.2d.v4.b8." # op, B16>;
+ defm SULD_2D_V4I16_ # op_upper : SULD_2D_V4<"suld.b.2d.v4.b16." # op, B16>;
+ defm SULD_2D_V4I32_ # op_upper : SULD_2D_V4<"suld.b.2d.v4.b32." # op, B32>;
+}
class SULD_2D_ARRAY_V4_base<string inst, NVPTXRegClass outtype, dag surf,
list<dag> pattern = []>
@@ -3929,17 +3823,12 @@ multiclass SULD_2D_ARRAY_V4<string inst, NVPTXRegClass outtype> {
def _I : SULD_2D_ARRAY_V4_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_2D_ARRAY_V4I8_CLAMP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.clamp", B16>;
-defm SULD_2D_ARRAY_V4I16_CLAMP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.clamp", B16>;
-defm SULD_2D_ARRAY_V4I32_CLAMP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.clamp", B32>;
-
-defm SULD_2D_ARRAY_V4I8_TRAP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.trap", B16>;
-defm SULD_2D_ARRAY_V4I16_TRAP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.trap", B16>;
-defm SULD_2D_ARRAY_V4I32_TRAP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.trap", B32>;
-
-defm SULD_2D_ARRAY_V4I8_ZERO : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.zero", B16>;
-defm SULD_2D_ARRAY_V4I16_ZERO : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.zero", B16>;
-defm SULD_2D_ARRAY_V4I32_ZERO : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.zero", B32>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_2D_ARRAY_V4I8_ # op_upper : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8." # op, B16>;
+ defm SULD_2D_ARRAY_V4I16_ # op_upper : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16." # op, B16>;
+ defm SULD_2D_ARRAY_V4I32_ # op_upper : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32." # op, B32>;
+}
class SULD_3D_V4_base<string inst, NVPTXRegClass outtype, dag surf,
list<dag> pattern = []>
@@ -3956,17 +3845,12 @@ multiclass SULD_3D_V4<string inst, NVPTXRegClass outtype> {
def _I : SULD_3D_V4_base<inst, outtype, (ins i64imm:$s)>;
}
-defm SULD_3D_V4I8_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b8.clamp", B16>;
-defm SULD_3D_V4I16_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b16.clamp", B16>;
-defm SULD_3D_V4I32_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b32.clamp", B32>;
-
-defm SULD_3D_V4I8_TRAP : SULD_3D_V4<"suld.b.3d.v4.b8.trap", B16>;
-defm SULD_3D_V4I16_TRAP : SULD_3D_V4<"suld.b.3d.v4.b16.trap", B16>;
-defm SULD_3D_V4I32_TRAP : SULD_3D_V4<"suld.b.3d.v4.b32.trap", B32>;
-
-defm SULD_3D_V4I8_ZERO : SULD_3D_V4<"suld.b.3d.v4.b8.zero", B16>;
-defm SULD_3D_V4I16_ZERO : SULD_3D_V4<"suld.b.3d.v4.b16.zero", B16>;
-defm SULD_3D_V4I32_ZERO : SULD_3D_V4<"suld.b.3d.v4.b32.zero", B32>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SULD_3D_V4I8_ # op_upper : SULD_3D_V4<"suld.b.3d.v4.b8." # op, B16>;
+ defm SULD_3D_V4I16_ # op_upper : SULD_3D_V4<"suld.b.3d.v4.b16." # op, B16>;
+ defm SULD_3D_V4I32_ # op_upper : SULD_3D_V4<"suld.b.3d.v4.b32." # op, B32>;
+}
}
@@ -4037,20 +3921,13 @@ multiclass SUST_1D<string inst, NVPTXRegClass intype> {
def _I : SUST_1D_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_1D_I8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", B16>;
-defm SUST_B_1D_I16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", B16>;
-defm SUST_B_1D_I32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", B32>;
-defm SUST_B_1D_I64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", B64>;
-
-defm SUST_B_1D_I8_TRAP : SUST_1D<"sust.b.1d.b8.trap", B16>;
-defm SUST_B_1D_I16_TRAP : SUST_1D<"sust.b.1d.b16.trap", B16>;
-defm SUST_B_1D_I32_TRAP : SUST_1D<"sust.b.1d.b32.trap", B32>;
-defm SUST_B_1D_I64_TRAP : SUST_1D<"sust.b.1d.b64.trap", B64>;
-
-defm SUST_B_1D_I8_ZERO : SUST_1D<"sust.b.1d.b8.zero", B16>;
-defm SUST_B_1D_I16_ZERO : SUST_1D<"sust.b.1d.b16.zero", B16>;
-defm SUST_B_1D_I32_ZERO : SUST_1D<"sust.b.1d.b32.zero", B32>;
-defm SUST_B_1D_I64_ZERO : SUST_1D<"sust.b.1d.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_1D_I8_ # op_upper : SUST_1D<"sust.b.1d.b8." # op, B16>;
+ defm SUST_B_1D_I16_ # op_upper : SUST_1D<"sust.b.1d.b16." # op, B16>;
+ defm SUST_B_1D_I32_ # op_upper : SUST_1D<"sust.b.1d.b32." # op, B32>;
+ defm SUST_B_1D_I64_ # op_upper : SUST_1D<"sust.b.1d.b64." # op, B64>;
+}
defm SUST_P_1D_I8_TRAP : SUST_1D<"sust.p.1d.b8.trap", B16>;
defm SUST_P_1D_I16_TRAP : SUST_1D<"sust.p.1d.b16.trap", B16>;
@@ -4068,23 +3945,13 @@ multiclass SUST_1D_V2<string inst, NVPTXRegClass intype> {
def _I : SUST_1D_V2_base<inst, intype, (ins i64imm:$s), []>;
}
-// int_nvvm_sust_b_1d_v2i8_clamp
-
-defm SUST_B_1D_V2I8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", B16>;
-defm SUST_B_1D_V2I16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", B16>;
-defm SUST_B_1D_V2I32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", B32>;
-defm SUST_B_1D_V2I64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", B64>;
-
-defm SUST_B_1D_V2I8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", B16>;
-defm SUST_B_1D_V2I16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", B16>;
-defm SUST_B_1D_V2I32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", B32>;
-defm SUST_B_1D_V2I64_TRAP : SUST_1D_V2<"sust.b.1d.v2.b64.trap", B64>;
-
-defm SUST_B_1D_V2I8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", B16>;
-defm SUST_B_1D_V2I16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", B16>;
-defm SUST_B_1D_V2I32_ZERO : SUST_1D_V2<"sust.b.1d.v2.b32.zero", B32>;
-defm SUST_B_1D_V2I64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", B64>;
-
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_1D_V2I8_ # op_upper : SUST_1D_V2<"sust.b.1d.v2.b8." # op, B16>;
+ defm SUST_B_1D_V2I16_ # op_upper : SUST_1D_V2<"sust.b.1d.v2.b16." # op, B16>;
+ defm SUST_B_1D_V2I32_ # op_upper : SUST_1D_V2<"sust.b.1d.v2.b32." # op, B32>;
+ defm SUST_B_1D_V2I64_ # op_upper : SUST_1D_V2<"sust.b.1d.v2.b64." # op, B64>;
+}
defm SUST_P_1D_V2I8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", B16>;
defm SUST_P_1D_V2I16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", B16>;
defm SUST_P_1D_V2I32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", B32>;
@@ -4103,17 +3970,12 @@ multiclass SUST_1D_V4<string inst, NVPTXRegClass intype> {
def _I : SUST_1D_V4_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_1D_V4I8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", B16>;
-defm SUST_B_1D_V4I16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", B16>;
-defm SUST_B_1D_V4I32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", B32>;
-
-defm SUST_B_1D_V4I8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", B16>;
-defm SUST_B_1D_V4I16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", B16>;
-defm SUST_B_1D_V4I32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", B32>;
-
-defm SUST_B_1D_V4I8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", B16>;
-defm SUST_B_1D_V4I16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", B16>;
-defm SUST_B_1D_V4I32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", B32>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_1D_V4I8_ # op_upper : SUST_1D_V4<"sust.b.1d.v4.b8." # op, B16>;
+ defm SUST_B_1D_V4I16_ # op_upper : SUST_1D_V4<"sust.b.1d.v4.b16." # op, B16>;
+ defm SUST_B_1D_V4I32_ # op_upper : SUST_1D_V4<"sust.b.1d.v4.b32." # op, B32>;
+}
defm SUST_P_1D_V4I8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", B16>;
defm SUST_P_1D_V4I16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", B16>;
@@ -4131,20 +3993,13 @@ multiclass SUST_1D_ARRAY<string inst, NVPTXRegClass intype> {
def _I : SUST_1D_ARRAY_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_1D_ARRAY_I8_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b8.clamp", B16>;
-defm SUST_B_1D_ARRAY_I16_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b16.clamp", B16>;
-defm SUST_B_1D_ARRAY_I32_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b32.clamp", B32>;
-defm SUST_B_1D_ARRAY_I64_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b64.clamp", B64>;
-
-defm SUST_B_1D_ARRAY_I8_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b8.trap", B16>;
-defm SUST_B_1D_ARRAY_I16_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b16.trap", B16>;
-defm SUST_B_1D_ARRAY_I32_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b32.trap", B32>;
-defm SUST_B_1D_ARRAY_I64_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b64.trap", B64>;
-
-defm SUST_B_1D_ARRAY_I8_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b8.zero", B16>;
-defm SUST_B_1D_ARRAY_I16_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b16.zero", B16>;
-defm SUST_B_1D_ARRAY_I32_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b32.zero", B32>;
-defm SUST_B_1D_ARRAY_I64_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_1D_ARRAY_I8_ # op_upper : SUST_1D_ARRAY<"sust.b.a1d.b8." # op, B16>;
+ defm SUST_B_1D_ARRAY_I16_ # op_upper : SUST_1D_ARRAY<"sust.b.a1d.b16." # op, B16>;
+ defm SUST_B_1D_ARRAY_I32_ # op_upper : SUST_1D_ARRAY<"sust.b.a1d.b32." # op, B32>;
+ defm SUST_B_1D_ARRAY_I64_ # op_upper : SUST_1D_ARRAY<"sust.b.a1d.b64." # op, B64>;
+}
defm SUST_P_1D_ARRAY_I8_TRAP : SUST_1D_ARRAY<"sust.p.a1d.b8.trap", B16>;
defm SUST_P_1D_ARRAY_I16_TRAP : SUST_1D_ARRAY<"sust.p.a1d.b16.trap", B16>;
@@ -4164,20 +4019,13 @@ multiclass SUST_1D_ARRAY_V2<string inst, NVPTXRegClass intype> {
def _I : SUST_1D_ARRAY_V2_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_1D_ARRAY_V2I8_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.clamp", B16>;
-defm SUST_B_1D_ARRAY_V2I16_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.clamp", B16>;
-defm SUST_B_1D_ARRAY_V2I32_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.clamp", B32>;
-defm SUST_B_1D_ARRAY_V2I64_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.clamp", B64>;
-
-defm SUST_B_1D_ARRAY_V2I8_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.trap", B16>;
-defm SUST_B_1D_ARRAY_V2I16_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.trap", B16>;
-defm SUST_B_1D_ARRAY_V2I32_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.trap", B32>;
-defm SUST_B_1D_ARRAY_V2I64_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.trap", B64>;
-
-defm SUST_B_1D_ARRAY_V2I8_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.zero", B16>;
-defm SUST_B_1D_ARRAY_V2I16_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.zero", B16>;
-defm SUST_B_1D_ARRAY_V2I32_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.zero", B32>;
-defm SUST_B_1D_ARRAY_V2I64_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_1D_ARRAY_V2I8_ # op_upper : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8." # op, B16>;
+ defm SUST_B_1D_ARRAY_V2I16_ # op_upper : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16." # op, B16>;
+ defm SUST_B_1D_ARRAY_V2I32_ # op_upper : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32." # op, B32>;
+ defm SUST_B_1D_ARRAY_V2I64_ # op_upper : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64." # op, B64>;
+}
defm SUST_P_1D_ARRAY_V2I8_TRAP : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b8.trap", B16>;
defm SUST_P_1D_ARRAY_V2I16_TRAP : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b16.trap", B16>;
@@ -4197,33 +4045,16 @@ multiclass SUST_1D_ARRAY_V4<string inst, NVPTXRegClass intype> {
def _I : SUST_1D_ARRAY_V4_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_1D_ARRAY_V4I8_CLAMP
- : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.clamp", B16>;
-defm SUST_B_1D_ARRAY_V4I16_CLAMP
- : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.clamp", B16>;
-defm SUST_B_1D_ARRAY_V4I32_CLAMP
- : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.clamp", B32>;
-
-defm SUST_B_1D_ARRAY_V4I8_TRAP
- : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.trap", B16>;
-defm SUST_B_1D_ARRAY_V4I16_TRAP
- : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.trap", B16>;
-defm SUST_B_1D_ARRAY_V4I32_TRAP
- : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.trap", B32>;
-
-defm SUST_B_1D_ARRAY_V4I8_ZERO
- : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.zero", B16>;
-defm SUST_B_1D_ARRAY_V4I16_ZERO
- : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.zero", B16>;
-defm SUST_B_1D_ARRAY_V4I32_ZERO
- : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.zero", B32>;
-
-defm SUST_P_1D_ARRAY_V4I8_TRAP
- : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b8.trap", B16>;
-defm SUST_P_1D_ARRAY_V4I16_TRAP
- : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b16.trap", B16>;
-defm SUST_P_1D_ARRAY_V4I32_TRAP
- : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b32.trap", B32>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_1D_ARRAY_V4I8_ # op_upper : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8." # op, B16>;
+ defm SUST_B_1D_ARRAY_V4I16_ # op_upper : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16." # op, B16>;
+ defm SUST_B_1D_ARRAY_V4I32_ # op_upper : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32." # op, B32>;
+}
+
+defm SUST_P_1D_ARRAY_V4I8_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b8.trap", B16>;
+defm SUST_P_1D_ARRAY_V4I16_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b16.trap", B16>;
+defm SUST_P_1D_ARRAY_V4I32_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b32.trap", B32>;
class SUST_2D_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
: NVPTXInst<(outs),
@@ -4237,20 +4068,13 @@ multiclass SUST_2D<string inst, NVPTXRegClass intype> {
def _I : SUST_2D_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_2D_I8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", B16>;
-defm SUST_B_2D_I16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", B16>;
-defm SUST_B_2D_I32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", B32>;
-defm SUST_B_2D_I64_CLAMP : SUST_2D<"sust.b.2d.b64.clamp", B64>;
-
-defm SUST_B_2D_I8_TRAP : SUST_2D<"sust.b.2d.b8.trap", B16>;
-defm SUST_B_2D_I16_TRAP : SUST_2D<"sust.b.2d.b16.trap", B16>;
-defm SUST_B_2D_I32_TRAP : SUST_2D<"sust.b.2d.b32.trap", B32>;
-defm SUST_B_2D_I64_TRAP : SUST_2D<"sust.b.2d.b64.trap", B64>;
-
-defm SUST_B_2D_I8_ZERO : SUST_2D<"sust.b.2d.b8.zero", B16>;
-defm SUST_B_2D_I16_ZERO : SUST_2D<"sust.b.2d.b16.zero", B16>;
-defm SUST_B_2D_I32_ZERO : SUST_2D<"sust.b.2d.b32.zero", B32>;
-defm SUST_B_2D_I64_ZERO : SUST_2D<"sust.b.2d.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_2D_I8_ # op_upper : SUST_2D<"sust.b.2d.b8." # op, B16>;
+ defm SUST_B_2D_I16_ # op_upper : SUST_2D<"sust.b.2d.b16." # op, B16>;
+ defm SUST_B_2D_I32_ # op_upper : SUST_2D<"sust.b.2d.b32." # op, B32>;
+ defm SUST_B_2D_I64_ # op_upper : SUST_2D<"sust.b.2d.b64." # op, B64>;
+}
defm SUST_P_2D_I8_TRAP : SUST_2D<"sust.p.2d.b8.trap", B16>;
defm SUST_P_2D_I16_TRAP : SUST_2D<"sust.p.2d.b16.trap", B16>;
@@ -4270,20 +4094,13 @@ multiclass SUST_2D_V2<string inst, NVPTXRegClass intype> {
def _I : SUST_2D_V2_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_2D_V2I8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", B16>;
-defm SUST_B_2D_V2I16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", B16>;
-defm SUST_B_2D_V2I32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", B32>;
-defm SUST_B_2D_V2I64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", B64>;
-
-defm SUST_B_2D_V2I8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", B16>;
-defm SUST_B_2D_V2I16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", B16>;
-defm SUST_B_2D_V2I32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", B32>;
-defm SUST_B_2D_V2I64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", B64>;
-
-defm SUST_B_2D_V2I8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", B16>;
-defm SUST_B_2D_V2I16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", B16>;
-defm SUST_B_2D_V2I32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", B32>;
-defm SUST_B_2D_V2I64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_2D_V2I8_ # op_upper : SUST_2D_V2<"sust.b.2d.v2.b8." # op, B16>;
+ defm SUST_B_2D_V2I16_ # op_upper : SUST_2D_V2<"sust.b.2d.v2.b16." # op, B16>;
+ defm SUST_B_2D_V2I32_ # op_upper : SUST_2D_V2<"sust.b.2d.v2.b32." # op, B32>;
+ defm SUST_B_2D_V2I64_ # op_upper : SUST_2D_V2<"sust.b.2d.v2.b64." # op, B64>;
+}
defm SUST_P_2D_V2I8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", B16>;
defm SUST_P_2D_V2I16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", B16>;
@@ -4303,17 +4120,12 @@ multiclass SUST_2D_V4<string inst, NVPTXRegClass intype> {
def _I : SUST_2D_V4_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_2D_V4I8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", B16>;
-defm SUST_B_2D_V4I16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", B16>;
-defm SUST_B_2D_V4I32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", B32>;
-
-defm SUST_B_2D_V4I8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", B16>;
-defm SUST_B_2D_V4I16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", B16>;
-defm SUST_B_2D_V4I32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", B32>;
-
-defm SUST_B_2D_V4I8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", B16>;
-defm SUST_B_2D_V4I16_ZERO : SUST_2D_V4<"sust.b.2d.v4.b16.zero", B16>;
-defm SUST_B_2D_V4I32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", B32>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_2D_V4I8_ # op_upper : SUST_2D_V4<"sust.b.2d.v4.b8." # op, B16>;
+ defm SUST_B_2D_V4I16_ # op_upper : SUST_2D_V4<"sust.b.2d.v4.b16." # op, B16>;
+ defm SUST_B_2D_V4I32_ # op_upper : SUST_2D_V4<"sust.b.2d.v4.b32." # op, B32>;
+}
defm SUST_P_2D_V4I8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", B16>;
defm SUST_P_2D_V4I16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", B16>;
@@ -4333,20 +4145,13 @@ multiclass SUST_2D_ARRAY<string inst, NVPTXRegClass intype> {
def _I : SUST_2D_ARRAY_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_2D_ARRAY_I8_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b8.clamp", B16>;
-defm SUST_B_2D_ARRAY_I16_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b16.clamp", B16>;
-defm SUST_B_2D_ARRAY_I32_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b32.clamp", B32>;
-defm SUST_B_2D_ARRAY_I64_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b64.clamp", B64>;
-
-defm SUST_B_2D_ARRAY_I8_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b8.trap", B16>;
-defm SUST_B_2D_ARRAY_I16_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b16.trap", B16>;
-defm SUST_B_2D_ARRAY_I32_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b32.trap", B32>;
-defm SUST_B_2D_ARRAY_I64_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b64.trap", B64>;
-
-defm SUST_B_2D_ARRAY_I8_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b8.zero", B16>;
-defm SUST_B_2D_ARRAY_I16_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b16.zero", B16>;
-defm SUST_B_2D_ARRAY_I32_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b32.zero", B32>;
-defm SUST_B_2D_ARRAY_I64_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_2D_ARRAY_I8_ # op_upper : SUST_2D_ARRAY<"sust.b.a2d.b8." # op, B16>;
+ defm SUST_B_2D_ARRAY_I16_ # op_upper : SUST_2D_ARRAY<"sust.b.a2d.b16." # op, B16>;
+ defm SUST_B_2D_ARRAY_I32_ # op_upper : SUST_2D_ARRAY<"sust.b.a2d.b32." # op, B32>;
+ defm SUST_B_2D_ARRAY_I64_ # op_upper : SUST_2D_ARRAY<"sust.b.a2d.b64." # op, B64>;
+}
defm SUST_P_2D_ARRAY_I8_TRAP : SUST_2D_ARRAY<"sust.p.a2d.b8.trap", B16>;
defm SUST_P_2D_ARRAY_I16_TRAP : SUST_2D_ARRAY<"sust.p.a2d.b16.trap", B16>;
@@ -4366,20 +4171,13 @@ multiclass SUST_2D_ARRAY_V2<string inst, NVPTXRegClass intype> {
def _I : SUST_2D_ARRAY_V2_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_2D_ARRAY_V2I8_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.clamp", B16>;
-defm SUST_B_2D_ARRAY_V2I16_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.clamp", B16>;
-defm SUST_B_2D_ARRAY_V2I32_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.clamp", B32>;
-defm SUST_B_2D_ARRAY_V2I64_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.clamp", B64>;
-
-defm SUST_B_2D_ARRAY_V2I8_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.trap", B16>;
-defm SUST_B_2D_ARRAY_V2I16_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.trap", B16>;
-defm SUST_B_2D_ARRAY_V2I32_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.trap", B32>;
-defm SUST_B_2D_ARRAY_V2I64_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.trap", B64>;
-
-defm SUST_B_2D_ARRAY_V2I8_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.zero", B16>;
-defm SUST_B_2D_ARRAY_V2I16_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.zero", B16>;
-defm SUST_B_2D_ARRAY_V2I32_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.zero", B32>;
-defm SUST_B_2D_ARRAY_V2I64_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.zero", B64>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_2D_ARRAY_V2I8_ # op_upper : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8." # op, B16>;
+ defm SUST_B_2D_ARRAY_V2I16_ # op_upper : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16." # op, B16>;
+ defm SUST_B_2D_ARRAY_V2I32_ # op_upper : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32." # op, B32>;
+ defm SUST_B_2D_ARRAY_V2I64_ # op_upper : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64." # op, B64>;
+}
defm SUST_P_2D_ARRAY_V2I8_TRAP : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b8.trap", B16>;
defm SUST_P_2D_ARRAY_V2I16_TRAP : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b16.trap", B16>;
@@ -4399,17 +4197,12 @@ multiclass SUST_2D_ARRAY_V4<string inst, NVPTXRegClass intype> {
def _I : SUST_2D_ARRAY_V4_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_2D_ARRAY_V4I8_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.clamp", B16>;
-defm SUST_B_2D_ARRAY_V4I16_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.clamp", B16>;
-defm SUST_B_2D_ARRAY_V4I32_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.clamp", B32>;
-
-defm SUST_B_2D_ARRAY_V4I8_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.trap", B16>;
-defm SUST_B_2D_ARRAY_V4I16_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.trap", B16>;
-defm SUST_B_2D_ARRAY_V4I32_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.trap", B32>;
-
-defm SUST_B_2D_ARRAY_V4I8_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.zero", B16>;
-defm SUST_B_2D_ARRAY_V4I16_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.zero", B16>;
-defm SUST_B_2D_ARRAY_V4I32_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.zero", B32>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_2D_ARRAY_V4I8_ # op_upper : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8." # op, B16>;
+ defm SUST_B_2D_ARRAY_V4I16_ # op_upper : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16." # op, B16>;
+ defm SUST_B_2D_ARRAY_V4I32_ # op_upper : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32." # op, B32>;
+}
defm SUST_P_2D_ARRAY_V4I8_TRAP : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b8.trap", B16>;
defm SUST_P_2D_ARRAY_V4I16_TRAP : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b16.trap", B16>;
@@ -4429,21 +4222,13 @@ multiclass SUST_3D<string inst, NVPTXRegClass intype> {
def _I : SUST_3D_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_3D_I8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", B16>;
-defm SUST_B_3D_I16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", B16>;
-defm SUST_B_3D_I32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", B32>;
-defm SUST_B_3D_I64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", B64>;
-
-defm SUST_B_3D_I8_TRAP : SUST_3D<"sust.b.3d.b8.trap", B16>;
-defm SUST_B_3D_I16_TRAP : SUST_3D<"sust.b.3d.b16.trap", B16>;
-defm SUST_B_3D_I32_TRAP : SUST_3D<"sust.b.3d.b32.trap", B32>;
-defm SUST_B_3D_I64_TRAP : SUST_3D<"sust.b.3d.b64.trap", B64>;
-
-defm SUST_B_3D_I8_ZERO : SUST_3D<"sust.b.3d.b8.zero", B16>;
-defm SUST_B_3D_I16_ZERO : SUST_3D<"sust.b.3d.b16.zero", B16>;
-defm SUST_B_3D_I32_ZERO : SUST_3D<"sust.b.3d.b32.zero", B32>;
-defm SUST_B_3D_I64_ZERO : SUST_3D<"sust.b.3d.b64.zero", B64>;
-
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_3D_I8_ # op_upper : SUST_3D<"sust.b.3d.b8." # op, B16>;
+ defm SUST_B_3D_I16_ # op_upper : SUST_3D<"sust.b.3d.b16." # op, B16>;
+ defm SUST_B_3D_I32_ # op_upper : SUST_3D<"sust.b.3d.b32." # op, B32>;
+ defm SUST_B_3D_I64_ # op_upper : SUST_3D<"sust.b.3d.b64." # op, B64>;
+}
defm SUST_P_3D_I8_TRAP : SUST_3D<"sust.p.3d.b8.trap", B16>;
defm SUST_P_3D_I16_TRAP : SUST_3D<"sust.p.3d.b16.trap", B16>;
defm SUST_P_3D_I32_TRAP : SUST_3D<"sust.p.3d.b32.trap", B32>;
@@ -4462,21 +4247,13 @@ multiclass SUST_3D_V2<string inst, NVPTXRegClass intype> {
def _I : SUST_3D_V2_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_3D_V2I8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", B16>;
-defm SUST_B_3D_V2I16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", B16>;
-defm SUST_B_3D_V2I32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", B32>;
-defm SUST_B_3D_V2I64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", B64>;
-
-defm SUST_B_3D_V2I8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", B16>;
-defm SUST_B_3D_V2I16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", B16>;
-defm SUST_B_3D_V2I32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", B32>;
-defm SUST_B_3D_V2I64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", B64>;
-
-defm SUST_B_3D_V2I8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", B16>;
-defm SUST_B_3D_V2I16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", B16>;
-defm SUST_B_3D_V2I32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", B32>;
-defm SUST_B_3D_V2I64_ZERO : SUST_3D_V2<"sust.b.3d.v2.b64.zero", B64>;
-
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_3D_V2I8_ # op_upper : SUST_3D_V2<"sust.b.3d.v2.b8." # op, B16>;
+ defm SUST_B_3D_V2I16_ # op_upper : SUST_3D_V2<"sust.b.3d.v2.b16." # op, B16>;
+ defm SUST_B_3D_V2I32_ # op_upper : SUST_3D_V2<"sust.b.3d.v2.b32." # op, B32>;
+ defm SUST_B_3D_V2I64_ # op_upper : SUST_3D_V2<"sust.b.3d.v2.b64." # op, B64>;
+}
defm SUST_P_3D_V2I8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", B16>;
defm SUST_P_3D_V2I16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", B16>;
defm SUST_P_3D_V2I32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", B32>;
@@ -4495,17 +4272,12 @@ multiclass SUST_3D_V4<string inst, NVPTXRegClass intype> {
def _I : SUST_3D_V4_base<inst, intype, (ins i64imm:$s), []>;
}
-defm SUST_B_3D_V4I8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", B16>;
-defm SUST_B_3D_V4I16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", B16>;
-defm SUST_B_3D_V4I32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", B32>;
-
-defm SUST_B_3D_V4I8_TRAP : SUST_3D_V4<"sust.b.3d.v4.b8.trap", B16>;
-defm SUST_B_3D_V4I16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", B16>;
-defm SUST_B_3D_V4I32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", B32>;
-
-defm SUST_B_3D_V4I8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", B16>;
-defm SUST_B_3D_V4I16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", B16>;
-defm SUST_B_3D_V4I32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", B32>;
+foreach op = ["clamp", "trap", "zero"] in {
+ defvar op_upper = !toupper(op);
+ defm SUST_B_3D_V4I8_ # op_upper : SUST_3D_V4<"sust.b.3d.v4.b8." # op, B16>;
+ defm SUST_B_3D_V4I16_ # op_upper : SUST_3D_V4<"sust.b.3d.v4.b16." # op, B16>;
+ defm SUST_B_3D_V4I32_ # op_upper : SUST_3D_V4<"sust.b.3d.v4.b32." # op, B32>;
+}
defm SUST_P_3D_V4I8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", B16>;
defm SUST_P_3D_V4I16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", B16>;
@@ -5122,27 +4894,23 @@ defm INT_SET_MAXNREG_DEC : SET_MAXNREG<"dec", int_nvvm_setmaxnreg_dec_sync_align
//
// WGMMA fence instructions
//
-let isConvergent = true in {
-def INT_NVVM_WGMMA_FENCE_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins), "wgmma.fence.sync.aligned",
- [(int_nvvm_wgmma_fence_sync_aligned)]>, Requires<[hasSM90a, hasPTX<80>]>;
+let isConvergent = true, Predicates = [hasSM90a, hasPTX<80>] in {
+ def WGMMA_FENCE_SYNC_ALIGNED : NullaryInst<"wgmma.fence.sync.aligned", int_nvvm_wgmma_fence_sync_aligned>;
-def INT_NVVM_WGMMA_COMMIT_GROUP_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins), "wgmma.commit_group.sync.aligned",
- [(int_nvvm_wgmma_commit_group_sync_aligned)]>, Requires<[hasSM90a, hasPTX<80>]>;
+ def WGMMA_COMMIT_GROUP_SYNC_ALIGNED : NullaryInst<"wgmma.commit_group.sync.aligned", int_nvvm_wgmma_commit_group_sync_aligned>;
-def INT_NVVM_WGMMA_WAIT_GROUP_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins i64imm:$n), "wgmma.wait_group.sync.aligned",
- [(int_nvvm_wgmma_wait_group_sync_aligned timm:$n)]>, Requires<[hasSM90a, hasPTX<80>]>;
-} // isConvergent = true
+ def WGMMA_WAIT_GROUP_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins i64imm:$n), "wgmma.wait_group.sync.aligned",
+ [(int_nvvm_wgmma_wait_group_sync_aligned timm:$n)]>;
+}
let Predicates = [hasSM<90>, hasPTX<78>] in {
def GRIDDEPCONTROL_LAUNCH_DEPENDENTS :
- BasicNVPTXInst<(outs), (ins), "griddepcontrol.launch_dependents",
- [(int_nvvm_griddepcontrol_launch_dependents)]>;
+ NullaryInst<"griddepcontrol.launch_dependents", int_nvvm_griddepcontrol_launch_dependents>;
def GRIDDEPCONTROL_WAIT :
- BasicNVPTXInst<(outs), (ins), "griddepcontrol.wait",
- [(int_nvvm_griddepcontrol_wait)]>;
+ NullaryInst<"griddepcontrol.wait", int_nvvm_griddepcontrol_wait>;
}
-def INT_EXIT : BasicNVPTXInst<(outs), (ins), "exit", [(int_nvvm_exit)]>;
+def EXIT : NullaryInst<"exit", int_nvvm_exit>;
// Tcgen05 intrinsics
let isConvergent = true, Predicates = [hasTcgen05Instructions] in {
@@ -5170,9 +4938,7 @@ defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1
defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>;
multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> {
- def "" : BasicNVPTXInst<(outs), (ins),
- "tcgen05.relinquish_alloc_permit.cta_group::" # num # ".sync.aligned",
- [(Intr)]>;
+ def "" : NullaryInst<"tcgen05.relinquish_alloc_permit.cta_group::" # num # ".sync.aligned", Intr>;
}
defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>;
defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>;
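A note on the TableGen change above: the hand-written CLAMP/TRAP/ZERO defm blocks are collapsed into foreach loops that derive both the record-name suffix (via defvar and !toupper) and the PTX mnemonic suffix (via the '#' string-paste operator) from the same loop variable. The following is a minimal standalone sketch of that pattern, with a hypothetical Inst class and SUST_EXAMPLE multiclass that are not part of this patch; it can be checked on its own with llvm-tblgen.

// sketch.td -- illustrates only the foreach/defvar/!toupper pattern.
class Inst<string asm> {
  string AsmString = asm;
}
multiclass SUST_EXAMPLE<string inst> {
  def _R : Inst<inst # " [reg]">;   // register-addressed variant
  def _I : Inst<inst # " [imm]">;   // immediate-addressed variant
}
foreach op = ["clamp", "trap", "zero"] in {
  defvar op_upper = !toupper(op);   // "CLAMP", "TRAP", "ZERO"
  // Each iteration defines SUST_EXAMPLE_I8_<OP> with the matching
  // "sust.example.b8.<op>" asm string, mirroring the defm lines above.
  defm SUST_EXAMPLE_I8_ # op_upper : SUST_EXAMPLE<"sust.example.b8." # op>;
}

Each loop iteration expands to the same records as one of the deleted per-suffix blocks, so the main review point is that the generated names and mnemonics match the old hand-written ones.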
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index f4362fe..e2bbe57 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -412,6 +412,22 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
}
}
+// Create a call to the nvvm_internal_addrspace_wrap intrinsic and set the
+// alignment of the return value based on the alignment of the argument.
+static CallInst *createNVVMInternalAddrspaceWrap(IRBuilder<> &IRB,
+ Argument &Arg) {
+ CallInst *ArgInParam =
+ IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
+ {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},
+ &Arg, {}, Arg.getName() + ".param");
+
+ if (MaybeAlign ParamAlign = Arg.getParamAlign())
+ ArgInParam->addRetAttr(
+ Attribute::getWithAlignment(ArgInParam->getContext(), *ParamAlign));
+
+ return ArgInParam;
+}
+
namespace {
struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
using Base = PtrUseVisitor<ArgUseChecker>;
@@ -515,10 +531,7 @@ void copyByValParam(Function &F, Argument &Arg) {
Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType)));
Arg.replaceAllUsesWith(AllocA);
- Value *ArgInParam =
- IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
- {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},
- &Arg, {}, Arg.getName());
+ CallInst *ArgInParam = createNVVMInternalAddrspaceWrap(IRB, Arg);
// Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
// addrspacecast preserves alignment. Since params are constant, this load
@@ -549,9 +562,7 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
SmallVector<Use *, 16> UsesToUpdate(llvm::make_pointer_range(Arg->uses()));
IRBuilder<> IRB(&*FirstInst);
- Value *ArgInParamAS = IRB.CreateIntrinsic(
- Intrinsic::nvvm_internal_addrspace_wrap,
- {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, {Arg});
+ CallInst *ArgInParamAS = createNVVMInternalAddrspaceWrap(IRB, *Arg);
for (Use *U : UsesToUpdate)
convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
@@ -581,10 +592,7 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
// argument already in the param address space, we need to use the noop
// intrinsic, this had the added benefit of preventing other optimizations
// from folding away this pair of addrspacecasts.
- auto *ParamSpaceArg =
- IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
- {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()},
- Arg, {}, Arg->getName() + ".param");
+ auto *ParamSpaceArg = createNVVMInternalAddrspaceWrap(IRB, *Arg);
// Cast param address to generic address space.
Value *GenericArg = IRB.CreateAddrSpaceCast(
diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp
index 46aa27e..c8e576f 100644
--- a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp
+++ b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp
@@ -93,7 +93,7 @@ static bool clobbersCTR(const MachineInstr &MI) {
static bool verifyCTRBranch(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I) {
MachineBasicBlock::iterator BI = I;
- SmallSet<MachineBasicBlock *, 16> Visited;
+ SmallPtrSet<MachineBasicBlock *, 16> Visited;
SmallVector<MachineBasicBlock *, 8> Preds;
bool CheckPreds;
diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.h b/llvm/lib/Target/PowerPC/PPCCallingConv.h
index ab61472..9c47142 100644
--- a/llvm/lib/Target/PowerPC/PPCCallingConv.h
+++ b/llvm/lib/Target/PowerPC/PPCCallingConv.h
@@ -21,28 +21,29 @@ namespace llvm {
bool RetCC_PPC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool RetCC_PPC64_ELF_FIS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool RetCC_PPC_Cold(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_PPC32_SVR4(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_PPC64_ELF(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_PPC64_ELF_FIS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_PPC32_SVR4_ByVal(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_PPC32_SVR4_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
} // End llvm namespace
diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index e92e00f..0b68ba1 100644
--- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -1374,7 +1374,10 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value *> &Args,
unsigned LinkageSize = Subtarget->getFrameLowering()->getLinkageSize();
CCInfo.AllocateStack(LinkageSize, Align(8));
- CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
+ SmallVector<Type *, 16> ArgTys;
+ for (Value *Arg : Args)
+ ArgTys.push_back(Arg->getType());
+ CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, ArgTys, CC_PPC64_ELF_FIS);
// Bail out if we can't handle any of the arguments.
for (const CCValAssign &VA : ArgLocs) {
@@ -1487,7 +1490,7 @@ bool PPCFastISel::finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumByte
if (RetVT != MVT::isVoid) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
- CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+ CCInfo.AnalyzeCallResult(RetVT, CLI.RetTy, RetCC_PPC64_ELF_FIS);
CCValAssign &VA = RVLocs[0];
assert(RVLocs.size() == 1 && "No support for multi-reg return values!");
assert(VA.isRegLoc() && "Can only return in registers!");
@@ -1573,7 +1576,7 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) {
RetVT != MVT::f64) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, *Context);
- CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+ CCInfo.AnalyzeCallResult(RetVT, RetTy, RetCC_PPC64_ELF_FIS);
if (RVLocs.size() > 1)
return false;
}
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index c0860fc..2ad3ed2 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -2078,8 +2078,10 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
// tail call might not be in the new RestoreBlock, so real branch instruction
// won't be generated by emitEpilogue(), because shrink-wrap has chosen new
// RestoreBlock. So we handle this case here.
- if (MFI.getSavePoint() && MFI.hasTailCall()) {
- MachineBasicBlock *RestoreBlock = MFI.getRestorePoint();
+ if (!MFI.getSavePoints().empty() && MFI.hasTailCall()) {
+ assert(MFI.getRestorePoints().size() < 2 &&
+ "MFI can't contain multiple restore points!");
+ MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front();
for (MachineBasicBlock &MBB : MF) {
if (MBB.isReturnBlock() && (&MBB) != RestoreBlock)
createTailCallBranchInstr(MBB);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2698bd6..652edd4 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1787,11 +1787,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
case PPCISD::PADDI_DTPREL:
return "PPCISD::PADDI_DTPREL";
- case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
- case PPCISD::SC: return "PPCISD::SC";
- case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB";
- case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE";
- case PPCISD::RFEBB: return "PPCISD::RFEBB";
+ case PPCISD::VADD_SPLAT:
+ return "PPCISD::VADD_SPLAT";
case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
@@ -4051,18 +4048,13 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
-
- Entry.Ty = IntPtrTy;
- Entry.Node = Trmp; Args.push_back(Entry);
-
+ Args.emplace_back(Trmp, IntPtrTy);
// TrampSize == (isPPC64 ? 48 : 40);
- Entry.Node =
- DAG.getConstant(isPPC64 ? 48 : 40, dl, Subtarget.getScalarIntVT());
- Args.push_back(Entry);
-
- Entry.Node = FPtr; Args.push_back(Entry);
- Entry.Node = Nest; Args.push_back(Entry);
+ Args.emplace_back(
+ DAG.getConstant(isPPC64 ? 48 : 40, dl, Subtarget.getScalarIntVT()),
+ IntPtrTy);
+ Args.emplace_back(FPtr, IntPtrTy);
+ Args.emplace_back(Nest, IntPtrTy);
// Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -6091,10 +6083,10 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
if (!ArgFlags.isVarArg()) {
Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
- CCInfo);
+ Outs[i].OrigTy, CCInfo);
} else {
Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
- ArgFlags, CCInfo);
+ ArgFlags, Outs[i].OrigTy, CCInfo);
}
if (Result) {
@@ -6905,7 +6897,7 @@ static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) {
static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State) {
+ Type *OrigTy, CCState &State) {
const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
State.getMachineFunction().getSubtarget());
const bool IsPPC64 = Subtarget.isPPC64();
@@ -14822,9 +14814,9 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
SDValue Chain = LD->getChain();
EVT VT = LD->getMemoryVT();
- SmallSet<SDNode *, 16> LoadRoots;
+ SmallPtrSet<SDNode *, 16> LoadRoots;
SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
- SmallSet<SDNode *, 16> Visited;
+ SmallPtrSet<SDNode *, 16> Visited;
// First, search up the chain, branching to follow all token-factor operands.
// If we find a consecutive load, then we're done, otherwise, record all
@@ -19553,12 +19545,10 @@ SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op,
DAG.getExternalSymbol(LibCallName, TLI.getPointerTy(DAG.getDataLayout()));
bool SignExtend = TLI.shouldSignExtendTypeInLibCall(RetTy, false);
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
for (const SDValue &N : Op->op_values()) {
EVT ArgVT = N.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- Entry.Node = N;
- Entry.Ty = ArgTy;
+ TargetLowering::ArgListEntry Entry(N, ArgTy);
Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, SignExtend);
Entry.IsZExt = !Entry.IsSExt;
Args.push_back(Entry);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 9755f0e..5e0d6bf 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -430,20 +430,6 @@ namespace llvm {
/// optimizations due to constant folding.
VADD_SPLAT,
- /// CHAIN = SC CHAIN, Imm128 - System call. The 7-bit unsigned
- /// operand identifies the operating system entry point.
- SC,
-
- /// CHAIN = CLRBHRB CHAIN - Clear branch history rolling buffer.
- CLRBHRB,
-
- /// GPRC, CHAIN = MFBHRBE CHAIN, Entry, Dummy - Move from branch
- /// history rolling buffer entry.
- MFBHRBE,
-
- /// CHAIN = RFEBB CHAIN, State - Return from event-based branch.
- RFEBB,
-
/// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little
/// endian. Maps to an xxswapd instruction that corrects an lxvd2x
/// or stxvd2x instruction. The chain is necessary because the
diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
index 24287a9..79fe12e 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -1630,9 +1630,11 @@ def BCDCTSQ_rec : VX_VT5_EO5_VB5_XO9_o <0, 385, "bcdctsq.", []>;
// Decimal Copy-Sign/Set-Sign
let Defs = [CR6] in
-def BCDCPSGN_rec : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", []>;
+def BCDCPSGN_rec : VX1_VT5_VA5_VB5<833, "bcdcpsgn.",
+ [(set v16i8:$VD, (int_ppc_bcdcopysign v16i8:$VA, v16i8:$VB))]>;
-def BCDSETSGN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>;
+def BCDSETSGN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.",
+ [(set v16i8:$VD, (int_ppc_bcdsetsign v16i8:$VB, i32:$PS))]>;
// Decimal Shift/Unsigned-Shift/Shift-and-Round
def BCDS_rec : VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 99ef89a..c2f91ce 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -365,16 +365,6 @@ def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP",
SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPSideEffect]>;
-def SDT_PPCsc : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def PPCsc : SDNode<"PPCISD::SC", SDT_PPCsc,
- [SDNPHasChain, SDNPSideEffect]>;
-
-def PPCclrbhrb : SDNode<"PPCISD::CLRBHRB", SDTNone,
- [SDNPHasChain, SDNPSideEffect]>;
-def PPCmfbhrbe : SDNode<"PPCISD::MFBHRBE", SDTIntBinOp, [SDNPHasChain]>;
-def PPCrfebb : SDNode<"PPCISD::RFEBB", SDT_PPCsc,
- [SDNPHasChain, SDNPSideEffect]>;
-
def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
def PPCvcmp_rec : SDNode<"PPCISD::VCMP_rec", SDT_PPCvcmp, [SDNPOutGlue]>;
@@ -1673,7 +1663,7 @@ let isBranch = 1, isTerminator = 1, Size = 0 in {
// System call.
let PPC970_Unit = 7 in {
def SC : SCForm<17, 1, 0, (outs), (ins i32imm:$LEV),
- "sc $LEV", IIC_BrB, [(PPCsc (i32 imm:$LEV))]>;
+ "sc $LEV", IIC_BrB, []>;
}
// We mark SCV as having no scheduling model since it is only meant to be used
@@ -1685,21 +1675,14 @@ let Predicates = [IsISA3_0], hasNoSchedulingInfo = 1 in {
}
// Branch history rolling buffer.
-def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB,
- [(PPCclrbhrb)]>,
+def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB, []>,
PPC970_DGroup_Single;
-// The $dmy argument used for MFBHRBE is not needed; however, including
-// it avoids automatic generation of PPCFastISel::fastEmit_i(), which
-// interferes with necessary special handling (see PPCFastISel.cpp).
-def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$RT),
- (ins u10imm:$imm, u10imm:$dmy),
- "mfbhrbe $RT, $imm", IIC_BrB,
- [(set i32:$RT,
- (PPCmfbhrbe imm:$imm, imm:$dmy))]>,
+
+def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$RT), (ins u10imm:$imm),
+ "mfbhrbe $RT, $imm", IIC_BrB, []>,
PPC970_DGroup_First;
-def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$S), "rfebb $S",
- IIC_BrB, [(PPCrfebb (i32 imm:$S))]>,
+def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$S), "rfebb $S", IIC_BrB, []>,
PPC970_DGroup_Single;
def : InstAlias<"rfebb", (RFEBB 1)>;
diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index 709d7e7..adf9436 100644
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -264,9 +264,8 @@ namespace {
bool prepareBasesForCommoningChains(Bucket &BucketChain);
/// Rewrite load/store according to the common chains.
- bool
- rewriteLoadStoresForCommoningChains(Loop *L, Bucket &Bucket,
- SmallSet<BasicBlock *, 16> &BBChanged);
+ bool rewriteLoadStoresForCommoningChains(
+ Loop *L, Bucket &Bucket, SmallPtrSet<BasicBlock *, 16> &BBChanged);
/// Collect condition matched(\p isValidCandidate() returns true)
/// candidates in Loop \p L.
@@ -309,7 +308,7 @@ namespace {
/// Rewrite load/store instructions in \p BucketChain according to
/// preparation.
bool rewriteLoadStores(Loop *L, Bucket &BucketChain,
- SmallSet<BasicBlock *, 16> &BBChanged,
+ SmallPtrSet<BasicBlock *, 16> &BBChanged,
PrepForm Form);
/// Rewrite for the base load/store of a chain.
@@ -523,7 +522,7 @@ bool PPCLoopInstrFormPrep::chainCommoning(Loop *L,
if (Buckets.empty())
return MadeChange;
- SmallSet<BasicBlock *, 16> BBChanged;
+ SmallPtrSet<BasicBlock *, 16> BBChanged;
for (auto &Bucket : Buckets) {
if (prepareBasesForCommoningChains(Bucket))
@@ -537,7 +536,7 @@ bool PPCLoopInstrFormPrep::chainCommoning(Loop *L,
}
bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains(
- Loop *L, Bucket &Bucket, SmallSet<BasicBlock *, 16> &BBChanged) {
+ Loop *L, Bucket &Bucket, SmallPtrSet<BasicBlock *, 16> &BBChanged) {
bool MadeChange = false;
assert(Bucket.Elements.size() ==
@@ -1006,7 +1005,7 @@ bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) {
}
bool PPCLoopInstrFormPrep::rewriteLoadStores(
- Loop *L, Bucket &BucketChain, SmallSet<BasicBlock *, 16> &BBChanged,
+ Loop *L, Bucket &BucketChain, SmallPtrSet<BasicBlock *, 16> &BBChanged,
PrepForm Form) {
bool MadeChange = false;
@@ -1089,7 +1088,7 @@ bool PPCLoopInstrFormPrep::updateFormPrep(Loop *L,
bool MadeChange = false;
if (Buckets.empty())
return MadeChange;
- SmallSet<BasicBlock *, 16> BBChanged;
+ SmallPtrSet<BasicBlock *, 16> BBChanged;
for (auto &Bucket : Buckets)
// The base address of each bucket is transformed into a phi and the others
// are rewritten based on new base.
@@ -1110,7 +1109,7 @@ bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L,
if (Buckets.empty())
return MadeChange;
- SmallSet<BasicBlock *, 16> BBChanged;
+ SmallPtrSet<BasicBlock *, 16> BBChanged;
for (auto &Bucket : Buckets) {
if (Bucket.Elements.size() < DispFormPrepMinThreshold)
continue;
diff --git a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
index 0ffd35d..74bce43 100644
--- a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
+++ b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
@@ -248,6 +248,10 @@ static bool splitMBB(BlockSplitInfo &BSI) {
}
addIncomingValuesToPHIs(NewBRTarget, ThisMBB, NewMBB, MRI);
+ // Propagate ThisMBB's call frame size to the newly created basic block.
+ // See https://reviews.llvm.org/D156113.
+ NewMBB->setCallFrameSize(TII->getCallFrameSizeAt(ThisMBB->back()));
+
LLVM_DEBUG(dbgs() << "After splitting, ThisMBB:\n"; ThisMBB->dump());
LLVM_DEBUG(dbgs() << "NewMBB:\n"; NewMBB->dump());
LLVM_DEBUG(dbgs() << "New branch-to block:\n"; NewBRTarget->dump());
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 76dca47..f123040 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1102,13 +1102,20 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
SpillsKnownBit = true;
break;
default:
+ // When spilling a CR bit, the super register may not be explicitly defined
+ // (i.e. it can be defined by a CR-logical that only defines the subreg) so
+ // we state that the CR field is undef. Also, in order to preserve the kill
+ // flag on the CR bit, we add it as an implicit use.
+
// On Power10, we can use SETNBC to spill all CR bits. SETNBC will set all
// bits (specifically, it produces a -1 if the CR bit is set). Ultimately,
// the bit that is of importance to us is bit 32 (bit 0 of a 32-bit
// register), and SETNBC will set this.
if (Subtarget.isISA3_1()) {
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETNBC8 : PPC::SETNBC), Reg)
- .addReg(SrcReg, RegState::Undef);
+ .addReg(SrcReg, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit |
+ getKillRegState(MI.getOperand(0).isKill()));
break;
}
@@ -1122,16 +1129,14 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
SrcReg == PPC::CR4LT || SrcReg == PPC::CR5LT ||
SrcReg == PPC::CR6LT || SrcReg == PPC::CR7LT) {
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETB8 : PPC::SETB), Reg)
- .addReg(getCRFromCRBit(SrcReg), RegState::Undef);
+ .addReg(getCRFromCRBit(SrcReg), RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit |
+ getKillRegState(MI.getOperand(0).isKill()));
break;
}
}
// We need to move the CR field that contains the CR bit we are spilling.
- // The super register may not be explicitly defined (i.e. it can be defined
- // by a CR-logical that only defines the subreg) so we state that the CR
- // field is undef. Also, in order to preserve the kill flag on the CR bit,
- // we add it as an implicit use.
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
.addReg(getCRFromCRBit(SrcReg), RegState::Undef)
.addReg(SrcReg,
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index d71c42c..d37ae2f 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -903,6 +903,7 @@ public:
VK == RISCV::S_QC_ABS20;
}
+ bool isSImm8Unsigned() const { return isSImm<8>() || isUImm<8>(); }
bool isSImm10Unsigned() const { return isSImm<10>() || isUImm<10>(); }
bool isUImm20LUI() const {
@@ -1199,6 +1200,14 @@ public:
addExpr(Inst, getImm(), isRV64Imm());
}
+ void addSImm8UnsignedOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ int64_t Imm;
+ [[maybe_unused]] bool IsConstant = evaluateConstantImm(getImm(), Imm);
+ assert(IsConstant);
+ Inst.addOperand(MCOperand::createImm(SignExtend64<8>(Imm)));
+ }
+
void addSImm10UnsignedOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
int64_t Imm;
@@ -1547,6 +1556,9 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return generateImmOutOfRangeError(
Operands, ErrorInfo, 0, (1 << 9) - 8,
"immediate must be a multiple of 8 bytes in the range");
+ case Match_InvalidSImm8Unsigned:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 7),
+ (1 << 8) - 1);
case Match_InvalidSImm10:
return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 9),
(1 << 9) - 1);
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index e0ac591..78be55b 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -672,6 +672,8 @@ static constexpr FeatureBitset XAndesGroup = {
RISCV::FeatureVendorXAndesVSIntLoad, RISCV::FeatureVendorXAndesVPackFPH,
RISCV::FeatureVendorXAndesVDot};
+static constexpr FeatureBitset XSMTGroup = {RISCV::FeatureVendorXSMTVDot};
+
static constexpr DecoderListEntry DecoderList32[]{
// Vendor Extensions
{DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"},
@@ -692,6 +694,7 @@ static constexpr DecoderListEntry DecoderList32[]{
{RISCV::FeatureVendorXMIPSCBOP},
"MIPS mips.pref"},
{DecoderTableXAndes32, XAndesGroup, "Andes extensions"},
+ {DecoderTableXSMT32, XSMTGroup, "SpacemiT extensions"},
// Standard Extensions
{DecoderTable32, {}, "standard 32-bit instructions"},
{DecoderTableRV32Only32, {}, "RV32-only standard 32-bit instructions"},
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index f83c2b6..51ea3fc 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -736,7 +736,6 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
}
case TargetOpcode::G_FCONSTANT: {
// TODO: Use constant pool for complex constants.
- // TODO: Optimize +0.0 to use fcvt.d.w for s64 on rv32.
Register DstReg = MI.getOperand(0).getReg();
const APFloat &FPimm = MI.getOperand(1).getFPImm()->getValueAPF();
APInt Imm = FPimm.bitcastToAPInt();
@@ -753,8 +752,22 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
if (!FMV.constrainAllUses(TII, TRI, RBI))
return false;
} else {
+ // s64 on rv32
assert(Size == 64 && !Subtarget->is64Bit() &&
"Unexpected size or subtarget");
+
+ if (Imm.isNonNegative() && Imm.isZero()) {
+ // Optimize +0.0 to use fcvt.d.w
+ MachineInstrBuilder FCVT =
+ MIB.buildInstr(RISCV::FCVT_D_W, {DstReg}, {Register(RISCV::X0)})
+ .addImm(RISCVFPRndMode::RNE);
+ if (!FCVT.constrainAllUses(TII, TRI, RBI))
+ return false;
+
+ MI.eraseFromParent();
+ return true;
+ }
+
// Split into two pieces and build through the stack.
Register GPRRegHigh = MRI->createVirtualRegister(&RISCV::GPRRegClass);
Register GPRRegLow = MRI->createVirtualRegister(&RISCV::GPRRegClass);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 8d956ce..96f22c2 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -819,6 +819,23 @@ void RISCVAsmBackend::maybeAddVendorReloc(const MCFragment &F,
Asm->getWriter().recordRelocation(F, VendorFixup, VendorTarget, VendorValue);
}
+static bool relaxableFixupNeedsRelocation(const MCFixupKind Kind) {
+ // Some Fixups are marked as LinkerRelaxable by
+ // `RISCVMCCodeEmitter::getImmOpValue` only because they may be
+ // (assembly-)relaxed into a linker-relaxable instruction. This function
+ // should return `false` for those fixups so that no `R_RISCV_RELAX`
+ // relocation is emitted in addition to their own relocation.
+ switch (Kind) {
+ default:
+ break;
+ case RISCV::fixup_riscv_rvc_jump:
+ case RISCV::fixup_riscv_rvc_branch:
+ case RISCV::fixup_riscv_jal:
+ return false;
+ }
+ return true;
+}
+
bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
const MCValue &Target, uint64_t &FixedValue,
bool IsResolved) {
@@ -861,25 +878,32 @@ bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
return false;
}
- // If linker relaxation is enabled and supported by the current relocation,
- // generate a relocation and then append a RELAX.
- if (Fixup.isLinkerRelaxable())
+ // If linker relaxation is enabled and supported by the current fixup, then we
+ // always want to generate a relocation.
+ bool NeedsRelax = Fixup.isLinkerRelaxable() &&
+ relaxableFixupNeedsRelocation(Fixup.getKind());
+ if (NeedsRelax)
IsResolved = false;
+
if (IsResolved && Fixup.isPCRel())
IsResolved = isPCRelFixupResolved(Target.getAddSym(), F);
if (!IsResolved) {
- // Some Fixups require a vendor relocation, record it (directly) before we
+ // Some Fixups require a VENDOR relocation, record it (directly) before we
// add the relocation.
maybeAddVendorReloc(F, Fixup);
Asm->getWriter().recordRelocation(F, Fixup, Target, FixedValue);
- }
- if (Fixup.isLinkerRelaxable()) {
- auto FA = MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_RISCV_RELAX);
- Asm->getWriter().recordRelocation(F, FA, MCValue::get(nullptr),
- FixedValueA);
+ if (NeedsRelax) {
+ // Some Fixups get a RELAX relocation, record it (directly) after we add
+ // the relocation.
+ MCFixup RelaxFixup =
+ MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_RISCV_RELAX);
+ MCValue RelaxTarget = MCValue::get(nullptr);
+ uint64_t RelaxValue;
+ Asm->getWriter().recordRelocation(F, RelaxFixup, RelaxTarget, RelaxValue);
+ }
}
return false;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index bddea43..083ac05 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -139,6 +139,9 @@ enum {
// 3 -> SEW * 4
DestEEWShift = ElementsDependOnMaskShift + 1,
DestEEWMask = 3ULL << DestEEWShift,
+
+ ReadsPastVLShift = DestEEWShift + 2,
+ ReadsPastVLMask = 1ULL << ReadsPastVLShift,
};
// Helper functions to read TSFlags.
@@ -195,6 +198,12 @@ static inline bool elementsDependOnMask(uint64_t TSFlags) {
return TSFlags & ElementsDependOnMaskMask;
}
+/// \returns true if the instruction may read elements past VL, e.g.
+/// vslidedown/vrgather
+static inline bool readsPastVL(uint64_t TSFlags) {
+ return TSFlags & ReadsPastVLMask;
+}
+
static inline unsigned getVLOpNum(const MCInstrDesc &Desc) {
const uint64_t TSFlags = Desc.TSFlags;
// This method is only called if we expect to have a VL operand, and all
@@ -337,6 +346,7 @@ enum OperandType : unsigned {
OPERAND_SIMM5_PLUS1,
OPERAND_SIMM6,
OPERAND_SIMM6_NONZERO,
+ OPERAND_SIMM8,
OPERAND_SIMM10,
OPERAND_SIMM10_LSB0000_NONZERO,
OPERAND_SIMM11,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index 8c9ab8e..b0c27ce 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -75,7 +75,7 @@ void RISCVInstPrinter::printInst(const MCInst *MI, uint64_t Address,
if (PrintAliases && !NoAliases)
Res = RISCVRVC::uncompress(UncompressedMI, *MI, STI);
if (Res)
- NewMI = const_cast<MCInst *>(&UncompressedMI);
+ NewMI = &UncompressedMI;
if (!PrintAliases || NoAliases || !printAliasInstr(NewMI, Address, STI, O))
printInstruction(NewMI, Address, STI, O);
printAnnotation(O, Annot);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index cbeabdd..717fba6 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -576,8 +576,21 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
"getImmOpValue expects only expressions or immediates");
const MCExpr *Expr = MO.getExpr();
MCExpr::ExprKind Kind = Expr->getKind();
- unsigned FixupKind = RISCV::fixup_riscv_invalid;
+
+ // `RelaxCandidate` must be set to `true` in two cases:
+ // - The fixup's relocation gets a R_RISCV_RELAX relocation
+ // - The underlying instruction may be relaxed to an instruction that gets a
+ // `R_RISCV_RELAX` relocation.
+ //
+ // The actual emission of `R_RISCV_RELAX` will be handled in
+ // `RISCVAsmBackend::applyFixup`.
bool RelaxCandidate = false;
+ auto AsmRelaxToLinkerRelaxableWithFeature = [&](unsigned Feature) -> void {
+ if (!STI.hasFeature(RISCV::FeatureExactAssembly) && STI.hasFeature(Feature))
+ RelaxCandidate = true;
+ };
+
+ unsigned FixupKind = RISCV::fixup_riscv_invalid;
if (Kind == MCExpr::Specifier) {
const auto *RVExpr = cast<MCSpecifierExpr>(Expr);
FixupKind = RVExpr->getSpecifier();
@@ -644,18 +657,26 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
// FIXME: Sub kind binary exprs have chance of underflow.
if (MIFrm == RISCVII::InstFormatJ) {
FixupKind = RISCV::fixup_riscv_jal;
+ AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcilb);
} else if (MIFrm == RISCVII::InstFormatB) {
FixupKind = RISCV::fixup_riscv_branch;
+ // This might be assembler relaxed to `b<cc>; jal` but we cannot relax
+ // the `jal` again in the assembler.
} else if (MIFrm == RISCVII::InstFormatCJ) {
FixupKind = RISCV::fixup_riscv_rvc_jump;
+ AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcilb);
} else if (MIFrm == RISCVII::InstFormatCB) {
FixupKind = RISCV::fixup_riscv_rvc_branch;
+ // This might be assembler relaxed to `b<cc>; jal` but we cannot relax
+ // the `jal` again in the assembler.
} else if (MIFrm == RISCVII::InstFormatCI) {
FixupKind = RISCV::fixup_riscv_rvc_imm;
} else if (MIFrm == RISCVII::InstFormatI) {
FixupKind = RISCV::fixup_riscv_12_i;
} else if (MIFrm == RISCVII::InstFormatQC_EB) {
FixupKind = RISCV::fixup_riscv_qc_e_branch;
+ // This might be assembler relaxed to `qc.e.b<cc>; jal` but we cannot
+ // relax the `jal` again in the assembler.
} else if (MIFrm == RISCVII::InstFormatQC_EAI) {
FixupKind = RISCV::fixup_riscv_qc_e_32;
RelaxCandidate = true;
@@ -670,9 +691,9 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
assert(FixupKind != RISCV::fixup_riscv_invalid && "Unhandled expression!");
addFixup(Fixups, 0, Expr, FixupKind);
- // If linker relaxation is enabled and supported by this relocation, set
- // a bit so that if fixup is unresolved, a R_RISCV_RELAX relocation will be
- // appended.
+ // If linker relaxation is enabled and supported by this relocation, set a bit
+ // so that the assembler knows the size of the instruction is not fixed/known,
+ // and the relocation will need a R_RISCV_RELAX relocation.
if (EnableRelax && RelaxCandidate)
Fixups.back().setLinkerRelaxable();
++MCNumFixups;
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
index 70127e3..78f4779 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
@@ -741,7 +741,7 @@ bool llvm::CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
bool llvm::CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State) {
+ Type *OrigTy, CCState &State) {
if (ArgFlags.isNest()) {
report_fatal_error(
"Attribute 'nest' is not supported in GHC calling convention");
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.h b/llvm/lib/Target/RISCV/RISCVCallingConv.h
index 2030ce1..0847dd6 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.h
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.h
@@ -33,7 +33,7 @@ bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
namespace RISCV {
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index a7329d2..d4ac3c6 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1055,13 +1055,13 @@ def FeatureStdExtSupm
"Indicates User-mode Pointer Masking">;
def FeatureStdExtSmctr
- : RISCVExperimentalExtension<1, 0,
- "Control Transfer Records Machine Level",
- [FeatureStdExtSscsrind]>;
+ : RISCVExtension<1, 0,
+ "Control Transfer Records Machine Level",
+ [FeatureStdExtSscsrind]>;
def FeatureStdExtSsctr
- : RISCVExperimentalExtension<1, 0,
- "Control Transfer Records Supervisor Level",
- [FeatureStdExtSscsrind]>;
+ : RISCVExtension<1, 0,
+ "Control Transfer Records Supervisor Level",
+ [FeatureStdExtSscsrind]>;
def HasStdExtSmctrOrSsctr : Predicate<"Subtarget->hasStdExtSmctrOrSsctr()">,
AssemblerPredicate<(any_of FeatureStdExtSmctr, FeatureStdExtSsctr),
"'Smctr' (Control Transfer Records Machine Level) or "
@@ -1642,6 +1642,14 @@ def HasVendorXAndesVDot
AssemblerPredicate<(all_of FeatureVendorXAndesVDot),
"'XAndesVDot' (Andes Vector Dot Product Extension)">;
+def FeatureVendorXSMTVDot
+ : RISCVExtension<1, 0, "SpacemiT Vector Dot Product Extension",
+ [FeatureStdExtZve32f]>;
+def HasVendorXSMTVDot
+ : Predicate<"Subtarget->hasVendorXSMTVDot()">,
+ AssemblerPredicate<(all_of FeatureVendorXSMTVDot),
+ "'XSMTVDot' (SpacemiT Vector Dot Product Extension)">;
+
//===----------------------------------------------------------------------===//
// LLVM specific features and extensions
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 5998653..f9f35f6 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -18,6 +18,7 @@
#include "RISCVInstrInfo.h"
#include "RISCVSelectionDAGInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Debug.h"
@@ -681,40 +682,86 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) {
if (!Subtarget->hasVendorXqcibm())
return false;
- auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
- if (!N1C)
+ using namespace SDPatternMatch;
+
+ SDValue X;
+ APInt MaskImm;
+ if (!sd_match(Node, m_Or(m_OneUse(m_Value(X)), m_ConstInt(MaskImm))))
return false;
- int32_t C1 = N1C->getSExtValue();
- if (!isShiftedMask_32(C1) || isInt<12>(C1))
+ unsigned ShAmt, Width;
+ if (!MaskImm.isShiftedMask(ShAmt, Width) || MaskImm.isSignedIntN(12))
return false;
- // INSBI will clobber the input register in N0. Bail out if we need a copy to
- // preserve this value.
- SDValue N0 = Node->getOperand(0);
- if (!N0.hasOneUse())
+ // If Zbs is enabled and it is a single bit set we can use BSETI, which
+ // can be compressed to C_BSETI when Xqcibm is enabled.
+ if (Width == 1 && Subtarget->hasStdExtZbs())
return false;
// If C1 is a shifted mask (but can't be formed as an ORI),
// use a bitfield insert of -1.
// Transform (or x, C1)
// -> (qc.insbi x, -1, width, shift)
- const unsigned Leading = llvm::countl_zero((uint32_t)C1);
- const unsigned Trailing = llvm::countr_zero((uint32_t)C1);
- const unsigned Width = 32 - Leading - Trailing;
+ SDLoc DL(Node);
+ MVT VT = Node->getSimpleValueType(0);
- // If Zbs is enabled and it is a single bit set we can use BSETI which
- // can be compressed to C_BSETI when Xqcibm in enabled.
- if (Width == 1 && Subtarget->hasStdExtZbs())
+ SDValue Ops[] = {X, CurDAG->getSignedTargetConstant(-1, DL, VT),
+ CurDAG->getTargetConstant(Width, DL, VT),
+ CurDAG->getTargetConstant(ShAmt, DL, VT)};
+ SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops);
+ ReplaceNode(Node, BitIns);
+ return true;
+}
+
+// Generate a QC_INSB/QC_INSBI from 'or (and X, MaskImm), OrImm' iff the value
+// being inserted only sets known zero bits.
+bool RISCVDAGToDAGISel::tryBitfieldInsertOpFromOrAndImm(SDNode *Node) {
+ // Supported only in Xqcibm for now.
+ if (!Subtarget->hasVendorXqcibm())
+ return false;
+
+ using namespace SDPatternMatch;
+
+ SDValue And;
+ APInt MaskImm, OrImm;
+ if (!sd_match(Node, m_Or(m_OneUse(m_And(m_Value(And), m_ConstInt(MaskImm))),
+ m_ConstInt(OrImm))))
+ return false;
+
+ // Compute the known-zero bits of the AND; this catches more general cases
+ // than just matching an AND with an immediate mask.
+ KnownBits Known = CurDAG->computeKnownBits(Node->getOperand(0));
+
+ // The bits being inserted must only set those bits that are known to be zero.
+ if (!OrImm.isSubsetOf(Known.Zero)) {
+ // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
+ // currently handle this case.
+ return false;
+ }
+
+ unsigned ShAmt, Width;
+ // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
+ if (!Known.Zero.isShiftedMask(ShAmt, Width))
return false;
+ // QC_INSB(I) dst, src, #width, #shamt.
SDLoc DL(Node);
MVT VT = Node->getSimpleValueType(0);
+ SDValue ImmNode;
+ auto Opc = RISCV::QC_INSB;
- SDValue Ops[] = {N0, CurDAG->getSignedTargetConstant(-1, DL, VT),
- CurDAG->getTargetConstant(Width, DL, VT),
- CurDAG->getTargetConstant(Trailing, DL, VT)};
- SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops);
+ int32_t LIImm = OrImm.getSExtValue() >> ShAmt;
+
+ if (isInt<5>(LIImm)) {
+ Opc = RISCV::QC_INSBI;
+ ImmNode = CurDAG->getSignedTargetConstant(LIImm, DL, MVT::i32);
+ } else {
+ ImmNode = selectImm(CurDAG, DL, MVT::i32, LIImm, *Subtarget);
+ }
+
+ SDValue Ops[] = {And, ImmNode, CurDAG->getTargetConstant(Width, DL, VT),
+ CurDAG->getTargetConstant(ShAmt, DL, VT)};
+ SDNode *BitIns = CurDAG->getMachineNode(Opc, DL, VT, Ops);
ReplaceNode(Node, BitIns);
return true;
}
@@ -772,6 +819,49 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) {
return false;
}
+// (xor X, (and (xor X, C1), C2))
+// -> (qc.insbi X, (C1 >> ShAmt), Width, ShAmt)
+// where C2 is a shifted mask with width=Width and shift=ShAmt
+bool RISCVDAGToDAGISel::tryBitfieldInsertOpFromXor(SDNode *Node) {
+
+ if (!Subtarget->hasVendorXqcibm())
+ return false;
+
+ using namespace SDPatternMatch;
+
+ SDValue X;
+ APInt CImm, CMask;
+ if (!sd_match(
+ Node,
+ m_Xor(m_Value(X),
+ m_OneUse(m_And(m_OneUse(m_Xor(m_Deferred(X), m_ConstInt(CImm))),
+ m_ConstInt(CMask))))))
+ return false;
+
+ unsigned Width, ShAmt;
+ if (!CMask.isShiftedMask(ShAmt, Width))
+ return false;
+
+ int64_t Imm = CImm.getSExtValue();
+ Imm >>= ShAmt;
+
+ SDLoc DL(Node);
+ SDValue ImmNode;
+ auto Opc = RISCV::QC_INSB;
+
+ if (isInt<5>(Imm)) {
+ Opc = RISCV::QC_INSBI;
+ ImmNode = CurDAG->getSignedTargetConstant(Imm, DL, MVT::i32);
+ } else {
+ ImmNode = selectImm(CurDAG, DL, MVT::i32, Imm, *Subtarget);
+ }
+ SDValue Ops[] = {X, ImmNode, CurDAG->getTargetConstant(Width, DL, MVT::i32),
+ CurDAG->getTargetConstant(ShAmt, DL, MVT::i32)};
+ ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, MVT::i32, Ops));
+
+ return true;
+}
+
bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node,
const SDLoc &DL, MVT VT,
SDValue X, unsigned Msb,
@@ -1340,6 +1430,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (trySignedBitfieldInsertInMask(Node))
return;
+ if (tryBitfieldInsertOpFromOrAndImm(Node))
+ return;
+
if (tryShrinkShlLogicImm(Node))
return;
@@ -1349,6 +1442,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (tryShrinkShlLogicImm(Node))
return;
+ if (tryBitfieldInsertOpFromXor(Node))
+ return;
+
break;
case ISD::AND: {
auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
@@ -1644,7 +1740,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// available.
// Transform (and x, C1)
// -> (<bfextract> x, msb, lsb)
- if (isMask_64(C1) && !isInt<12>(N1C->getSExtValue())) {
+ if (isMask_64(C1) && !isInt<12>(N1C->getSExtValue()) &&
+ !(C1 == 0xffff && Subtarget->hasStdExtZbb()) &&
+ !(C1 == 0xffffffff && Subtarget->hasStdExtZba())) {
const unsigned Msb = llvm::bit_width(C1) - 1;
if (tryUnsignedBitfieldExtract(Node, DL, VT, N0, Msb, 0))
return;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index ee3a86e..c329a4c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -75,6 +75,8 @@ public:
bool trySignedBitfieldExtract(SDNode *Node);
bool trySignedBitfieldInsertInSign(SDNode *Node);
bool trySignedBitfieldInsertInMask(SDNode *Node);
+ bool tryBitfieldInsertOpFromXor(SDNode *Node);
+ bool tryBitfieldInsertOpFromOrAndImm(SDNode *Node);
bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT,
SDValue X, unsigned Msb, unsigned Lsb);
bool tryUnsignedBitfieldInsertInZero(SDNode *Node, const SDLoc &DL, MVT VT,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e4aa8b8..4a1db80 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1844,6 +1844,17 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3,
/*IsStore*/ true,
/*IsUnitStrided*/ false, /*UsePtrVal*/ true);
+ case Intrinsic::riscv_sseg2_store_mask:
+ case Intrinsic::riscv_sseg3_store_mask:
+ case Intrinsic::riscv_sseg4_store_mask:
+ case Intrinsic::riscv_sseg5_store_mask:
+ case Intrinsic::riscv_sseg6_store_mask:
+ case Intrinsic::riscv_sseg7_store_mask:
+ case Intrinsic::riscv_sseg8_store_mask:
+    // Operands are (vec, ..., vec, ptr, stride, mask, vl)
+ return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 4,
+ /*IsStore*/ true,
+ /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
case Intrinsic::riscv_vlm:
return SetRVVLoadStoreInfo(/*PtrOp*/ 0,
/*IsStore*/ false,
@@ -2512,11 +2523,11 @@ static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS,
}
break;
case ISD::SETUGT:
- if (Subtarget.hasVendorXqcibi() && C != INT64_MAX && isInt<16>(C + 1) &&
- C != -1) {
+ if (Subtarget.hasVendorXqcibi() && C != INT64_MAX && isUInt<16>(C + 1)) {
// We have a branch immediate instruction for SETUGE but not SETUGT.
- // Convert X > C to X >= C + 1, if (C + 1) is a 16-bit signed immediate.
- RHS = DAG.getSignedConstant(C + 1, DL, RHS.getValueType());
+ // Convert X > C to X >= C + 1, if (C + 1) is a 16-bit unsigned
+ // immediate.
+ RHS = DAG.getConstant(C + 1, DL, RHS.getValueType());
CC = ISD::SETUGE;
return;
}
@@ -8931,10 +8942,7 @@ SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N,
// Prepare argument list to generate call.
ArgListTy Args;
- ArgListEntry Entry;
- Entry.Node = Load;
- Entry.Ty = CallTy;
- Args.push_back(Entry);
+ Args.emplace_back(Load, CallTy);
// Setup call to __tls_get_addr.
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -11084,69 +11092,118 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerVectorIntrinsicScalars(Op, DAG, Subtarget);
}
-SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
- SelectionDAG &DAG) const {
- unsigned IntNo = Op.getConstantOperandVal(1);
+static SDValue
+lowerFixedVectorSegStoreIntrinsics(unsigned IntNo, SDValue Op,
+ const RISCVSubtarget &Subtarget,
+ SelectionDAG &DAG) {
+ bool IsStrided;
switch (IntNo) {
- default:
- break;
case Intrinsic::riscv_seg2_store_mask:
case Intrinsic::riscv_seg3_store_mask:
case Intrinsic::riscv_seg4_store_mask:
case Intrinsic::riscv_seg5_store_mask:
case Intrinsic::riscv_seg6_store_mask:
case Intrinsic::riscv_seg7_store_mask:
- case Intrinsic::riscv_seg8_store_mask: {
- SDLoc DL(Op);
- static const Intrinsic::ID VssegInts[] = {
- Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
- Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
- Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
- Intrinsic::riscv_vsseg8_mask};
+ case Intrinsic::riscv_seg8_store_mask:
+ IsStrided = false;
+ break;
+ case Intrinsic::riscv_sseg2_store_mask:
+ case Intrinsic::riscv_sseg3_store_mask:
+ case Intrinsic::riscv_sseg4_store_mask:
+ case Intrinsic::riscv_sseg5_store_mask:
+ case Intrinsic::riscv_sseg6_store_mask:
+ case Intrinsic::riscv_sseg7_store_mask:
+ case Intrinsic::riscv_sseg8_store_mask:
+ IsStrided = true;
+ break;
+ default:
+ llvm_unreachable("unexpected intrinsic ID");
+ }
- // Operands: (chain, int_id, vec*, ptr, mask, vl)
- unsigned NF = Op->getNumOperands() - 5;
- assert(NF >= 2 && NF <= 8 && "Unexpected seg number");
- MVT XLenVT = Subtarget.getXLenVT();
- MVT VT = Op->getOperand(2).getSimpleValueType();
- MVT ContainerVT = getContainerForFixedLengthVector(VT);
- unsigned Sz = NF * ContainerVT.getVectorMinNumElements() *
- ContainerVT.getScalarSizeInBits();
- EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF);
+ SDLoc DL(Op);
+ static const Intrinsic::ID VssegInts[] = {
+ Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
+ Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
+ Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
+ Intrinsic::riscv_vsseg8_mask};
+ static const Intrinsic::ID VsssegInts[] = {
+ Intrinsic::riscv_vssseg2_mask, Intrinsic::riscv_vssseg3_mask,
+ Intrinsic::riscv_vssseg4_mask, Intrinsic::riscv_vssseg5_mask,
+ Intrinsic::riscv_vssseg6_mask, Intrinsic::riscv_vssseg7_mask,
+ Intrinsic::riscv_vssseg8_mask};
+
+ // Operands: (chain, int_id, vec*, ptr, mask, vl) or
+ // (chain, int_id, vec*, ptr, stride, mask, vl)
+ unsigned NF = Op->getNumOperands() - (IsStrided ? 6 : 5);
+ assert(NF >= 2 && NF <= 8 && "Unexpected seg number");
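+  // e.g. a unit-stride seg3 store is (chain, int_id, v0, v1, v2, ptr, mask,
+  // vl): 8 operands, so NF = 8 - 5 = 3. The strided form carries one extra
+  // stride operand and subtracts 6 instead.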
+ MVT XLenVT = Subtarget.getXLenVT();
+ MVT VT = Op->getOperand(2).getSimpleValueType();
+ MVT ContainerVT = ::getContainerForFixedLengthVector(DAG, VT, Subtarget);
+ unsigned Sz = NF * ContainerVT.getVectorMinNumElements() *
+ ContainerVT.getScalarSizeInBits();
+ EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF);
- SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
- SDValue Mask = Op.getOperand(Op.getNumOperands() - 2);
- MVT MaskVT = Mask.getSimpleValueType();
- MVT MaskContainerVT =
- ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget);
- Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget);
+ SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
+ SDValue Mask = Op.getOperand(Op.getNumOperands() - 2);
+ MVT MaskVT = Mask.getSimpleValueType();
+ MVT MaskContainerVT =
+ ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget);
+ Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget);
- SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT);
- SDValue Ptr = Op->getOperand(NF + 2);
+ SDValue IntID = DAG.getTargetConstant(
+ IsStrided ? VsssegInts[NF - 2] : VssegInts[NF - 2], DL, XLenVT);
+ SDValue Ptr = Op->getOperand(NF + 2);
- auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op);
+ auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op);
- SDValue StoredVal = DAG.getUNDEF(VecTupTy);
- for (unsigned i = 0; i < NF; i++)
- StoredVal = DAG.getNode(
- RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
- convertToScalableVector(
- ContainerVT, FixedIntrinsic->getOperand(2 + i), DAG, Subtarget),
- DAG.getTargetConstant(i, DL, MVT::i32));
+ SDValue StoredVal = DAG.getUNDEF(VecTupTy);
+ for (unsigned i = 0; i < NF; i++)
+ StoredVal = DAG.getNode(
+ RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
+ convertToScalableVector(ContainerVT, FixedIntrinsic->getOperand(2 + i),
+ DAG, Subtarget),
+ DAG.getTargetConstant(i, DL, MVT::i32));
+
+ SmallVector<SDValue, 10> Ops = {
+ FixedIntrinsic->getChain(),
+ IntID,
+ StoredVal,
+ Ptr,
+ Mask,
+ VL,
+ DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)};
+ // Insert the stride operand.
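+  // Index 4 places the stride immediately after Ptr, so the operands read
+  // (chain, int_id, tuple, ptr, stride, mask, vl, log2(SEW)).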
+ if (IsStrided)
+ Ops.insert(std::next(Ops.begin(), 4),
+ Op.getOperand(Op.getNumOperands() - 3));
+
+ return DAG.getMemIntrinsicNode(
+ ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops,
+ FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand());
+}
+
+SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = Op.getConstantOperandVal(1);
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::riscv_seg2_store_mask:
+ case Intrinsic::riscv_seg3_store_mask:
+ case Intrinsic::riscv_seg4_store_mask:
+ case Intrinsic::riscv_seg5_store_mask:
+ case Intrinsic::riscv_seg6_store_mask:
+ case Intrinsic::riscv_seg7_store_mask:
+ case Intrinsic::riscv_seg8_store_mask:
+ case Intrinsic::riscv_sseg2_store_mask:
+ case Intrinsic::riscv_sseg3_store_mask:
+ case Intrinsic::riscv_sseg4_store_mask:
+ case Intrinsic::riscv_sseg5_store_mask:
+ case Intrinsic::riscv_sseg6_store_mask:
+ case Intrinsic::riscv_sseg7_store_mask:
+ case Intrinsic::riscv_sseg8_store_mask:
+ return lowerFixedVectorSegStoreIntrinsics(IntNo, Op, Subtarget, DAG);
- SDValue Ops[] = {
- FixedIntrinsic->getChain(),
- IntID,
- StoredVal,
- Ptr,
- Mask,
- VL,
- DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)};
-
- return DAG.getMemIntrinsicNode(
- ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops,
- FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand());
- }
case Intrinsic::riscv_sf_vc_xv_se:
return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_XV_SE);
case Intrinsic::riscv_sf_vc_iv_se:
@@ -14273,7 +14330,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
LC = RTLIB::getFPTOUINT(Op0.getValueType(), N->getValueType(0));
MakeLibCallOptions CallOptions;
EVT OpVT = Op0.getValueType();
- CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+ CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0));
SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
SDValue Result;
std::tie(Result, Chain) =
@@ -14308,7 +14365,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
Op0.getValueType() == MVT::f64 ? RTLIB::LROUND_F64 : RTLIB::LROUND_F32;
MakeLibCallOptions CallOptions;
EVT OpVT = Op0.getValueType();
- CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64, true);
+ CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64);
SDValue Result = makeLibCall(DAG, LC, MVT::i64, Op0, CallOptions, DL).first;
Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Result);
Results.push_back(Result);
@@ -16531,8 +16588,10 @@ combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC,
// (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from
// bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg
// can become a sext.w instead of a shift pair.
-static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue performSETCCCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
const RISCVSubtarget &Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -16548,6 +16607,20 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
combineVectorSizedSetCCEquality(VT, N0, N1, Cond, dl, DAG, Subtarget))
return V;
+ // (X & -4096) == 0 -> (X >> 12) == 0 if the AND constant can't use ANDI.
+ if (DCI.isAfterLegalizeDAG() && isNullConstant(N1) &&
+ N0.getOpcode() == ISD::AND && N0.hasOneUse() &&
+ isa<ConstantSDNode>(N0.getOperand(1))) {
+ const APInt &AndRHSC =
+ cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ if (!isInt<12>(AndRHSC.getSExtValue()) && AndRHSC.isNegatedPowerOf2()) {
+ unsigned ShiftBits = AndRHSC.countr_zero();
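+      // e.g. AndRHSC = -4096 = 0xFFFF...F000 has 12 trailing zeros, so the
+      // compare becomes (X >> 12) == 0 as in the comment above.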
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, VT, N0.getOperand(0),
+ DAG.getConstant(ShiftBits, dl, VT));
+ return DAG.getSetCC(dl, VT, Shift, N1, Cond);
+ }
+ }
+
if (OpVT != MVT::i64 || !Subtarget.is64Bit())
return SDValue();
@@ -16582,27 +16655,39 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
}
static SDValue
-performSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
+performSIGN_EXTEND_INREGCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const RISCVSubtarget &Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
SDValue Src = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
unsigned Opc = Src.getOpcode();
+ SDLoc DL(N);
// Fold (sext_inreg (fmv_x_anyexth X), i16) -> (fmv_x_signexth X)
// Don't do this with Zhinx. We need to explicitly sign extend the GPR.
if (Opc == RISCVISD::FMV_X_ANYEXTH && SrcVT.bitsGE(MVT::i16) &&
Subtarget.hasStdExtZfhmin())
- return DAG.getNode(RISCVISD::FMV_X_SIGNEXTH, SDLoc(N), VT,
- Src.getOperand(0));
+ return DAG.getNode(RISCVISD::FMV_X_SIGNEXTH, DL, VT, Src.getOperand(0));
// Fold (sext_inreg (shl X, Y), i32) -> (sllw X, Y) iff Y u< 32
if (Opc == ISD::SHL && Subtarget.is64Bit() && SrcVT == MVT::i32 &&
VT == MVT::i64 && !isa<ConstantSDNode>(Src.getOperand(1)) &&
DAG.computeKnownBits(Src.getOperand(1)).countMaxActiveBits() <= 5)
- return DAG.getNode(RISCVISD::SLLW, SDLoc(N), VT, Src.getOperand(0),
+ return DAG.getNode(RISCVISD::SLLW, DL, VT, Src.getOperand(0),
Src.getOperand(1));
+ // Fold (sext_inreg (setcc), i1) -> (sub 0, (setcc))
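+  // (setcc produces 0 or 1, so sign extending from bit 0 gives 0 or -1,
+  // which is exactly 0 - setcc)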
+ if (Opc == ISD::SETCC && SrcVT == MVT::i1 && DCI.isAfterLegalizeDAG())
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
+
+ // Fold (sext_inreg (xor (setcc), -1), i1) -> (add (setcc), -1)
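+  // (the xor gives -1 or -2; sign extending from bit 0 gives -1 or 0, which
+  // is setcc - 1)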
+ if (Opc == ISD::XOR && SrcVT == MVT::i1 &&
+ isAllOnesConstant(Src.getOperand(1)) &&
+ Src.getOperand(0).getOpcode() == ISD::SETCC && DCI.isAfterLegalizeDAG())
+ return DAG.getNode(ISD::ADD, DL, VT, Src.getOperand(0),
+ DAG.getAllOnesConstant(DL, VT));
+
return SDValue();
}
@@ -17461,7 +17546,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N,
return SDValue();
SmallVector<SDNode *> Worklist;
- SmallSet<SDNode *, 8> Inserted;
+ SmallPtrSet<SDNode *, 8> Inserted;
Worklist.push_back(N);
Inserted.insert(N);
SmallVector<CombineResult> CombinesToApply;
@@ -20022,9 +20107,9 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
case ISD::SETCC:
- return performSETCCCombine(N, DAG, Subtarget);
+ return performSETCCCombine(N, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG:
- return performSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+ return performSIGN_EXTEND_INREGCombine(N, DCI, Subtarget);
case ISD::ZERO_EXTEND:
// Fold (zero_extend (fp_to_uint X)) to prevent forming fcvt+zexti32 during
// type legalization. This is safe because fp_to_uint produces poison if
@@ -20580,10 +20665,11 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
// Combine store of vmv.x.s/vfmv.f.s to vse with VL of 1.
// vfmv.f.s is represented as extract element from 0. Match it late to avoid
// any illegal types.
- if (Val.getOpcode() == RISCVISD::VMV_X_S ||
- (DCI.isAfterLegalizeDAG() &&
- Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- isNullConstant(Val.getOperand(1)))) {
+ if ((Val.getOpcode() == RISCVISD::VMV_X_S ||
+ (DCI.isAfterLegalizeDAG() &&
+ Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isNullConstant(Val.getOperand(1)))) &&
+ Val.hasOneUse()) {
SDValue Src = Val.getOperand(0);
MVT VecVT = Src.getSimpleValueType();
// VecVT should be scalable and memory VT should match the element type.
@@ -20673,12 +20759,22 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
isNullConstant(Src.getOperand(1)) &&
Src.getOperand(0).getValueType().isScalableVector()) {
EVT VT = N->getValueType(0);
- EVT SrcVT = Src.getOperand(0).getValueType();
- assert(SrcVT.getVectorElementType() == VT.getVectorElementType());
+ SDValue EVSrc = Src.getOperand(0);
+ EVT EVSrcVT = EVSrc.getValueType();
+ assert(EVSrcVT.getVectorElementType() == VT.getVectorElementType());
// Widths match, just return the original vector.
- if (SrcVT == VT)
- return Src.getOperand(0);
- // TODO: Use insert_subvector/extract_subvector to change widen/narrow?
+ if (EVSrcVT == VT)
+ return EVSrc;
+ SDLoc DL(N);
+    // Source is narrower than the result; widen it with insert_subvector.
+ if (EVSrcVT.getVectorMinNumElements() < VT.getVectorMinNumElements()) {
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
+ EVSrc,
+ DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+ }
+    // Source is wider than the result; narrow it with extract_subvector.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, EVSrc,
+ DAG.getConstant(0, DL, Subtarget.getXLenVT()));
}
[[fallthrough]];
}
@@ -22270,20 +22366,12 @@ void RISCVTargetLowering::analyzeInputArgs(
MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet,
RISCVCCAssignFn Fn) const {
- FunctionType *FType = MF.getFunction().getFunctionType();
-
for (const auto &[Idx, In] : enumerate(Ins)) {
MVT ArgVT = In.VT;
ISD::ArgFlagsTy ArgFlags = In.Flags;
- Type *ArgTy = nullptr;
- if (IsRet)
- ArgTy = FType->getReturnType();
- else if (In.isOrigArg())
- ArgTy = FType->getParamType(In.getOrigArgIndex());
-
if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet,
- ArgTy)) {
+ In.OrigTy)) {
LLVM_DEBUG(dbgs() << "InputArg #" << Idx << " has unhandled type "
<< ArgVT << '\n');
llvm_unreachable(nullptr);
@@ -22298,10 +22386,9 @@ void RISCVTargetLowering::analyzeOutputArgs(
for (const auto &[Idx, Out] : enumerate(Outs)) {
MVT ArgVT = Out.VT;
ISD::ArgFlagsTy ArgFlags = Out.Flags;
- Type *OrigTy = CLI ? CLI->getArgs()[Out.OrigArgIndex].Ty : nullptr;
if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet,
- OrigTy)) {
+ Out.OrigTy)) {
LLVM_DEBUG(dbgs() << "OutputArg #" << Idx << " has unhandled type "
<< ArgVT << "\n");
llvm_unreachable(nullptr);
@@ -23083,7 +23170,7 @@ bool RISCVTargetLowering::CanLowerReturn(
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
if (CC_RISCV(i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo,
- /*IsRet=*/true, nullptr))
+ /*IsRet=*/true, Outs[i].OrigTy))
return false;
}
return true;
@@ -23343,6 +23430,12 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
&RISCV::VRN2M4RegClass}) {
if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy))
return std::make_pair(0U, RC);
+
+ if (VT.isFixedLengthVector() && useRVVForFixedLengthVectorVT(VT)) {
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+ if (TRI->isTypeLegalForClass(*RC, ContainerVT))
+ return std::make_pair(0U, RC);
+ }
}
} else if (Constraint == "vd") {
for (const auto *RC :
@@ -23356,10 +23449,24 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
&RISCV::VRN2M4NoV0RegClass}) {
if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy))
return std::make_pair(0U, RC);
+
+ if (VT.isFixedLengthVector() && useRVVForFixedLengthVectorVT(VT)) {
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+ if (TRI->isTypeLegalForClass(*RC, ContainerVT))
+ return std::make_pair(0U, RC);
+ }
}
} else if (Constraint == "vm") {
if (TRI->isTypeLegalForClass(RISCV::VMV0RegClass, VT.SimpleTy))
return std::make_pair(0U, &RISCV::VMV0RegClass);
+
+ if (VT.isFixedLengthVector() && useRVVForFixedLengthVectorVT(VT)) {
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+      // VT here might be coerced to a vector with i8 elements, so we need to
+      // check whether the container fits in an M1 register (VRRegClass)
+      // instead of checking VMV0RegClass.
+ if (TRI->isTypeLegalForClass(RISCV::VRRegClass, ContainerVT))
+ return std::make_pair(0U, &RISCV::VMV0RegClass);
+ }
} else if (Constraint == "cr") {
if (VT == MVT::f16 && Subtarget.hasStdExtZhinxmin())
return std::make_pair(0U, &RISCV::GPRF16CRegClass);
@@ -24237,7 +24344,12 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts(
return true;
}
- if (ValueVT.isScalableVector() && PartVT.isScalableVector()) {
+ if ((ValueVT.isScalableVector() || ValueVT.isFixedLengthVector()) &&
+ PartVT.isScalableVector()) {
+ if (ValueVT.isFixedLengthVector()) {
+ ValueVT = getContainerForFixedLengthVector(ValueVT.getSimpleVT());
+ Val = convertToScalableVector(ValueVT, Val, DAG, Subtarget);
+ }
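+    // From here on, fixed-length values have been widened to their scalable
+    // container type, so the existing scalable-vector logic below applies.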
LLVMContext &Context = *DAG.getContext();
EVT ValueEltVT = ValueVT.getVectorElementType();
EVT PartEltVT = PartVT.getVectorElementType();
@@ -24307,12 +24419,17 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue(
return Val;
}
- if (ValueVT.isScalableVector() && PartVT.isScalableVector()) {
+ if ((ValueVT.isScalableVector() || ValueVT.isFixedLengthVector()) &&
+ PartVT.isScalableVector()) {
LLVMContext &Context = *DAG.getContext();
SDValue Val = Parts[0];
EVT ValueEltVT = ValueVT.getVectorElementType();
EVT PartEltVT = PartVT.getVectorElementType();
unsigned ValueVTBitSize = ValueVT.getSizeInBits().getKnownMinValue();
+ if (ValueVT.isFixedLengthVector())
+ ValueVTBitSize = getContainerForFixedLengthVector(ValueVT.getSimpleVT())
+ .getSizeInBits()
+ .getKnownMinValue();
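+    // For fixed-length values, compare sizes in terms of the scalable
+    // container so the divisibility check below works on like-for-like
+    // minimum sizes.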
unsigned PartVTBitSize = PartVT.getSizeInBits().getKnownMinValue();
if (PartVTBitSize % ValueVTBitSize == 0) {
assert(PartVTBitSize >= ValueVTBitSize);
@@ -24330,7 +24447,10 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue(
EVT::getVectorVT(Context, ValueEltVT, Count, /*IsScalable=*/true);
Val = DAG.getNode(ISD::BITCAST, DL, SameEltTypeVT, Val);
}
- Val = DAG.getExtractSubvector(DL, ValueVT, Val, 0);
+ if (ValueVT.isFixedLengthVector())
+ Val = convertFromScalableVector(ValueVT, Val, DAG, Subtarget);
+ else
+ Val = DAG.getExtractSubvector(DL, ValueVT, Val, 0);
return Val;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 433b8be..fb63ebc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -431,8 +431,8 @@ public:
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices,
- unsigned Factor) const override;
+ ArrayRef<unsigned> Indices, unsigned Factor,
+ const APInt &GapMask) const override;
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index d9c6101..878a0ec 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -261,6 +261,12 @@ class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr,
// Indicates the EEW of a vector instruction's destination operand.
EEW DestEEW = EEWSEWx1;
let TSFlags{25-24} = DestEEW.Value;
+
+  // Some vector instructions, such as vslidedown/vrgather, read elements
+  // past VL and should be marked so that RISCVVLOptimizer doesn't reduce
+  // their operands' VLs.
+ bit ReadsPastVL = 0;
+ let TSFlags{26} = ReadsPastVL;
}
class RVInst<dag outs, dag ins, string opcodestr, string argstr,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 8bd3830..836a2b1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1694,6 +1694,16 @@ multiclass SelectCC_GPR_riirr<DAGOperand valty, DAGOperand imm> {
valty:$truev, valty:$falsev), []>;
}
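+
+// A signed value is negative iff its sign bit is set, so (setlt X, 0) lowers
+// to a logical right shift that isolates the sign bit.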
+let Predicates = [IsRV32] in {
+def : Pat<(i32 (setlt (i32 GPR:$rs1), 0)), (SRLI GPR:$rs1, 31)>; // compressible
+}
+let Predicates = [IsRV64] in {
+def : Pat<(i64 (seteq (i64 (and GPR:$rs1, 0x0000000080000000)), 0)),
+ (XORI (i64 (SRLIW GPR:$rs1, 31)), 1)>;
+def : Pat<(i64 (setlt (i64 GPR:$rs1), 0)), (SRLI GPR:$rs1, 63)>; // compressible
+def : Pat<(i64 (setlt (sext_inreg GPR:$rs1, i32), 0)), (SRLIW GPR:$rs1, 31)>;
+}
+
/// Branches and jumps
// Match `riscv_brcc` and lower to the appropriate RISC-V branch instruction.
@@ -2367,6 +2377,7 @@ include "RISCVInstrInfoXqccmp.td"
include "RISCVInstrInfoXMips.td"
include "RISCVInstrInfoXRivos.td"
include "RISCVInstrInfoXAndes.td"
+include "RISCVInstrInfoXSpacemiT.td"
//===----------------------------------------------------------------------===//
// Global ISel
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 8297d50..1e22c2d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -18,7 +18,26 @@
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
-def simm10 : RISCVSImmLeafOp<10>;
+def simm10 : RISCVSImmOp<10>;
+
+def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> {
+ let RenderMethod = "addSImm8UnsignedOperands";
+}
+
+// An 8-bit signed immediate allowing range [-128, 255]
+// but represented as [-128, 127].
+def simm8_unsigned : RISCVOp {
+ let ParserMatchClass = SImm8UnsignedAsmOperand;
+ let EncoderMethod = "getImmOpValue";
+ let DecoderMethod = "decodeSImmOperand<8>";
+ let OperandType = "OPERAND_SIMM10";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return isInt<8>(Imm);
+ }];
+}
def SImm10UnsignedAsmOperand : SImmAsmOperand<10, "Unsigned"> {
let RenderMethod = "addSImm10UnsignedOperands";
@@ -43,49 +62,40 @@ def simm10_unsigned : RISCVOp {
// Instruction class templates
//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class PLI_i<bits<7> funct7, string opcodestr>
- : RVInst<(outs GPR:$rd), (ins simm10:$imm10), opcodestr, "$rd, $imm10", [],
+// Common base for pli.b/h/w and plui.h/w
+class RVPLoadImm_i<bits<7> funct7, dag ins, string opcodestr,
+ string argstr>
+ : RVInst<(outs GPR:$rd), ins, opcodestr, argstr, [],
InstFormatOther> {
- bits<10> imm10;
bits<5> rd;
let Inst{31-25} = funct7;
- let Inst{24-16} = imm10{8-0};
- let Inst{15} = imm10{9};
let Inst{14-12} = 0b010;
let Inst{11-7} = rd;
let Inst{6-0} = OPC_OP_IMM_32.Value;
+
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
}
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class PLUI_i<bits<7> funct7, string opcodestr>
- : RVInst<(outs GPR:$rd), (ins simm10_unsigned:$imm10), opcodestr,
- "$rd, $imm10", [], InstFormatOther> {
+// Base for pli.h/w.
+class PLI_i<bits<7> funct7, string opcodestr>
+ : RVPLoadImm_i<funct7, (ins simm10:$imm10), opcodestr, "$rd, $imm10"> {
bits<10> imm10;
- bits<5> rd;
- let Inst{31-25} = funct7;
- let Inst{24} = imm10{0};
- let Inst{23-15} = imm10{9-1};
- let Inst{14-12} = 0b010;
- let Inst{11-7} = rd;
- let Inst{6-0} = OPC_OP_IMM_32.Value;
+ let Inst{24-16} = imm10{8-0};
+ let Inst{15} = imm10{9};
}
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class PLI_B_i<bits<8> funct8, string opcodestr>
- : RVInst<(outs GPR:$rd), (ins uimm8:$uimm8), opcodestr, "$rd, $uimm8", [],
- InstFormatOther> {
- bits<8> uimm8;
- bits<5> rd;
+// Base for plui.h/w.
+class PLUI_i<bits<7> funct7, string opcodestr>
+ : RVPLoadImm_i<funct7, (ins simm10_unsigned:$imm10), opcodestr,
+ "$rd, $imm10"> {
+ bits<10> imm10;
- let Inst{31-24} = funct8;
- let Inst{23-16} = uimm8;
- let Inst{15} = 0b0;
- let Inst{14-12} = 0b010;
- let Inst{11-7} = rd;
- let Inst{6-0} = OPC_OP_IMM_32.Value;
+ let Inst{24} = imm10{0};
+ let Inst{23-15} = imm10{9-1};
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
@@ -98,6 +108,14 @@ class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType>
let Inst{27} = 0b0;
}
+class RVPShiftD_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm6> {
+ bits<6> shamt;
+
+ let Inst{26} = 0b1;
+ let Inst{25-20} = shamt;
+}
+
class RVPShiftW_ri<bits<3> f, bits<3> funct3, string opcodestr>
: RVPShift_ri<f, funct3, opcodestr, uimm5> {
bits<5> shamt;
@@ -131,59 +149,477 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
let Inst{24-20} = uf;
}
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPBinaryScalar_rr<bits<3> f, bits<2> w, bits<3> funct3, string opcodestr>
+ : RVInstRBase<funct3, OPC_OP_IMM_32, (outs GPR:$rd),
+ (ins GPR:$rs1, GPR:$rs2), opcodestr, "$rd, $rs1, $rs2"> {
+ let Inst{31} = 0b1;
+ let Inst{30-28} = f;
+ let Inst{27} = 0b1;
+ let Inst{26-25} = w;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPBinary_rr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr>
+ : RVInstRBase<funct3, OPC_OP_32, (outs GPR:$rd),
+ (ins GPR:$rs1, GPR:$rs2), opcodestr, "$rd, $rs1, $rs2"> {
+ let Inst{31} = 0b1;
+ let Inst{30-27} = f;
+ let Inst{26-25} = w;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPTernary_rrr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr>
+ : RVInstRBase<funct3, OPC_OP_32, (outs GPR:$rd_wb),
+ (ins GPR:$rd, GPR:$rs1, GPR:$rs2), opcodestr,
+ "$rd, $rs1, $rs2"> {
+ let Inst{31} = 0b1;
+ let Inst{30-27} = f;
+ let Inst{26-25} = w;
+
+ let Constraints = "$rd = $rd_wb";
+}
+
+// Common base for pli.dh/db and plui.dh
+class RVPPairLoadImm_i<bits<7> funct7, dag ins, string opcodestr,
+ string argstr>
+ : RVInst<(outs GPRPairRV32:$rd), ins, opcodestr, argstr, [],
+ InstFormatOther> {
+ bits<5> rd;
+
+ let Inst{31-25} = funct7;
+ let Inst{14-12} = 0b010;
+ let Inst{11-8} = rd{4-1};
+ let Inst{7} = 0b0;
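+  // Only bits 4..1 of rd are encoded and bit 0 is hard-wired to zero, so the
+  // destination is always the even-numbered register of a pair.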
+ let Inst{6-0} = OPC_OP_IMM_32.Value;
+
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtP] in {
-let IsSignExtendingOpW = 1 in
-def CLS : Unary_r<0b011000000011, 0b001, "cls">;
-def ABS : Unary_r<0b011000000111, 0b001, "abs">;
+ let IsSignExtendingOpW = 1 in
+ def CLS : Unary_r<0b011000000011, 0b001, "cls">;
+ def ABS : Unary_r<0b011000000111, 0b001, "abs">;
} // Predicates = [HasStdExtP]
-let Predicates = [HasStdExtP, IsRV32] in
-def REV_RV32 : Unary_r<0b011010011111, 0b101, "rev">;
+
+let Predicates = [HasStdExtP, IsRV32] in {
+ def REV_RV32 : Unary_r<0b011010011111, 0b101, "rev">;
+} // Predicates = [HasStdExtP, IsRV32]
let Predicates = [HasStdExtP, IsRV64] in {
-def REV16 : Unary_r<0b011010110000, 0b101, "rev16">;
-def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">;
+ def REV16 : Unary_r<0b011010110000, 0b101, "rev16">;
+ def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">;
-let IsSignExtendingOpW = 1 in {
-def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">;
-def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">;
-}
+ let IsSignExtendingOpW = 1 in {
+ def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">;
+ def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">;
+ }
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in {
-def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">;
-def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">;
-def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">;
+ def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">;
+ def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">;
+ def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">;
} // Predicates = [HasStdExtP]
-let DecoderNamespace = "RV32Only",
- Predicates = [HasStdExtP, IsRV32] in
-def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">;
+let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in {
+ def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">;
+} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only"
let Predicates = [HasStdExtP, IsRV64] in {
-def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">;
-def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">;
+ def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">;
+ def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
def PLI_H : PLI_i<0b1011000, "pli.h">;
let Predicates = [HasStdExtP, IsRV64] in
def PLI_W : PLI_i<0b1011001, "pli.w">;
-let Predicates = [HasStdExtP] in
-def PLI_B : PLI_B_i<0b10110100, "pli.b">;
+let Predicates = [HasStdExtP] in {
+ def PLI_B : RVPLoadImm_i<0b1011010, (ins simm8_unsigned:$imm8), "pli.b",
+ "$rd, $imm8"> {
+ bits<8> imm8;
+
+ let Inst{24} = 0b0;
+ let Inst{23-16} = imm8;
+ let Inst{15} = 0b0;
+ }
+}
let Predicates = [HasStdExtP] in {
-def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">;
-def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">;
-def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">;
+ def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">;
+ def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">;
+ def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">;
} // Predicates = [HasStdExtP]
let Predicates = [HasStdExtP, IsRV64] in {
-def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">;
-def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">;
+ def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">;
+ def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
def PLUI_H : PLUI_i<0b1111000, "plui.h">;
let Predicates = [HasStdExtP, IsRV64] in
def PLUI_W : PLUI_i<0b1111001, "plui.w">;
+
+let Predicates = [HasStdExtP] in {
+ def PSLL_HS : RVPBinaryScalar_rr<0b000, 0b00, 0b010, "psll.hs">;
+ def PSLL_BS : RVPBinaryScalar_rr<0b000, 0b10, 0b010, "psll.bs">;
+
+ def PADD_HS : RVPBinaryScalar_rr<0b001, 0b00, 0b010, "padd.hs">;
+ def PADD_BS : RVPBinaryScalar_rr<0b001, 0b10, 0b010, "padd.bs">;
+
+ def PSSHA_HS : RVPBinaryScalar_rr<0b110, 0b00, 0b010, "pssha.hs">;
+
+ def PSSHAR_HS : RVPBinaryScalar_rr<0b111, 0b00, 0b010, "psshar.hs">;
+} // Predicates = [HasStdExtP]
+let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in {
+ def SSHA : RVPBinaryScalar_rr<0b110, 0b01, 0b010, "ssha">;
+
+ def SSHAR : RVPBinaryScalar_rr<0b111, 0b01, 0b010, "sshar">;
+} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only"
+let Predicates = [HasStdExtP, IsRV64] in {
+ def PSLL_WS : RVPBinaryScalar_rr<0b000, 0b01, 0b010, "psll.ws">;
+
+ def PADD_WS : RVPBinaryScalar_rr<0b001, 0b01, 0b010, "padd.ws">;
+
+ def PSSHA_WS : RVPBinaryScalar_rr<0b110, 0b01, 0b010, "pssha.ws">;
+ def SHA : RVPBinaryScalar_rr<0b110, 0b11, 0b010, "sha">;
+
+ def PSSHAR_WS : RVPBinaryScalar_rr<0b111, 0b01, 0b010, "psshar.ws">;
+ def SHAR : RVPBinaryScalar_rr<0b111, 0b11, 0b010, "shar">;
+} // Predicates = [HasStdExtP, IsRV64]
+
+let Predicates = [HasStdExtP] in {
+ def PSRLI_B : RVPShiftB_ri<0b000, 0b100, "psrli.b">;
+ def PSRLI_H : RVPShiftH_ri<0b000, 0b100, "psrli.h">;
+
+ def PUSATI_H : RVPShiftH_ri<0b010, 0b100, "pusati.h">;
+
+ def PSRAI_B : RVPShiftB_ri<0b100, 0b100, "psrai.b">;
+ def PSRAI_H : RVPShiftH_ri<0b100, 0b100, "psrai.h">;
+
+ def PSRARI_H : RVPShiftH_ri<0b101, 0b100, "psrari.h">;
+
+ def PSATI_H : RVPShiftH_ri<0b110, 0b100, "psati.h">;
+} // Predicates = [HasStdExtP]
+let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in {
+ def USATI_RV32 : RVPShiftW_ri<0b010, 0b100, "usati">;
+
+ def SRARI_RV32 : RVPShiftW_ri<0b101, 0b100, "srari">;
+
+ def SATI_RV32 : RVPShiftW_ri<0b110, 0b100, "sati">;
+} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only"
+let Predicates = [HasStdExtP, IsRV64] in {
+ def PSRLI_W : RVPShiftW_ri<0b000, 0b100, "psrli.w">;
+ def PSRAI_W : RVPShiftW_ri<0b100, 0b100, "psrai.w">;
+
+ def PUSATI_W : RVPShiftW_ri<0b010, 0b100, "pusati.w">;
+ def USATI_RV64 : RVPShiftD_ri<0b010, 0b100, "usati">;
+
+ def PSRARI_W : RVPShiftW_ri<0b101, 0b100, "psrari.w">;
+ def SRARI_RV64 : RVPShiftD_ri<0b101, 0b100, "srari">;
+
+ def PSATI_W : RVPShiftW_ri<0b110, 0b100, "psati.w">;
+ def SATI_RV64 : RVPShiftD_ri<0b110, 0b100, "sati">;
+} // Predicates = [HasStdExtP, IsRV64]
+
+let Predicates = [HasStdExtP] in {
+ def PSRL_HS : RVPBinaryScalar_rr<0b000, 0b00, 0b100, "psrl.hs">;
+ def PSRL_BS : RVPBinaryScalar_rr<0b000, 0b10, 0b100, "psrl.bs">;
+
+ def PREDSUM_HS : RVPBinaryScalar_rr<0b001, 0b00, 0b100, "predsum.hs">;
+ def PREDSUM_BS : RVPBinaryScalar_rr<0b001, 0b10, 0b100, "predsum.bs">;
+
+ def PREDSUMU_HS : RVPBinaryScalar_rr<0b011, 0b00, 0b100, "predsumu.hs">;
+ def PREDSUMU_BS : RVPBinaryScalar_rr<0b011, 0b10, 0b100, "predsumu.bs">;
+
+ def PSRA_HS : RVPBinaryScalar_rr<0b100, 0b00, 0b100, "psra.hs">;
+ def PSRA_BS : RVPBinaryScalar_rr<0b100, 0b10, 0b100, "psra.bs">;
+} // Predicates = [HasStdExtP]
+let Predicates = [HasStdExtP, IsRV64] in {
+ def PSRL_WS : RVPBinaryScalar_rr<0b000, 0b01, 0b100, "psrl.ws">;
+
+ def PREDSUM_WS : RVPBinaryScalar_rr<0b001, 0b01, 0b100, "predsum.ws">;
+
+ def PREDSUMU_WS : RVPBinaryScalar_rr<0b011, 0b01, 0b100, "predsumu.ws">;
+
+ def PSRA_WS : RVPBinaryScalar_rr<0b100, 0b01, 0b100, "psra.ws">;
+} // Predicates = [HasStdExtP, IsRV64]
+
+let Predicates = [HasStdExtP] in {
+ def PADD_H : RVPBinary_rr<0b0000, 0b00, 0b000, "padd.h">;
+ def PADD_B : RVPBinary_rr<0b0000, 0b10, 0b000, "padd.b">;
+
+ def PSADD_H : RVPBinary_rr<0b0010, 0b00, 0b000, "psadd.h">;
+ def PSADD_B : RVPBinary_rr<0b0010, 0b10, 0b000, "psadd.b">;
+
+ def PAADD_H : RVPBinary_rr<0b0011, 0b00, 0b000, "paadd.h">;
+ def PAADD_B : RVPBinary_rr<0b0011, 0b10, 0b000, "paadd.b">;
+
+ def PSADDU_H : RVPBinary_rr<0b0110, 0b00, 0b000, "psaddu.h">;
+ def PSADDU_B : RVPBinary_rr<0b0110, 0b10, 0b000, "psaddu.b">;
+
+ def PAADDU_H : RVPBinary_rr<0b0111, 0b00, 0b000, "paaddu.h">;
+ def PAADDU_B : RVPBinary_rr<0b0111, 0b10, 0b000, "paaddu.b">;
+
+ def PSUB_H : RVPBinary_rr<0b1000, 0b00, 0b000, "psub.h">;
+ def PSUB_B : RVPBinary_rr<0b1000, 0b10, 0b000, "psub.b">;
+
+ def PDIF_H : RVPBinary_rr<0b1001, 0b00, 0b000, "pdif.h">;
+ def PDIF_B : RVPBinary_rr<0b1001, 0b10, 0b000, "pdif.b">;
+
+ def PSSUB_H : RVPBinary_rr<0b1010, 0b00, 0b000, "pssub.h">;
+ def PSSUB_B : RVPBinary_rr<0b1010, 0b10, 0b000, "pssub.b">;
+
+ def PASUB_H : RVPBinary_rr<0b1011, 0b00, 0b000, "pasub.h">;
+ def PASUB_B : RVPBinary_rr<0b1011, 0b10, 0b000, "pasub.b">;
+
+ def PDIFU_H : RVPBinary_rr<0b1101, 0b00, 0b000, "pdifu.h">;
+ def PDIFU_B : RVPBinary_rr<0b1101, 0b10, 0b000, "pdifu.b">;
+
+ def PSSUBU_H : RVPBinary_rr<0b1110, 0b00, 0b000, "pssubu.h">;
+ def PSSUBU_B : RVPBinary_rr<0b1110, 0b10, 0b000, "pssubu.b">;
+
+ def PASUBU_H : RVPBinary_rr<0b1111, 0b00, 0b000, "pasubu.h">;
+ def PASUBU_B : RVPBinary_rr<0b1111, 0b10, 0b000, "pasubu.b">;
+} // Predicates = [HasStdExtP]
+let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in {
+ def SADD : RVPBinary_rr<0b0010, 0b01, 0b000, "sadd">;
+
+ def AADD : RVPBinary_rr<0b0011, 0b01, 0b000, "aadd">;
+
+ def SADDU : RVPBinary_rr<0b0110, 0b01, 0b000, "saddu">;
+
+ def AADDU : RVPBinary_rr<0b0111, 0b01, 0b000, "aaddu">;
+
+ def SSUB : RVPBinary_rr<0b1010, 0b01, 0b000, "ssub">;
+
+ def ASUB : RVPBinary_rr<0b1011, 0b01, 0b000, "asub">;
+
+ def SSUBU : RVPBinary_rr<0b1110, 0b01, 0b000, "ssubu">;
+
+ def ASUBU : RVPBinary_rr<0b1111, 0b01, 0b000, "asubu">;
+} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only"
+let Predicates = [HasStdExtP, IsRV64] in {
+ def PADD_W : RVPBinary_rr<0b0000, 0b01, 0b000, "padd.w">;
+
+ def PSADD_W : RVPBinary_rr<0b0010, 0b01, 0b000, "psadd.w">;
+
+ def PAADD_W : RVPBinary_rr<0b0011, 0b01, 0b000, "paadd.w">;
+
+ def PSADDU_W : RVPBinary_rr<0b0110, 0b01, 0b000, "psaddu.w">;
+
+ def PAADDU_W : RVPBinary_rr<0b0111, 0b01, 0b000, "paaddu.w">;
+
+ def PSUB_W : RVPBinary_rr<0b1000, 0b01, 0b000, "psub.w">;
+
+ def PSSUB_W : RVPBinary_rr<0b1010, 0b01, 0b000, "pssub.w">;
+
+ def PASUB_W : RVPBinary_rr<0b1011, 0b01, 0b000, "pasub.w">;
+
+ def PSSUBU_W : RVPBinary_rr<0b1110, 0b01, 0b000, "pssubu.w">;
+
+ def PASUBU_W : RVPBinary_rr<0b1111, 0b01, 0b000, "pasubu.w">;
+} // Predicates = [HasStdExtP, IsRV64]
+
+let Predicates = [HasStdExtP] in {
+ def SLX : RVPBinary_rr<0b0001, 0b11, 0b001, "slx">;
+
+ def PMUL_H_B01 : RVPBinary_rr<0b0010, 0b00, 0b001, "pmul.h.b01">;
+
+ def MVM : RVPTernary_rrr<0b0101, 0b00, 0b001, "mvm">;
+ def MVMN : RVPTernary_rrr<0b0101, 0b01, 0b001, "mvmn">;
+ def MERGE : RVPTernary_rrr<0b0101, 0b10, 0b001, "merge">;
+ def SRX : RVPTernary_rrr<0b0101, 0b11, 0b001, "srx">;
+
+ def PMULU_H_B01 : RVPBinary_rr<0b0110, 0b00, 0b001, "pmulu.h.b01">;
+ def PDIFSUMU_B : RVPBinary_rr<0b0110, 0b10, 0b001, "pdifsumu.b">;
+
+ def PDIFSUMAU_B : RVPTernary_rrr<0b0111, 0b10, 0b001, "pdifsumau.b">;
+} // Predicates = [HasStdExtP]
+let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in {
+ def MUL_H01 : RVPBinary_rr<0b0010, 0b01, 0b001, "mul.h01">;
+
+ def MACC_H01 : RVPTernary_rrr<0b0011, 0b01, 0b001, "macc.h01">;
+
+ def MULU_H01 : RVPBinary_rr<0b0110, 0b01, 0b001, "mulu.h01">;
+
+ def MACCU_H01 : RVPTernary_rrr<0b0111, 0b01, 0b001, "maccu.h01">;
+} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only"
+let Predicates = [HasStdExtP, IsRV64] in {
+ def PMUL_W_H01 : RVPBinary_rr<0b0010, 0b01, 0b001, "pmul.w.h01">;
+ def MUL_W01 : RVPBinary_rr<0b0010, 0b11, 0b001, "mul.w01">;
+
+ def PMACC_W_H01 : RVPTernary_rrr<0b0011, 0b01, 0b001, "pmacc.w.h01">;
+ def MACC_W01 : RVPTernary_rrr<0b0011, 0b11, 0b001, "macc.w01">;
+
+ def PMULU_W_H01 : RVPBinary_rr<0b0110, 0b01, 0b001, "pmulu.w.h01">;
+ def MULU_W01 : RVPBinary_rr<0b0110, 0b11, 0b001, "mulu.w01">;
+
+ def PMACCU_W_H01 : RVPTernary_rrr<0b0111, 0b01, 0b001, "pmaccu.w.h01">;
+ def MACCU_W01 : RVPTernary_rrr<0b0111, 0b11, 0b001, "maccu.w01">;
+} // Predicates = [HasStdExtP, IsRV64]
+
+// Note the spec has a 3-bit f field in bits 30:28 with 0 in bit 27. Here we
+// include the 0 in the f field to reduce the number of TableGen classes.
+let Predicates = [HasStdExtP] in {
+ def PSH1ADD_H : RVPBinary_rr<0b0100, 0b00, 0b010, "psh1add.h">;
+
+ def PSSH1SADD_H : RVPBinary_rr<0b0110, 0b00, 0b010, "pssh1sadd.h">;
+} // Predicates = [HasStdExtP]
+let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in {
+ def SSH1SADD : RVPBinary_rr<0b0110, 0b01, 0b010, "ssh1sadd">;
+} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only"
+let Predicates = [HasStdExtP, IsRV64] in {
+ def PSH1ADD_W : RVPBinary_rr<0b0100, 0b01, 0b010, "psh1add.w">;
+
+ def PSSH1SADD_W : RVPBinary_rr<0b0110, 0b01, 0b010, "pssh1sadd.w">;
+
+ def UNZIP8P : RVPBinary_rr<0b1100, 0b00, 0b010, "unzip8p">;
+ def UNZIP16P : RVPBinary_rr<0b1100, 0b01, 0b010, "unzip16p">;
+ def UNZIP8HP : RVPBinary_rr<0b1100, 0b10, 0b010, "unzip8hp">;
+ def UNZIP16HP : RVPBinary_rr<0b1100, 0b11, 0b010, "unzip16hp">;
+
+ def ZIP8P : RVPBinary_rr<0b1110, 0b00, 0b010, "zip8p">;
+ def ZIP16P : RVPBinary_rr<0b1110, 0b01, 0b010, "zip16p">;
+ def ZIP8HP : RVPBinary_rr<0b1110, 0b10, 0b010, "zip8hp">;
+ def ZIP16HP : RVPBinary_rr<0b1110, 0b11, 0b010, "zip16hp">;
+} // Predicates = [HasStdExtP, IsRV64]
+
+let Predicates = [HasStdExtP] in {
+ def PMUL_H_B00 : RVPBinary_rr<0b0000, 0b00, 0b011, "pmul.h.b00">;
+
+ def PMUL_H_B11 : RVPBinary_rr<0b0010, 0b00, 0b011, "pmul.h.b11">;
+
+ def PMULU_H_B00 : RVPBinary_rr<0b0100, 0b00, 0b011, "pmulu.h.b00">;
+
+ def PMULU_H_B11 : RVPBinary_rr<0b0110, 0b00, 0b011, "pmulu.h.b11">;
+
+ def PMULSU_H_B00 : RVPBinary_rr<0b1100, 0b00, 0b011, "pmulsu.h.b00">;
+
+ def PMULSU_H_B11 : RVPBinary_rr<0b1110, 0b00, 0b011, "pmulsu.h.b11">;
+} // Predicates = [HasStdExtP]
+let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in {
+ def MUL_H00 : RVPBinary_rr<0b0000, 0b01, 0b011, "mul.h00">;
+
+ def MACC_H00 : RVPTernary_rrr<0b0001, 0b01, 0b011, "macc.h00">;
+
+ def MUL_H11 : RVPBinary_rr<0b0010, 0b01, 0b011, "mul.h11">;
+
+ def MACC_H11 : RVPTernary_rrr<0b0011, 0b01, 0b011, "macc.h11">;
+
+ def MULU_H00 : RVPBinary_rr<0b0100, 0b01, 0b011, "mulu.h00">;
+
+ def MACCU_H00 : RVPTernary_rrr<0b0101, 0b01, 0b011, "maccu.h00">;
+
+ def MULU_H11 : RVPBinary_rr<0b0110, 0b01, 0b011, "mulu.h11">;
+
+ def MACCU_H11 : RVPTernary_rrr<0b0111, 0b01, 0b011, "maccu.h11">;
+
+ def MULSU_H00 : RVPBinary_rr<0b1100, 0b01, 0b011, "mulsu.h00">;
+
+ def MACCSU_H00 : RVPTernary_rrr<0b1101, 0b01, 0b011, "maccsu.h00">;
+
+ def MULSU_H11 : RVPBinary_rr<0b1110, 0b01, 0b011, "mulsu.h11">;
+
+ def MACCSU_H11 : RVPTernary_rrr<0b1111, 0b01, 0b011, "maccsu.h11">;
+} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only"
+let Predicates = [HasStdExtP, IsRV64] in {
+ def PMUL_W_H00 : RVPBinary_rr<0b0000, 0b01, 0b011, "pmul.w.h00">;
+ def MUL_W00 : RVPBinary_rr<0b0000, 0b11, 0b011, "mul.w00">;
+
+ def PMACC_W_H00 : RVPTernary_rrr<0b0001, 0b01, 0b011, "pmacc.w.h00">;
+ def MACC_W00 : RVPTernary_rrr<0b0001, 0b11, 0b011, "macc.w00">;
+
+ def PMUL_W_H11 : RVPBinary_rr<0b0010, 0b01, 0b011, "pmul.w.h11">;
+ def MUL_W11 : RVPBinary_rr<0b0010, 0b11, 0b011, "mul.w11">;
+
+ def PMACC_W_H11 : RVPTernary_rrr<0b0011, 0b01, 0b011, "pmacc.w.h11">;
+ def MACC_W11 : RVPTernary_rrr<0b0011, 0b11, 0b011, "macc.w11">;
+
+ def PMULU_W_H00 : RVPBinary_rr<0b0100, 0b01, 0b011, "pmulu.w.h00">;
+ def MULU_W00 : RVPBinary_rr<0b0100, 0b11, 0b011, "mulu.w00">;
+
+ def PMACCU_W_H00 : RVPTernary_rrr<0b0101, 0b01, 0b011, "pmaccu.w.h00">;
+ def MACCU_W00 : RVPTernary_rrr<0b0101, 0b11, 0b011, "maccu.w00">;
+
+ def PMULU_W_H11 : RVPBinary_rr<0b0110, 0b01, 0b011, "pmulu.w.h11">;
+ def MULU_W11 : RVPBinary_rr<0b0110, 0b11, 0b011, "mulu.w11">;
+
+ def PMACCU_W_H11 : RVPTernary_rrr<0b0111, 0b01, 0b011, "pmaccu.w.h11">;
+ def MACCU_W11 : RVPTernary_rrr<0b0111, 0b11, 0b011, "maccu.w11">;
+
+ def PMULSU_W_H00 : RVPBinary_rr<0b1100, 0b01, 0b011, "pmulsu.w.h00">;
+ def MULSU_W00 : RVPBinary_rr<0b1100, 0b11, 0b011, "mulsu.w00">;
+
+ def PMACCSU_W_H00 : RVPTernary_rrr<0b1101, 0b01, 0b011, "pmaccsu.w.h00">;
+ def MACCSU_W00 : RVPTernary_rrr<0b1101, 0b11, 0b011, "maccsu.w00">;
+
+ def PMULSU_W_H11 : RVPBinary_rr<0b1110, 0b01, 0b011, "pmulsu.w.h11">;
+ def MULSU_W11 : RVPBinary_rr<0b1110, 0b11, 0b011, "mulsu.w11">;
+
+ def PMACCSU_W_H11 : RVPTernary_rrr<0b1111, 0b01, 0b011, "pmaccsu.w.h11">;
+ def MACCSU_W11 : RVPTernary_rrr<0b1111, 0b11, 0b011, "maccsu.w11">;
+} // Predicates = [HasStdExtP, IsRV64]
+
+// Note the spec has a 3-bit f field in bits 30:28 with 0 in bit 27. Here we
+// include the 0 in the f field to reduce the number of TableGen classes.
+let Predicates = [HasStdExtP] in {
+ def PPACK_H : RVPBinary_rr<0b0000, 0b00, 0b100, "ppack.h">;
+
+ def PPACKBT_H : RVPBinary_rr<0b0010, 0b00, 0b100, "ppackbt.h">;
+
+ def PPACKTB_H : RVPBinary_rr<0b0100, 0b00, 0b100, "ppacktb.h">;
+
+ def PPACKT_H : RVPBinary_rr<0b0110, 0b00, 0b100, "ppackt.h">;
+} // Predicates = [HasStdExtP]
+let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in {
+ def PACKBT_RV32 : RVPBinary_rr<0b0010, 0b01, 0b100, "packbt">;
+
+ def PACKTB_RV32 : RVPBinary_rr<0b0100, 0b01, 0b100, "packtb">;
+
+ def PACKT_RV32 : RVPBinary_rr<0b0110, 0b01, 0b100, "packt">;
+} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only"
+let Predicates = [HasStdExtP, IsRV64] in {
+ def PPACK_W : RVPBinary_rr<0b0000, 0b01, 0b100, "ppack.w">;
+
+ def PPACKBT_W : RVPBinary_rr<0b0010, 0b01, 0b100, "ppackbt.w">;
+ def PACKBT_RV64 : RVPBinary_rr<0b0010, 0b11, 0b100, "packbt">;
+
+ def PPACKTB_W : RVPBinary_rr<0b0100, 0b01, 0b100, "ppacktb.w">;
+ def PACKTB_RV64 : RVPBinary_rr<0b0100, 0b11, 0b100, "packtb">;
+
+ def PPACKT_W : RVPBinary_rr<0b0110, 0b01, 0b100, "ppackt.w">;
+ def PACKT_RV64 : RVPBinary_rr<0b0110, 0b11, 0b100, "packt">;
+} // Predicates = [HasStdExtP, IsRV64]
+
+let Predicates = [HasStdExtP, IsRV32] in {
+ def PLI_DH : RVPPairLoadImm_i<0b0011000, (ins simm10:$imm10), "pli.dh",
+ "$rd, $imm10"> {
+ bits<10> imm10;
+
+ let Inst{24-16} = imm10{8-0};
+ let Inst{15} = imm10{9};
+ }
+
+ def PLI_DB : RVPPairLoadImm_i<0b0011010, (ins simm8_unsigned:$imm8), "pli.db",
+ "$rd, $imm8"> {
+ bits<8> imm8;
+
+ let Inst{24} = 0b0;
+ let Inst{23-16} = imm8;
+ let Inst{15} = 0b0;
+ }
+
+ def PLUI_DH : RVPPairLoadImm_i<0b0111000, (ins simm10_unsigned:$imm10),
+ "plui.dh", "$rd, $imm10"> {
+ bits<10> imm10;
+
+ let Inst{24} = imm10{0};
+ let Inst{23-15} = imm10{9-1};
+ }
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 33c7138..cebab21 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -1703,8 +1703,9 @@ let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in {
defm VSLIDEUP_V : VSLD_IV_X_I<"vslideup", 0b001110, /*slidesUp=*/true>;
defm VSLIDE1UP_V : VSLD1_MV_X<"vslide1up", 0b001110>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp
+let ReadsPastVL = 1 in
defm VSLIDEDOWN_V : VSLD_IV_X_I<"vslidedown", 0b001111, /*slidesUp=*/false>;
-let ElementsDependOn = EltDepsVL in
+let ElementsDependOn = EltDepsVL, ReadsPastVL = 1 in
defm VSLIDE1DOWN_V : VSLD1_MV_X<"vslide1down", 0b001111>;
} // Predicates = [HasVInstructions]
@@ -1712,19 +1713,19 @@ let Predicates = [HasVInstructionsAnyF] in {
let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in {
defm VFSLIDE1UP_V : VSLD1_FV_F<"vfslide1up", 0b001110>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp
-let ElementsDependOn = EltDepsVL in
+let ElementsDependOn = EltDepsVL, ReadsPastVL = 1 in
defm VFSLIDE1DOWN_V : VSLD1_FV_F<"vfslide1down", 0b001111>;
} // Predicates = [HasVInstructionsAnyF]
let Predicates = [HasVInstructions] in {
// Vector Register Gather Instruction
-let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in {
+let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather, ReadsPastVL = 1 in {
defm VRGATHER_V : VGTR_IV_V_X_I<"vrgather", 0b001100>;
def VRGATHEREI16_VV : VALUVV<0b001110, OPIVV, "vrgatherei16.vv">,
SchedBinaryMC<"WriteVRGatherEI16VV",
"ReadVRGatherEI16VV_data",
"ReadVRGatherEI16VV_index">;
-} // Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather
+} // Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather, ReadsPastVL = 1
// Vector Compress Instruction
let Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress, ElementsDependOn = EltDepsVLMask in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index c75addd9..1fb30a0b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -420,7 +420,7 @@ class NDSRVInstVD4DOT<bits<6> funct6, string opcodestr>
}
class NDSRVInstVBFHCvt<bits<5> vs1, string opcodestr>
- : RVInst<(outs VR:$vd), (ins VR:$vs2, VMaskOp:$vm),
+ : RVInst<(outs VR:$vd), (ins VR:$vs2),
opcodestr, "$vd, $vs2", [], InstFormatR> {
bits<5> vs2;
bits<5> vd;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td
index 0c8487c..889ea98 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td
@@ -129,20 +129,20 @@ class Mips_prefetch_ri<dag outs, dag ins, string opcodestr, string argstr>
// MIPS extensions
//===----------------------------------------------------------------------===//
let Predicates = [HasVendorXMIPSCBOP] ,DecoderNamespace = "Xmipscbop" in {
- def MIPS_PREFETCH : Mips_prefetch_ri<(outs), (ins GPR:$rs1, uimm9:$imm9, uimm5:$hint),
- "mips.pref", "$hint, ${imm9}(${rs1})">,
- Sched<[]>;
+ def MIPS_PREF : Mips_prefetch_ri<(outs), (ins GPR:$rs1, uimm9:$imm9, uimm5:$hint),
+ "mips.pref", "$hint, ${imm9}(${rs1})">,
+ Sched<[]>;
}
let Predicates = [HasVendorXMIPSCBOP] in {
// Prefetch Data Write.
def : Pat<(prefetch (AddrRegImm9 (XLenVT GPR:$rs1), uimm9:$imm9),
(i32 1), timm, (i32 1)),
- (MIPS_PREFETCH GPR:$rs1, uimm9:$imm9, 9)>;
+ (MIPS_PREF GPR:$rs1, uimm9:$imm9, 9)>;
// Prefetch Data Read.
def : Pat<(prefetch (AddrRegImm9 (XLenVT GPR:$rs1), uimm9:$imm9),
(i32 0), timm, (i32 1)),
- (MIPS_PREFETCH GPR:$rs1, uimm9:$imm9, 8)>;
+ (MIPS_PREF GPR:$rs1, uimm9:$imm9, 8)>;
}
let Predicates = [HasVendorXMIPSCMov], hasSideEffects = 0, mayLoad = 0, mayStore = 0,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
index ebcf079..3a6ce3c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
@@ -58,7 +58,7 @@ class CustomRivosXVI<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
let Predicates = [HasVendorXRivosVizip], DecoderNamespace = "XRivos",
Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather,
- Inst<6-0> = OPC_CUSTOM_2.Value in {
+ Inst<6-0> = OPC_CUSTOM_2.Value, ReadsPastVL = 1 in {
defm RI_VZIPEVEN_V : VALU_IV_V<"ri.vzipeven", 0b001100>;
defm RI_VZIPODD_V : VALU_IV_V<"ri.vzipodd", 0b011100>;
defm RI_VZIP2A_V : VALU_IV_V<"ri.vzip2a", 0b000100>;
@@ -126,6 +126,7 @@ def RI_VINSERT : CustomRivosVXI<0b010000, OPMVX, (outs VR:$vd_wb),
(ins VR:$vd, GPR:$rs1, uimm5:$imm),
"ri.vinsert.v.x", "$vd, $rs1, $imm">;
+let ReadsPastVL = 1 in
def RI_VEXTRACT : CustomRivosXVI<0b010111, OPMVV, (outs GPR:$rd),
(ins VR:$vs2, uimm5:$imm),
"ri.vextract.x.v", "$rd, $vs2, $imm">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index a47dfe3..b546339 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -74,6 +74,7 @@ class RVInstVCCustom2<bits<4> funct6_hi4, bits<3> funct3, dag outs, dag ins,
let Uses = [VL, VTYPE];
let RVVConstraint = NoConstraint;
let ElementsDependOn = EltDepsVLMask;
+ let ReadsPastVL = 1;
}
class RVInstVCFCustom2<bits<4> funct6_hi4, bits<3> funct3, dag outs, dag ins,
@@ -98,6 +99,7 @@ class RVInstVCFCustom2<bits<4> funct6_hi4, bits<3> funct3, dag outs, dag ins,
let Uses = [VL, VTYPE];
let RVVConstraint = NoConstraint;
let ElementsDependOn = EltDepsVLMask;
+ let ReadsPastVL = 1;
}
class VCIXInfo<string suffix, VCIXType type, DAGOperand TyRd,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
index 66cb2d5..a5ee701 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
@@ -65,6 +65,7 @@ class SFInstTileMemOp<dag outs, dag ins, bits<3> nf, RISCVOpcode opcode,
let Inst{6-0} = opcode.Value;
let Uses = [VTYPE, VL];
+ let ReadsPastVL = 1;
}
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
@@ -94,6 +95,7 @@ class SFInstTileMoveOp<bits<6> funct6, dag outs, dag ins, string opcodestr,
let Inst{6-0} = OPC_OP_V.Value;
let Uses = [VTYPE, VL];
+ let ReadsPastVL = 1;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
@@ -113,6 +115,7 @@ class SFInstMatmulF<dag outs, dag ins, string opcodestr, string argstr>
let Inst{6-0} = OPC_OP_VE.Value;
let Uses = [VTYPE, VL];
+ let ReadsPastVL = 1;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
@@ -135,6 +138,7 @@ class SFInstMatmulF8<bit a, bit b, dag outs, dag ins,
let Inst{6-0} = OPC_OP_VE.Value;
let Uses = [VTYPE, VL];
+ let ReadsPastVL = 1;
}
@@ -167,6 +171,7 @@ class SFInstMatmulI8<bit funct6_1, bit a, bit b, dag outs, dag ins,
let Inst{6-0} = OPC_OP_VE.Value;
let Uses = [VTYPE, VL];
+ let ReadsPastVL = 1;
}
class I8Encode<bit encoding, string name> {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td
new file mode 100644
index 0000000..980931e
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td
@@ -0,0 +1,139 @@
+//===-- RISCVInstrInfoXSpacemiT.td -------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the vendor extensions defined by SpacemiT.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Operand definitions.
+//===----------------------------------------------------------------------===//
+
+class SMTVDotOpcode<bits<7> val> {
+ bits<7> Value = val;
+}
+
+class SMTVEncoding2<bits<2> val> {
+ bits<2> Value = val;
+}
+
+def OPMMA : SMTVDotOpcode<0b1110001>;
+def OPMMA_SLIDE : SMTVDotOpcode<0b1110011>;
+
+//===----------------------------------------------------------------------===//
+// Vector Dot-Product Sign Encoding
+// Defines the signed/unsigned mixing modes for vector dot-product operations.
+// Encoding format: [1:0] bits
+// 00: UU (Unsigned x Unsigned)
+// 01: US (Unsigned x Signed)
+// 10: SU (Signed x Unsigned)
+// 11: SS (Signed x Signed)
+//===----------------------------------------------------------------------===//
+def SMT_VDot_UU : SMTVEncoding2<0b00>;
+def SMT_VDot_US : SMTVEncoding2<0b01>;
+def SMT_VDot_SU : SMTVEncoding2<0b10>;
+def SMT_VDot_SS : SMTVEncoding2<0b11>;
+
+//===----------------------------------------------------------------------===//
+// Vector Dot-Product Sliding Window Modes
+// Encoding format: [1:0] bits
+// 00: Slide1 (1-element sliding stride)
+// 01: Slide2 (2-element sliding stride)
+// 10: Slide3 (3-element sliding stride)
+// 11: Reserved
+//
+// Used in sliding-window dot-product operations:
+// vd = vs1 • vs2.slide{1|2|3} // • = dot product
+//===----------------------------------------------------------------------===//
+def SMT_VDot_Slide1 : SMTVEncoding2<0b00>;
+def SMT_VDot_Slide2 : SMTVEncoding2<0b01>;
+def SMT_VDot_Slide3 : SMTVEncoding2<0b10>;
+
+//===----------------------------------------------------------------------===//
+// Instruction formats
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+// Base vector dot product (no slide) format.
+class RVInstSMTVDot<SMTVEncoding2 sign, string opcodestr, string argstr>
+ : RVInst<(outs VRM2:$vd), (ins VR:$vs1, VR:$vs2), opcodestr, argstr, [], InstFormatR> {
+ bits<5> vd;
+ bits<5> vs1;
+ bits<5> vs2;
+
+ let Inst{31-25} = OPMMA.Value;
+ let Inst{24-20} = vs2;
+ let Inst{19-15} = vs1;
+ let Inst{14} = 0b0;
+ let Inst{13-12} = sign.Value;
+ let Inst{11-8} = vd{4-1};
+ let Inst{7} = 0b0;
+ let Inst{6-0} = OPC_CUSTOM_1.Value;
+}
+
+// Sliding-window vector dot product format.
+class RVInstSMTVDotSlide<SMTVEncoding2 funct2, SMTVEncoding2 sign, string opcodestr, string argstr>
+ : RVInst<(outs VRM2:$vd), (ins VRM2:$vs1, VR:$vs2), opcodestr, argstr, [], InstFormatR> {
+ bits<5> vd;
+ bits<5> vs1;
+ bits<5> vs2;
+
+ let Inst{31-25} = OPMMA_SLIDE.Value;
+ let Inst{24-20} = vs2;
+ let Inst{19-16} = vs1{4-1};
+ let Inst{15-14} = funct2.Value;
+ let Inst{13-12} = sign.Value;
+ let Inst{11-8} = vd{4-1};
+ let Inst{7} = 0b0;
+ let Inst{6-0} = OPC_CUSTOM_1.Value;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+let DecoderNamespace = "XSMT" in {
+
+let Predicates = [HasVendorXSMTVDot], ElementsDependOn = EltDepsVL in {
+// Base vector dot product (no slide) instructions
+// NOTE: Destination registers (vd) MUST be even-numbered (v0, v2, ..., v30)
+// due to hardware alignment constraints. Using odd registers may cause undefined behavior.
+def VMADOT : RVInstSMTVDot<SMT_VDot_SS, "smt.vmadot", "$vd, $vs1, $vs2">;
+def VMADOTU : RVInstSMTVDot<SMT_VDot_UU, "smt.vmadotu", "$vd, $vs1, $vs2">;
+def VMADOTSU : RVInstSMTVDot<SMT_VDot_SU, "smt.vmadotsu", "$vd, $vs1, $vs2">;
+def VMADOTUS : RVInstSMTVDot<SMT_VDot_US, "smt.vmadotus", "$vd, $vs1, $vs2">;
+
+//===----------------------------------------------------------------------===//
+// Sliding-window Vector Dot Product Instructions
+//
+// The numeric suffix (1, 2, 3) specifies the stride of the sliding window:
+// 1: Window slides by 1 element per operation
+// 2: Window slides by 2 elements per operation
+// 3: Window slides by 3 elements per operation
+//
+// These instructions compute dot products with overlapping operand windows
+// where the window position increments by <N> elements between computations.
+//===----------------------------------------------------------------------===//
+// NOTE: Destination registers (vd) and first source register (vs1) MUST be
+// even-numbered (v0, v2, ..., v30) due to hardware alignment constraints.
+// Using odd registers may cause undefined behavior.
+def VMADOT1 : RVInstSMTVDotSlide<SMT_VDot_Slide1, SMT_VDot_SS, "smt.vmadot1", "$vd, $vs1, $vs2">;
+def VMADOT1U : RVInstSMTVDotSlide<SMT_VDot_Slide1, SMT_VDot_UU, "smt.vmadot1u", "$vd, $vs1, $vs2">;
+def VMADOT1SU : RVInstSMTVDotSlide<SMT_VDot_Slide1, SMT_VDot_SU, "smt.vmadot1su", "$vd, $vs1, $vs2">;
+def VMADOT1US : RVInstSMTVDotSlide<SMT_VDot_Slide1, SMT_VDot_US, "smt.vmadot1us", "$vd, $vs1, $vs2">;
+def VMADOT2 : RVInstSMTVDotSlide<SMT_VDot_Slide2, SMT_VDot_SS, "smt.vmadot2", "$vd, $vs1, $vs2">;
+def VMADOT2U : RVInstSMTVDotSlide<SMT_VDot_Slide2, SMT_VDot_UU, "smt.vmadot2u", "$vd, $vs1, $vs2">;
+def VMADOT2SU : RVInstSMTVDotSlide<SMT_VDot_Slide2, SMT_VDot_SU, "smt.vmadot2su", "$vd, $vs1, $vs2">;
+def VMADOT2US : RVInstSMTVDotSlide<SMT_VDot_Slide2, SMT_VDot_US, "smt.vmadot2us", "$vd, $vs1, $vs2">;
+def VMADOT3 : RVInstSMTVDotSlide<SMT_VDot_Slide3, SMT_VDot_SS, "smt.vmadot3", "$vd, $vs1, $vs2">;
+def VMADOT3U : RVInstSMTVDotSlide<SMT_VDot_Slide3, SMT_VDot_UU, "smt.vmadot3u", "$vd, $vs1, $vs2">;
+def VMADOT3SU : RVInstSMTVDotSlide<SMT_VDot_Slide3, SMT_VDot_SU, "smt.vmadot3su", "$vd, $vs1, $vs2">;
+def VMADOT3US : RVInstSMTVDotSlide<SMT_VDot_Slide3, SMT_VDot_US, "smt.vmadot3us", "$vd, $vs1, $vs2">;
+}
+}
\ No newline at end of file
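A minimal standalone C++ sketch (not part of the patch) that reproduces the base, non-sliding encoding laid out by RVInstSMTVDot above. The custom-1 major opcode value (0b0101011) is an assumption from the standard RISC-V opcode map rather than taken from this diff; every other field mirrors the TableGen `let Inst{...}` assignments, including the even-vd constraint implied by encoding only vd{4-1}.

#include <cassert>
#include <cstdint>

// Illustrative only: assembles a base smt.vmadot* instruction word.
static uint32_t encodeSMTVMADot(unsigned vd, unsigned vs1, unsigned vs2,
                                unsigned sign /* [1:0], e.g. 0b11 = SS */) {
  assert(vd % 2 == 0 && "vd must be even: only vd{4-1} is encoded");
  const uint32_t OPMMA = 0b1110001;      // Inst{31-25}
  const uint32_t OpcCustom1 = 0b0101011; // Inst{6-0}; assumed custom-1 value
  return (OPMMA << 25) | ((vs2 & 0x1F) << 20) | ((vs1 & 0x1F) << 15) |
         (0u << 14) | ((sign & 0b11) << 12) | (((vd >> 1) & 0xF) << 8) |
         (0u << 7) | OpcCustom1;
}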
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 413ad8b..a31afaa 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -692,6 +692,21 @@ def : Pat<(binop_allwusers<or>
(shl GPR:$op1rs1, (XLenVT 24))),
(shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))),
(PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+
+def : Pat<(i64 (or (or (zexti16 (XLenVT GPR:$rs1)),
+ (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))),
+ (sext_inreg (shl GPR:$op1rs1, (XLenVT 24)), i32))),
+ (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+
+// Match a pattern of 2 halfwords being inserted into bits [63:32], with
+// bits [31:0] coming from a zero-extended value. We can use pack with packw
+// for bits [63:32]. If bits [31:0] can also be a packw, it can be matched
+// separately.
+def : Pat<(or (or (shl GPR:$op1rs2, (i64 48)),
+ (shl (zexti16 (i64 GPR:$op1rs1)), (i64 32))),
+ (zexti32 (i64 GPR:$rs1))),
+ (PACK (XLenVT GPR:$rs1),
+ (XLenVT (PACKW GPR:$op1rs1, GPR:$op1rs2)))>;
} // Predicates = [HasStdExtZbkb, IsRV64]
let Predicates = [HasStdExtZbb, IsRV32] in
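For reference, a self-contained C++ sketch of the value recognized by the new PACK/PACKW pattern above, using reference models of pack/packw per the Zbkb spec (the helper names are illustrative, not LLVM APIs): two halfwords land in bits [63:32] via packw, and the zero-extended word in bits [31:0] is merged in by pack.

#include <cassert>
#include <cstdint>

// Reference models (RV64): packw concatenates the low halfwords and
// sign-extends from 32 bits; pack concatenates the low words, rs2 on top.
static uint64_t packw(uint64_t rs1, uint64_t rs2) {
  uint64_t lo32 = ((rs2 & 0xFFFF) << 16) | (rs1 & 0xFFFF);
  return (lo32 & 0x80000000u) ? (lo32 | 0xFFFFFFFF00000000ull) : lo32;
}
static uint64_t pack(uint64_t rs1, uint64_t rs2) {
  return ((rs2 & 0xFFFFFFFFull) << 32) | (rs1 & 0xFFFFFFFFull);
}

int main() {
  uint64_t rs1 = 0x11223344AABBCCDDull; // zexti32 keeps bits [31:0]
  uint64_t op1rs1 = 0x1234, op1rs2 = 0xBEEF;
  // The DAG the pattern matches:
  //   (op1rs2 << 48) | (zexti16(op1rs1) << 32) | zexti32(rs1) ...
  uint64_t dag = (op1rs2 << 48) | ((op1rs1 & 0xFFFF) << 32) |
                 (rs1 & 0xFFFFFFFFull);
  // ... equals PACK(rs1, PACKW(op1rs1, op1rs2)).
  assert(dag == pack(rs1, packw(op1rs1, op1rs2)));
  return 0;
}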
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 726920e..c7b96f5 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -63,6 +63,12 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = {
Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask,
Intrinsic::riscv_seg8_load_mask};
+static const Intrinsic::ID FixedVlssegIntrIds[] = {
+ Intrinsic::riscv_sseg2_load_mask, Intrinsic::riscv_sseg3_load_mask,
+ Intrinsic::riscv_sseg4_load_mask, Intrinsic::riscv_sseg5_load_mask,
+ Intrinsic::riscv_sseg6_load_mask, Intrinsic::riscv_sseg7_load_mask,
+ Intrinsic::riscv_sseg8_load_mask};
+
static const Intrinsic::ID ScalableVlsegIntrIds[] = {
Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
@@ -197,9 +203,15 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool RISCVTargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices, unsigned Factor) const {
+ ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
assert(Indices.size() == Shuffles.size());
+ assert(GapMask.getBitWidth() == Factor);
+ // We only support cases where the skipped fields are the trailing ones.
+ // TODO: Lower to strided load if there is only a single active field.
+ unsigned MaskFactor = GapMask.popcount();
+ if (MaskFactor < 2 || !GapMask.isMask())
+ return false;
IRBuilder<> Builder(Load);
const DataLayout &DL = Load->getDataLayout();
@@ -208,20 +220,37 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
Value *Ptr, *VL;
Align Alignment;
- if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
+ if (!getMemOperands(MaskFactor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
return false;
Type *PtrTy = Ptr->getType();
unsigned AS = PtrTy->getPointerAddressSpace();
- if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+ if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL))
return false;
- CallInst *VlsegN = Builder.CreateIntrinsic(
- FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
+ CallInst *SegLoad = nullptr;
+ if (MaskFactor < Factor) {
+ // Lower to strided segmented load.
+ unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
+ Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
+ SegLoad = Builder.CreateIntrinsic(FixedVlssegIntrIds[MaskFactor - 2],
+ {VTy, PtrTy, XLenTy, XLenTy},
+ {Ptr, Stride, Mask, VL});
+ } else {
+ // Lower to normal segmented load.
+ SegLoad = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
+ {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
+ }
for (unsigned i = 0; i < Shuffles.size(); i++) {
- Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
- Shuffles[i]->replaceAllUsesWith(SubVec);
+ unsigned FactorIdx = Indices[i];
+ if (FactorIdx >= MaskFactor) {
+ // Replace masked-off factors (that are still extracted) with poison.
+ Shuffles[i]->replaceAllUsesWith(PoisonValue::get(VTy));
+ } else {
+ Value *SubVec = Builder.CreateExtractValue(SegLoad, FactorIdx);
+ Shuffles[i]->replaceAllUsesWith(SubVec);
+ }
}
return true;
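A small standalone sketch (hypothetical helper names, not LLVM API) of the arithmetic used by the strided-segment-load path above: with interleave factor 4, i32 elements and a gap mask of 0b0011 (only the two leading fields used), MaskFactor is 2 and the sseg2 strided load walks memory with a byte stride of Factor * element size = 16.

#include <bit>
#include <cstdint>

// Hypothetical helper mirroring the decision above; the real lowering also
// bails out unless the gap mask is a mask of trailing ones with at least two
// active fields (GapMask.isMask() && popcount >= 2).
struct StridedSegLoadPlan {
  unsigned NumFields;   // an sseg<NumFields> intrinsic is selected
  unsigned StrideBytes; // distance between consecutive segments in memory
};

static StridedSegLoadPlan planStridedSegLoad(unsigned Factor, uint32_t GapMask,
                                             unsigned EltSizeInBytes) {
  return {static_cast<unsigned>(std::popcount(GapMask)),
          Factor * EltSizeInBytes};
}

// Example: planStridedSegLoad(4, 0b0011, 4) yields {2, 16}.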
diff --git a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
index 7a2541a..d234dcf 100644
--- a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
@@ -26,6 +26,7 @@ struct RISCVMoveMerge : public MachineFunctionPass {
RISCVMoveMerge() : MachineFunctionPass(ID) {}
+ const RISCVSubtarget *ST;
const RISCVInstrInfo *TII;
const TargetRegisterInfo *TRI;
@@ -37,15 +38,15 @@ struct RISCVMoveMerge : public MachineFunctionPass {
// Merge the two instructions indicated into a single pair instruction.
MachineBasicBlock::iterator
mergePairedInsns(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired, unsigned Opcode);
+ MachineBasicBlock::iterator Paired, bool MoveFromSToA);
// Look for C.MV instruction that can be combined with
// the given instruction into CM.MVA01S or CM.MVSA01. Return the matching
// instruction if one exists.
MachineBasicBlock::iterator
- findMatchingInst(MachineBasicBlock::iterator &MBBI, unsigned InstOpcode,
+ findMatchingInst(MachineBasicBlock::iterator &MBBI, bool MoveFromSToA,
const DestSourcePair &RegPair);
- bool mergeMoveSARegPair(const RISCVSubtarget &STI, MachineBasicBlock &MBB);
+ bool mergeMoveSARegPair(MachineBasicBlock &MBB);
bool runOnMachineFunction(MachineFunction &Fn) override;
StringRef getPassName() const override { return RISCV_MOVE_MERGE_NAME; }
@@ -58,41 +59,21 @@ char RISCVMoveMerge::ID = 0;
INITIALIZE_PASS(RISCVMoveMerge, "riscv-move-merge", RISCV_MOVE_MERGE_NAME,
false, false)
-static bool isMoveFromAToS(unsigned Opcode) {
- switch (Opcode) {
- case RISCV::CM_MVA01S:
- case RISCV::QC_CM_MVA01S:
- return true;
- default:
- return false;
- }
-}
-
-static unsigned getMoveFromAToSOpcode(const RISCVSubtarget &STI) {
- if (STI.hasStdExtZcmp())
+static unsigned getMoveFromSToAOpcode(const RISCVSubtarget &ST) {
+ if (ST.hasStdExtZcmp())
return RISCV::CM_MVA01S;
- if (STI.hasVendorXqccmp())
+ if (ST.hasVendorXqccmp())
return RISCV::QC_CM_MVA01S;
llvm_unreachable("Unhandled subtarget with paired A to S move.");
}
-static bool isMoveFromSToA(unsigned Opcode) {
- switch (Opcode) {
- case RISCV::CM_MVSA01:
- case RISCV::QC_CM_MVSA01:
- return true;
- default:
- return false;
- }
-}
-
-static unsigned getMoveFromSToAOpcode(const RISCVSubtarget &STI) {
- if (STI.hasStdExtZcmp())
+static unsigned getMoveFromAToSOpcode(const RISCVSubtarget &ST) {
+ if (ST.hasStdExtZcmp())
return RISCV::CM_MVSA01;
- if (STI.hasVendorXqccmp())
+ if (ST.hasVendorXqccmp())
return RISCV::QC_CM_MVSA01;
llvm_unreachable("Unhandled subtarget with paired S to A move");
@@ -123,20 +104,24 @@ bool RISCVMoveMerge::isCandidateToMergeMVSA01(const DestSourcePair &RegPair) {
MachineBasicBlock::iterator
RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned Opcode) {
+ bool MoveFromSToA) {
const MachineOperand *Sreg1, *Sreg2;
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator NextI = next_nodbg(I, E);
DestSourcePair FirstPair = TII->isCopyInstrImpl(*I).value();
DestSourcePair PairedRegs = TII->isCopyInstrImpl(*Paired).value();
- Register ARegInFirstPair = isMoveFromAToS(Opcode)
- ? FirstPair.Destination->getReg()
- : FirstPair.Source->getReg();
+ Register ARegInFirstPair = MoveFromSToA ? FirstPair.Destination->getReg()
+ : FirstPair.Source->getReg();
if (NextI == Paired)
NextI = next_nodbg(NextI, E);
DebugLoc DL = I->getDebugLoc();
+ // Make a copy so we can update the kill flag in the MoveFromSToA case. The
+ // copied operand needs to be scoped outside the if since we make a pointer
+ // to it.
+ MachineOperand PairedSource = *PairedRegs.Source;
+
// The order of S-reg depends on which instruction holds A0, instead of
// the order of register pair.
// e,g.
@@ -146,10 +131,20 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I,
// mv a0, s2
// mv a1, s1 => cm.mva01s s2,s1
bool StartWithX10 = ARegInFirstPair == RISCV::X10;
- if (isMoveFromAToS(Opcode)) {
- Sreg1 = StartWithX10 ? FirstPair.Source : PairedRegs.Source;
- Sreg2 = StartWithX10 ? PairedRegs.Source : FirstPair.Source;
+ unsigned Opcode;
+ if (MoveFromSToA) {
+ // We are moving one of the copies earlier so its kill flag may become
+ // invalid. Clear the copied kill flag if there are any reads of the
+ // register between the new location and the old location.
+ for (auto It = std::next(I); It != Paired && PairedSource.isKill(); ++It)
+ if (It->readsRegister(PairedSource.getReg(), TRI))
+ PairedSource.setIsKill(false);
+
+ Opcode = getMoveFromSToAOpcode(*ST);
+ Sreg1 = StartWithX10 ? FirstPair.Source : &PairedSource;
+ Sreg2 = StartWithX10 ? &PairedSource : FirstPair.Source;
} else {
+ Opcode = getMoveFromAToSOpcode(*ST);
Sreg1 = StartWithX10 ? FirstPair.Destination : PairedRegs.Destination;
Sreg2 = StartWithX10 ? PairedRegs.Destination : FirstPair.Destination;
}
@@ -163,7 +158,7 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator
RISCVMoveMerge::findMatchingInst(MachineBasicBlock::iterator &MBBI,
- unsigned InstOpcode,
+ bool MoveFromSToA,
const DestSourcePair &RegPair) {
MachineBasicBlock::iterator E = MBBI->getParent()->end();
@@ -181,7 +176,7 @@ RISCVMoveMerge::findMatchingInst(MachineBasicBlock::iterator &MBBI,
Register SourceReg = SecondPair->Source->getReg();
Register DestReg = SecondPair->Destination->getReg();
- if (isMoveFromAToS(InstOpcode) && isCandidateToMergeMVA01S(*SecondPair)) {
+ if (MoveFromSToA && isCandidateToMergeMVA01S(*SecondPair)) {
// If register pair is valid and destination registers are different.
if ((RegPair.Destination->getReg() == DestReg))
return E;
@@ -195,8 +190,7 @@ RISCVMoveMerge::findMatchingInst(MachineBasicBlock::iterator &MBBI,
return E;
return I;
- } else if (isMoveFromSToA(InstOpcode) &&
- isCandidateToMergeMVSA01(*SecondPair)) {
+ } else if (!MoveFromSToA && isCandidateToMergeMVSA01(*SecondPair)) {
if ((RegPair.Source->getReg() == SourceReg) ||
(RegPair.Destination->getReg() == DestReg))
return E;
@@ -217,8 +211,7 @@ RISCVMoveMerge::findMatchingInst(MachineBasicBlock::iterator &MBBI,
// Finds instructions, which could be represented as C.MV instructions and
// merged into CM.MVA01S or CM.MVSA01.
-bool RISCVMoveMerge::mergeMoveSARegPair(const RISCVSubtarget &STI,
- MachineBasicBlock &MBB) {
+bool RISCVMoveMerge::mergeMoveSARegPair(MachineBasicBlock &MBB) {
bool Modified = false;
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
@@ -227,22 +220,17 @@ bool RISCVMoveMerge::mergeMoveSARegPair(const RISCVSubtarget &STI,
// can, return Dest/Src register pair.
auto RegPair = TII->isCopyInstrImpl(*MBBI);
if (RegPair.has_value()) {
- unsigned Opcode = 0;
-
- if (isCandidateToMergeMVA01S(*RegPair))
- Opcode = getMoveFromAToSOpcode(STI);
- else if (isCandidateToMergeMVSA01(*RegPair))
- Opcode = getMoveFromSToAOpcode(STI);
- else {
+ bool MoveFromSToA = isCandidateToMergeMVA01S(*RegPair);
+ if (!MoveFromSToA && !isCandidateToMergeMVSA01(*RegPair)) {
++MBBI;
continue;
}
MachineBasicBlock::iterator Paired =
- findMatchingInst(MBBI, Opcode, RegPair.value());
+ findMatchingInst(MBBI, MoveFromSToA, RegPair.value());
// If matching instruction can be found merge them.
if (Paired != E) {
- MBBI = mergePairedInsns(MBBI, Paired, Opcode);
+ MBBI = mergePairedInsns(MBBI, Paired, MoveFromSToA);
Modified = true;
continue;
}
@@ -256,12 +244,12 @@ bool RISCVMoveMerge::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(Fn.getFunction()))
return false;
- const RISCVSubtarget *Subtarget = &Fn.getSubtarget<RISCVSubtarget>();
- if (!(Subtarget->hasStdExtZcmp() || Subtarget->hasVendorXqccmp()))
+ ST = &Fn.getSubtarget<RISCVSubtarget>();
+ if (!ST->hasStdExtZcmp() && !ST->hasVendorXqccmp())
return false;
- TII = Subtarget->getInstrInfo();
- TRI = Subtarget->getRegisterInfo();
+ TII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
// Resize the modified and used register unit trackers. We do this once
// per function and then clear the register units each time we optimize a
// move.
@@ -269,7 +257,7 @@ bool RISCVMoveMerge::runOnMachineFunction(MachineFunction &Fn) {
UsedRegUnits.init(*TRI);
bool Modified = false;
for (auto &MBB : Fn)
- Modified |= mergeMoveSARegPair(*Subtarget, MBB);
+ Modified |= mergeMoveSARegPair(MBB);
return Modified;
}
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 31d2b3a..f89d94f 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -673,6 +673,7 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
FeatureStdExtZvfh,
FeatureStdExtZvkt,
FeatureStdExtZvl256b,
+ FeatureVendorXSMTVDot,
FeatureUnalignedScalarMem]),
[TuneDLenFactor2,
TuneOptimizedNF2SegmentLoadStore,
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 7e58b6f..8a3c8e2 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -589,7 +589,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
(Lo12 & 0b11111) != 0) {
// Prefetch instructions require the offset to be 32 byte aligned.
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
- } else if (Opc == RISCV::MIPS_PREFETCH && !isUInt<9>(Val)) {
+ } else if (Opc == RISCV::MIPS_PREF && !isUInt<9>(Val)) {
// MIPS Prefetch instructions require the offset to be 9 bits encoded.
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
} else if ((Opc == RISCV::PseudoRV32ZdinxLD ||
diff --git a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
index 5ef858a..8cf15fa 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
@@ -24,7 +24,7 @@ let SchedModel = Andes45Model in {
//===----------------------------------------------------------------------===//
// Andes 45 series CPU
-// - 2 Interger Arithmetic and Logical Units (ALU)
+// - 2 Integer Arithmetic and Logical Units (ALU)
// - Multiply / Divide Unit (MDU)
// - Load Store Unit (LSU)
// - Control and Status Register Unit (CSR)
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index 5541506..24ebbc3 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -524,16 +524,33 @@ foreach mx = SchedMxListW in {
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
+ defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ // Latency of vsmul: e8/e16 = 4/4/5/8, e32 = 5/5/5/8, e64 = 7/8/16/32
+ // We use the worst-case until we can split the SEW.
+ defvar VSMulLat = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c;
+  // ReleaseAtCycles of vsmul: e8/e16/e32 = 1/2/4/8, e64 = 4/8/16/32
+ // We use the worst-case until we can split the SEW.
+ defvar VSMulOcc = ConstValueUntilLMULThenDoubleBase<"M1", 1, 4, mx>.c;
+ // TODO: change WriteVSMulV/X to be defined with LMULSEWSchedWrites
+ let Latency = VSMulLat, ReleaseAtCycles = [VSMulOcc] in {
+ defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ defvar VSShiftLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+ defvar VSShiftOcc = ConstOneUntilMF2ThenDouble<mx>.c;
+ let Latency = VSShiftLat, ReleaseAtCycles = [VSShiftOcc] in {
+ defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
// 13. Vector Floating-Point Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 66ce134..c70571c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -38,7 +38,6 @@
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
#include <optional>
using namespace llvm;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 67f924a..c707fb1 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1431,7 +1431,7 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
case Intrinsic::ctlz:
case Intrinsic::ctpop: {
auto LT = getTypeLegalizationCost(RetTy);
- if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) {
+ if (ST->hasStdExtZvbb() && LT.second.isVector()) {
unsigned Op;
switch (ICA.getID()) {
case Intrinsic::cttz:
@@ -1629,6 +1629,7 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
// scalarized if the legalized Src and Dst are not equal sized.
const DataLayout &DL = this->getDataLayout();
if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
+ !SrcLT.first.isValid() || !DstLT.first.isValid() ||
!TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
SrcLT.second.getSizeInBits()) ||
!TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
@@ -2414,6 +2415,24 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
return BaseCost + SlideCost;
}
+InstructionCost
+RISCVTTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+ TTI::TargetCostKind CostKind,
+ unsigned Index) const {
+ if (isa<FixedVectorType>(Val))
+ return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
+ Index);
+
+ // TODO: This code replicates what LoopVectorize.cpp used to do when asking
+ // for the cost of extracting the last lane of a scalable vector. It probably
+ // needs a more accurate cost.
+ ElementCount EC = cast<VectorType>(Val)->getElementCount();
+ assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
+ return getVectorInstrCost(Opcode, Val, CostKind,
+ EC.getKnownMinValue() - 1 - Index, nullptr,
+ nullptr);
+}
+
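A worked example of the index translation in getIndexedVectorInstrCostFromEnd, as a trivial standalone sketch (illustrative only): for a scalable type whose known minimum element count is 4, a from-the-end Index of 0 (the last lane) is forwarded as a query for fixed lane 3.

#include <cassert>

// Mirrors EC.getKnownMinValue() - 1 - Index in the function above.
static unsigned reverseIndexToLane(unsigned KnownMinElts, unsigned Index) {
  assert(Index < KnownMinElts && "Unexpected reverse index");
  return KnownMinElts - 1 - Index;
}

int main() {
  assert(reverseIndexToLane(4, 0) == 3); // last lane of <vscale x 4 x i32>
  assert(reverseIndexToLane(4, 1) == 2);
  return 0;
}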
InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 05d504c..b632f25 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -114,6 +114,9 @@ public:
bool enableScalableVectorization() const override {
return ST->hasVInstructions();
}
+ bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override {
+ return ST->hasVInstructions();
+ }
TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override {
return ST->hasVInstructions() ? TailFoldingStyle::DataWithEVL
@@ -240,6 +243,11 @@ public:
unsigned Index, const Value *Op0,
const Value *Op1) const override;
+ InstructionCost
+ getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+ TTI::TargetCostKind CostKind,
+ unsigned Index) const override;
+
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 37a71e8..f973e75 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -491,8 +491,42 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
// vfirst find-first-set mask bit
case RISCV::VCPOP_M:
case RISCV::VFIRST_M:
+ // Vector Bit-manipulation Instructions (Zvbb)
+ // Vector And-Not
+ case RISCV::VANDN_VV:
+ case RISCV::VANDN_VX:
+ // Vector Reverse Bits in Elements
+ case RISCV::VBREV_V:
+ // Vector Reverse Bits in Bytes
+ case RISCV::VBREV8_V:
+ // Vector Reverse Bytes
+ case RISCV::VREV8_V:
+ // Vector Count Leading Zeros
+ case RISCV::VCLZ_V:
+ // Vector Count Trailing Zeros
+ case RISCV::VCTZ_V:
+ // Vector Population Count
+ case RISCV::VCPOP_V:
+ // Vector Rotate Left
+ case RISCV::VROL_VV:
+ case RISCV::VROL_VX:
+ // Vector Rotate Right
+ case RISCV::VROR_VI:
+ case RISCV::VROR_VV:
+ case RISCV::VROR_VX:
+ // Vector Carry-less Multiplication Instructions (Zvbc)
+ // Vector Carry-less Multiply
+ case RISCV::VCLMUL_VV:
+ case RISCV::VCLMUL_VX:
+ // Vector Carry-less Multiply Return High Half
+ case RISCV::VCLMULH_VV:
+ case RISCV::VCLMULH_VX:
return MILog2SEW;
+ // Vector Widening Shift Left Logical (Zvbb)
+ case RISCV::VWSLL_VI:
+ case RISCV::VWSLL_VX:
+ case RISCV::VWSLL_VV:
// Vector Widening Integer Add/Subtract
// Def uses EEW=2*SEW . Operands use EEW=SEW.
case RISCV::VWADDU_VV:
@@ -503,9 +537,6 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
case RISCV::VWADD_VX:
case RISCV::VWSUB_VV:
case RISCV::VWSUB_VX:
- case RISCV::VWSLL_VI:
- case RISCV::VWSLL_VX:
- case RISCV::VWSLL_VV:
// Vector Widening Integer Multiply Instructions
// Destination EEW=2*SEW. Source EEW=SEW.
case RISCV::VWMUL_VV:
@@ -1020,12 +1051,40 @@ static bool isSupportedInstr(const MachineInstr &MI) {
case RISCV::VNCLIP_WV:
case RISCV::VNCLIP_WX:
case RISCV::VNCLIP_WI:
-
- // Vector Crypto
+ // Vector Bit-manipulation Instructions (Zvbb)
+ // Vector And-Not
+ case RISCV::VANDN_VV:
+ case RISCV::VANDN_VX:
+ // Vector Reverse Bits in Elements
+ case RISCV::VBREV_V:
+ // Vector Reverse Bits in Bytes
+ case RISCV::VBREV8_V:
+ // Vector Reverse Bytes
+ case RISCV::VREV8_V:
+ // Vector Count Leading Zeros
+ case RISCV::VCLZ_V:
+ // Vector Count Trailing Zeros
+ case RISCV::VCTZ_V:
+ // Vector Population Count
+ case RISCV::VCPOP_V:
+ // Vector Rotate Left
+ case RISCV::VROL_VV:
+ case RISCV::VROL_VX:
+ // Vector Rotate Right
+ case RISCV::VROR_VI:
+ case RISCV::VROR_VV:
+ case RISCV::VROR_VX:
+ // Vector Widening Shift Left Logical
case RISCV::VWSLL_VI:
case RISCV::VWSLL_VX:
case RISCV::VWSLL_VV:
-
+ // Vector Carry-less Multiplication Instructions (Zvbc)
+ // Vector Carry-less Multiply
+ case RISCV::VCLMUL_VV:
+ case RISCV::VCLMUL_VX:
+ // Vector Carry-less Multiply Return High Half
+ case RISCV::VCLMULH_VV:
+ case RISCV::VCLMULH_VX:
// Vector Mask Instructions
// Vector Mask-Register Logical Instructions
// vmsbf.m set-before-first mask bit
@@ -1213,34 +1272,6 @@ static bool isVectorOpUsedAsScalarOp(const MachineOperand &MO) {
}
}
-/// Return true if MI may read elements past VL.
-static bool mayReadPastVL(const MachineInstr &MI) {
- const RISCVVPseudosTable::PseudoInfo *RVV =
- RISCVVPseudosTable::getPseudoInfo(MI.getOpcode());
- if (!RVV)
- return true;
-
- switch (RVV->BaseInstr) {
- // vslidedown instructions may read elements past VL. They are handled
- // according to current tail policy.
- case RISCV::VSLIDEDOWN_VI:
- case RISCV::VSLIDEDOWN_VX:
- case RISCV::VSLIDE1DOWN_VX:
- case RISCV::VFSLIDE1DOWN_VF:
-
- // vrgather instructions may read the source vector at any index < VLMAX,
- // regardless of VL.
- case RISCV::VRGATHER_VI:
- case RISCV::VRGATHER_VV:
- case RISCV::VRGATHER_VX:
- case RISCV::VRGATHEREI16_VV:
- return true;
-
- default:
- return false;
- }
-}
-
bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
const MCInstrDesc &Desc = MI.getDesc();
if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags))
@@ -1301,7 +1332,8 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
return std::nullopt;
}
- if (mayReadPastVL(UserMI)) {
+ if (RISCVII::readsPastVL(
+ TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) {
LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n");
return std::nullopt;
}
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
index 0ed97f5..d6b6079 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
@@ -38,8 +38,15 @@ struct CapabilityEntry {
Capability::Capability ReqCapability;
};
+struct EnvironmentEntry {
+ OperandCategory::OperandCategory Category;
+ uint32_t Value;
+ Environment::Environment AllowedEnvironment;
+};
+
using namespace OperandCategory;
using namespace Extension;
+using namespace Environment;
using namespace Capability;
using namespace InstructionSet;
#define GET_SymbolicOperands_DECL
@@ -48,6 +55,8 @@ using namespace InstructionSet;
#define GET_ExtensionEntries_IMPL
#define GET_CapabilityEntries_DECL
#define GET_CapabilityEntries_IMPL
+#define GET_EnvironmentEntries_DECL
+#define GET_EnvironmentEntries_IMPL
#define GET_ExtendedBuiltins_DECL
#define GET_ExtendedBuiltins_IMPL
#include "SPIRVGenTables.inc"
@@ -133,6 +142,23 @@ getSymbolicOperandCapabilities(SPIRV::OperandCategory::OperandCategory Category,
return Capabilities;
}
+EnvironmentList getSymbolicOperandAllowedEnvironments(
+ SPIRV::OperandCategory::OperandCategory Category, uint32_t Value) {
+ EnvironmentList Environments;
+ const SPIRV::EnvironmentEntry *Environment =
+ SPIRV::lookupEnvironmentByCategoryAndValue(Category, Value);
+ auto TableEnd = ArrayRef(SPIRV::EnvironmentEntries).end();
+ while (Environment && Environment->Category == Category &&
+ Environment->Value == Value) {
+ Environments.push_back(static_cast<SPIRV::Environment::Environment>(
+ Environment->AllowedEnvironment));
+ if (++Environment == TableEnd)
+ break;
+ }
+
+ return Environments;
+}
+
CapabilityList
getCapabilitiesEnabledByExtension(SPIRV::Extension::Extension Extension) {
const SPIRV::ExtensionEntry *Entry =
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
index b8c467f..c2c08f8 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
@@ -37,6 +37,11 @@ namespace Capability {
#include "SPIRVGenTables.inc"
} // namespace Capability
+namespace Environment {
+#define GET_Environment_DECL
+#include "SPIRVGenTables.inc"
+} // namespace Environment
+
namespace SourceLanguage {
#define GET_SourceLanguage_DECL
#include "SPIRVGenTables.inc"
@@ -241,6 +246,7 @@ enum InstFlags {
using CapabilityList = SmallVector<SPIRV::Capability::Capability, 8>;
using ExtensionList = SmallVector<SPIRV::Extension::Extension, 8>;
+using EnvironmentList = SmallVector<SPIRV::Environment::Environment, 8>;
std::string
getSymbolicOperandMnemonic(SPIRV::OperandCategory::OperandCategory Category,
@@ -254,6 +260,8 @@ getSymbolicOperandMaxVersion(SPIRV::OperandCategory::OperandCategory Category,
CapabilityList
getSymbolicOperandCapabilities(SPIRV::OperandCategory::OperandCategory Category,
uint32_t Value);
+EnvironmentList getSymbolicOperandAllowedEnvironments(
+ SPIRV::OperandCategory::OperandCategory Category, uint32_t Value);
CapabilityList
getCapabilitiesEnabledByExtension(SPIRV::Extension::Extension Extension);
ExtensionList
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index 4ec31bf..1e3f7fc 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -375,9 +375,17 @@ void SPIRVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg())
O << '%' << (getIDFromRegister(Op.getReg().id()) + 1);
- else if (Op.isImm())
- O << formatImm(Op.getImm());
- else if (Op.isDFPImm())
+ else if (Op.isImm()) {
+ int64_t Imm = Op.getImm();
+ // For OpVectorShuffle:
+ // A Component literal may also be FFFFFFFF, which means the corresponding
+ // result component has no source and is undefined.
+ // LLVM representation of poison/undef becomes -1 when lowered to MI.
+ if (MI->getOpcode() == SPIRV::OpVectorShuffle && Imm == -1)
+ O << "0xFFFFFFFF";
+ else
+ O << formatImm(Imm);
+ } else if (Op.isDFPImm())
O << formatImm((double)Op.getDFPImm());
else if (Op.isExpr())
MAI.printExpr(O, *Op.getExpr());
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index d9265f4..5a5860a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -12,7 +12,8 @@
//===----------------------------------------------------------------------===//
#include "SPIRVCommandLine.h"
-#include "llvm/ADT/StringRef.h"
+#include "MCTargetDesc/SPIRVBaseInfo.h"
+#include "llvm/TargetParser/Triple.h"
#include <algorithm>
#include <map>
@@ -171,3 +172,23 @@ StringRef SPIRVExtensionsParser::checkExtensions(
}
return StringRef();
}
+
+std::set<SPIRV::Extension::Extension>
+SPIRVExtensionsParser::getValidExtensions(const Triple &TT) {
+ std::set<SPIRV::Extension::Extension> R;
+ SPIRV::Environment::Environment CurrentEnvironment =
+ SPIRV::Environment::Environment::EnvOpenCL;
+ if (TT.getOS() == Triple::Vulkan)
+ CurrentEnvironment = SPIRV::Environment::Environment::EnvVulkan;
+
+ for (const auto &[ExtensionName, ExtensionEnum] : SPIRVExtensionMap) {
+ EnvironmentList AllowedEnv = getSymbolicOperandAllowedEnvironments(
+ SPIRV::OperandCategory::OperandCategory::ExtensionOperand,
+ ExtensionEnum);
+
+ if (std::count(AllowedEnv.begin(), AllowedEnv.end(), CurrentEnvironment))
+ R.insert(ExtensionEnum);
+ }
+
+ return R;
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.h b/llvm/lib/Target/SPIRV/SPIRVCommandLine.h
index 3e3b22b..02e847b3 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.h
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.h
@@ -21,6 +21,7 @@
namespace llvm {
class StringRef;
+class Triple;
/// Command line parser for toggling SPIR-V extensions.
struct SPIRVExtensionsParser
@@ -42,6 +43,11 @@ public:
static StringRef
checkExtensions(const std::vector<std::string> &ExtNames,
std::set<SPIRV::Extension::Extension> &AllowedExtensions);
+
+ /// Returns the list of extensions that are valid for a particular
+ /// target environment (i.e., OpenCL or Vulkan).
+ static std::set<SPIRV::Extension::Extension>
+ getValidExtensions(const Triple &TT);
};
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 5259db1..98c7709 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -220,8 +220,10 @@ private:
bool selectConst(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
- bool selectSelect(Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
- bool IsSigned) const;
+ bool selectSelect(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
+ bool selectSelectDefaultArgs(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I, bool IsSigned) const;
bool selectIToF(Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
bool IsSigned, unsigned Opcode) const;
bool selectExt(Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
@@ -510,7 +512,18 @@ bool SPIRVInstructionSelector::select(MachineInstr &I) {
if (isTypeFoldingSupported(Def->getOpcode()) &&
Def->getOpcode() != TargetOpcode::G_CONSTANT &&
Def->getOpcode() != TargetOpcode::G_FCONSTANT) {
- bool Res = selectImpl(I, *CoverageInfo);
+ bool Res = false;
+ if (Def->getOpcode() == TargetOpcode::G_SELECT) {
+ Register SelectDstReg = Def->getOperand(0).getReg();
+ Res = selectSelect(SelectDstReg, GR.getSPIRVTypeForVReg(SelectDstReg),
+ *Def);
+ GR.invalidateMachineInstr(Def);
+ Def->removeFromParent();
+ MRI->replaceRegWith(DstReg, SelectDstReg);
+ GR.invalidateMachineInstr(&I);
+ I.removeFromParent();
+ } else
+ Res = selectImpl(I, *CoverageInfo);
LLVM_DEBUG({
if (!Res && Def->getOpcode() != TargetOpcode::G_CONSTANT) {
dbgs() << "Unexpected pattern in ASSIGN_TYPE.\nInstruction: ";
@@ -2565,8 +2578,52 @@ Register SPIRVInstructionSelector::buildOnesVal(bool AllOnes,
bool SPIRVInstructionSelector::selectSelect(Register ResVReg,
const SPIRVType *ResType,
- MachineInstr &I,
- bool IsSigned) const {
+ MachineInstr &I) const {
+ Register SelectFirstArg = I.getOperand(2).getReg();
+ Register SelectSecondArg = I.getOperand(3).getReg();
+ assert(ResType == GR.getSPIRVTypeForVReg(SelectFirstArg) &&
+ ResType == GR.getSPIRVTypeForVReg(SelectSecondArg));
+
+ bool IsFloatTy =
+ GR.isScalarOrVectorOfType(SelectFirstArg, SPIRV::OpTypeFloat);
+ bool IsPtrTy =
+ GR.isScalarOrVectorOfType(SelectFirstArg, SPIRV::OpTypePointer);
+ bool IsVectorTy = GR.getSPIRVTypeForVReg(SelectFirstArg)->getOpcode() ==
+ SPIRV::OpTypeVector;
+
+ bool IsScalarBool =
+ GR.isScalarOfType(I.getOperand(1).getReg(), SPIRV::OpTypeBool);
+ unsigned Opcode;
+ if (IsVectorTy) {
+ if (IsFloatTy) {
+ Opcode = IsScalarBool ? SPIRV::OpSelectVFSCond : SPIRV::OpSelectVFVCond;
+ } else if (IsPtrTy) {
+ Opcode = IsScalarBool ? SPIRV::OpSelectVPSCond : SPIRV::OpSelectVPVCond;
+ } else {
+ Opcode = IsScalarBool ? SPIRV::OpSelectVISCond : SPIRV::OpSelectVIVCond;
+ }
+ } else {
+ if (IsFloatTy) {
+ Opcode = IsScalarBool ? SPIRV::OpSelectSFSCond : SPIRV::OpSelectVFVCond;
+ } else if (IsPtrTy) {
+ Opcode = IsScalarBool ? SPIRV::OpSelectSPSCond : SPIRV::OpSelectVPVCond;
+ } else {
+ Opcode = IsScalarBool ? SPIRV::OpSelectSISCond : SPIRV::OpSelectVIVCond;
+ }
+ }
+ return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(1).getReg())
+ .addUse(SelectFirstArg)
+ .addUse(SelectSecondArg)
+ .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectSelectDefaultArgs(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I,
+ bool IsSigned) const {
// To extend a bool, we need to use OpSelect between constants.
Register ZeroReg = buildZerosVal(ResType, I);
Register OneReg = buildOnesVal(IsSigned, ResType, I);
@@ -2598,7 +2655,7 @@ bool SPIRVInstructionSelector::selectIToF(Register ResVReg,
TmpType = GR.getOrCreateSPIRVVectorType(TmpType, NumElts, I, TII);
}
SrcReg = createVirtualRegister(TmpType, &GR, MRI, MRI->getMF());
- selectSelect(SrcReg, TmpType, I, false);
+ selectSelectDefaultArgs(SrcReg, TmpType, I, false);
}
return selectOpWithSrcs(ResVReg, ResType, I, {SrcReg}, Opcode);
}
@@ -2608,7 +2665,7 @@ bool SPIRVInstructionSelector::selectExt(Register ResVReg,
MachineInstr &I, bool IsSigned) const {
Register SrcReg = I.getOperand(1).getReg();
if (GR.isScalarOrVectorOfType(SrcReg, SPIRV::OpTypeBool))
- return selectSelect(ResVReg, ResType, I, IsSigned);
+ return selectSelectDefaultArgs(ResVReg, ResType, I, IsSigned);
SPIRVType *SrcType = GR.getSPIRVTypeForVReg(SrcReg);
if (SrcType == ResType)
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
index 0398e52..aea3397 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
@@ -15,7 +15,6 @@
#include "SPIRV.h"
#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index b62db7f..1a08c6a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -441,13 +441,10 @@ void insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpvType,
// Tablegen definition assumes SPIRV::ASSIGN_TYPE pseudo-instruction is
// present after each auto-folded instruction to take a type reference from.
Register NewReg = MRI.createGenericVirtualRegister(MRI.getType(Reg));
- if (auto *RC = MRI.getRegClassOrNull(Reg)) {
- MRI.setRegClass(NewReg, RC);
- } else {
- auto RegClass = GR->getRegClass(SpvType);
- MRI.setRegClass(NewReg, RegClass);
- MRI.setRegClass(Reg, RegClass);
- }
+ const auto *RegClass = GR->getRegClass(SpvType);
+ MRI.setRegClass(NewReg, RegClass);
+ MRI.setRegClass(Reg, RegClass);
+
GR->assignSPIRVTypeToVReg(SpvType, Reg, MIB.getMF());
// This is to make it convenient for Legalizer to get the SPIRVType
// when processing the actual MI (i.e. not pseudo one).
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index 74aec4f..2b34f61 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -359,18 +359,15 @@ static void lowerExpectAssume(IntrinsicInst *II) {
}
}
-static bool toSpvOverloadedIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID,
- ArrayRef<unsigned> OpNos) {
- Function *F = nullptr;
- if (OpNos.empty()) {
- F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID);
- } else {
- SmallVector<Type *, 4> Tys;
- for (unsigned OpNo : OpNos)
- Tys.push_back(II->getOperand(OpNo)->getType());
- F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID, Tys);
- }
- II->setCalledFunction(F);
+static bool toSpvLifetimeIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID) {
+ IRBuilder<> Builder(II);
+ auto *Alloca = cast<AllocaInst>(II->getArgOperand(0));
+ std::optional<TypeSize> Size =
+ Alloca->getAllocationSize(Alloca->getDataLayout());
+ Value *SizeVal = Builder.getInt64(Size ? *Size : -1);
+ Builder.CreateIntrinsic(NewID, Alloca->getType(),
+ {SizeVal, II->getArgOperand(0)});
+ II->eraseFromParent();
return true;
}
@@ -406,8 +403,8 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
break;
case Intrinsic::lifetime_start:
if (!STI.isShader()) {
- Changed |= toSpvOverloadedIntrinsic(
- II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1});
+ Changed |= toSpvLifetimeIntrinsic(
+ II, Intrinsic::SPVIntrinsics::spv_lifetime_start);
} else {
II->eraseFromParent();
Changed = true;
@@ -415,8 +412,8 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
break;
case Intrinsic::lifetime_end:
if (!STI.isShader()) {
- Changed |= toSpvOverloadedIntrinsic(
- II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1});
+ Changed |= toSpvLifetimeIntrinsic(
+ II, Intrinsic::SPVIntrinsics::spv_lifetime_end);
} else {
II->eraseFromParent();
Changed = true;
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index cdf3c62..690493fb 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -166,7 +166,13 @@ void SPIRVSubtarget::initAvailableExtInstSets() {
void SPIRVSubtarget::initAvailableExtensions(
const std::set<SPIRV::Extension::Extension> &AllowedExtIds) {
AvailableExtensions.clear();
- AvailableExtensions.insert_range(AllowedExtIds);
+ const std::set<SPIRV::Extension::Extension> &ValidExtensions =
+ SPIRVExtensionsParser::getValidExtensions(TargetTriple);
+
+ for (const auto &Ext : AllowedExtIds) {
+ if (ValidExtensions.count(Ext))
+ AvailableExtensions.insert(Ext);
+ }
accountForAMDShaderTrinaryMinmax();
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 614e83a..d2824ee 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -110,22 +110,58 @@ def CapabilityEntries : GenericTable {
}
//===----------------------------------------------------------------------===//
+// Lookup table for matching symbolic operands (category + 32-bit value) to
+// SPIR-V environments. If an operand is allowed in more than one environment,
+// there will be multiple consecutive entries present in the table.
+//===----------------------------------------------------------------------===//
+
+// Forward-declare classes used in EnvironmentEntry
+class Environment;
+
+class EnvironmentEntry<OperandCategory category, bits<32> value,
+ Environment allowedEnvironment> {
+ OperandCategory Category = category;
+ bits<32> Value = value;
+ Environment AllowedEnvironment = allowedEnvironment;
+}
+
+def EnvironmentEntries : GenericTable {
+ let FilterClass = "EnvironmentEntry";
+ let Fields = ["Category", "Value", "AllowedEnvironment"];
+ string TypeOf_Category = "OperandCategory";
+ string TypeOf_AllowedEnvironment = "Environment";
+ let PrimaryKey = ["Category", "Value"];
+  // Function for looking up the first environment entry by category + value.
+  // Further environments for the same operand follow in consecutive entries.
+ let PrimaryKeyName = "lookupEnvironmentByCategoryAndValue";
+}
+
+//===----------------------------------------------------------------------===//
// Multiclass used to define a SymbolicOperand and at the same time declare
// required extension and capabilities.
//===----------------------------------------------------------------------===//
-multiclass SymbolicOperandWithRequirements<OperandCategory category, bits<32> value, string mnemonic, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
- assert !ge(!size(mnemonic), 1), "No mnemonic/string representation provided for symbolic operand with value " # value;
- def : SymbolicOperand<category, value, mnemonic, minVersion, maxVersion>;
+multiclass SymbolicOperandWithRequirements<
+ OperandCategory category, bits<32> value, string mnemonic,
+ bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions,
+ list<Capability> reqCapabilities, list<Environment> allowedEnvironments> {
+ assert !ge(!size(mnemonic), 1), "No mnemonic/string representation provided "
+ "for symbolic operand with value "#value;
+ def : SymbolicOperand<category, value, mnemonic, minVersion, maxVersion>;
+
+ assert !le(!size(reqExtensions), 1),
+ "Too many required extensions for a symbolic/named operand: "#mnemonic;
+ if !eq(!size(reqExtensions), 1) then {
+ def : ExtensionEntry<category, value, reqExtensions[0]>;
+ }
- assert !le(!size(reqExtensions), 1), "Too many required extensions for a symbolic/named operand: " # mnemonic;
- if !eq(!size(reqExtensions), 1) then {
- def : ExtensionEntry<category, value, reqExtensions[0]>;
- }
+ foreach capability = reqCapabilities in {
+ def : CapabilityEntry<category, value, capability>;
+ }
- foreach capability = reqCapabilities in {
- def : CapabilityEntry<category, value, capability>;
- }
+ foreach environment = allowedEnvironments in {
+ def : EnvironmentEntry<category, value, environment>;
+ }
}
//===----------------------------------------------------------------------===//
@@ -176,6 +212,20 @@ def SpecConstantOpOperandsOperand : OperandCategory;
def MatrixMultiplyAccumulateOperandsOperand : OperandCategory;
//===----------------------------------------------------------------------===//
+// Definition of the Environments
+//===----------------------------------------------------------------------===//
+
+def Environment : GenericEnum, Operand<i32> {
+ let FilterClass = "Environment";
+ let ValueField = "Value";
+}
+
+class Environment<bits<32> value> { bits<32> Value = value; }
+
+def EnvOpenCL : Environment<0>;
+def EnvVulkan : Environment<1>;
+
+//===----------------------------------------------------------------------===//
// Multiclass used to define Extesions enum values and at the same time
// SymbolicOperand entries.
//===----------------------------------------------------------------------===//
@@ -192,135 +242,146 @@ class Extension<string name, bits<32> value> {
bits<32> Value = value;
}
-multiclass ExtensionOperand<bits<32> value> {
+multiclass ExtensionOperand<bits<32> value,
+ list<Environment> allowedEnvironments> {
def NAME : Extension<NAME, value>;
- defm : SymbolicOperandWithRequirements<ExtensionOperand, value, NAME, 0, 0, [], []>;
-}
-
-defm SPV_AMD_shader_explicit_vertex_parameter : ExtensionOperand<1>;
-defm SPV_AMD_shader_trinary_minmax_extension : ExtensionOperand<2>;
-defm SPV_AMD_gcn_shader : ExtensionOperand<3>;
-defm SPV_KHR_shader_ballot : ExtensionOperand<4>;
-defm SPV_AMD_shader_ballot : ExtensionOperand<5>;
-defm SPV_AMD_gpu_shader_half_float : ExtensionOperand<6>;
-defm SPV_KHR_shader_draw_parameters : ExtensionOperand<7>;
-defm SPV_KHR_subgroup_vote : ExtensionOperand<8>;
-defm SPV_KHR_16bit_storage : ExtensionOperand<9>;
-defm SPV_KHR_device_group : ExtensionOperand<10>;
-defm SPV_KHR_multiview : ExtensionOperand<11>;
-defm SPV_NVX_multiview_per_view_attributes : ExtensionOperand<12>;
-defm SPV_NV_viewport_array2 : ExtensionOperand<13>;
-defm SPV_NV_stereo_view_rendering : ExtensionOperand<14>;
-defm SPV_NV_sample_mask_override_coverage : ExtensionOperand<15>;
-defm SPV_NV_geometry_shader_passthrough : ExtensionOperand<16>;
-defm SPV_AMD_texture_gather_bias_lod : ExtensionOperand<17>;
-defm SPV_KHR_storage_buffer_storage_class : ExtensionOperand<18>;
-defm SPV_KHR_variable_pointers : ExtensionOperand<19>;
-defm SPV_AMD_gpu_shader_int16 : ExtensionOperand<20>;
-defm SPV_KHR_post_depth_coverage : ExtensionOperand<21>;
-defm SPV_KHR_shader_atomic_counter_ops : ExtensionOperand<22>;
-defm SPV_EXT_shader_stencil_export : ExtensionOperand<23>;
-defm SPV_EXT_shader_viewport_index_layer : ExtensionOperand<24>;
-defm SPV_AMD_shader_image_load_store_lod : ExtensionOperand<25>;
-defm SPV_AMD_shader_fragment_mask : ExtensionOperand<26>;
-defm SPV_EXT_fragment_fully_covered : ExtensionOperand<27>;
-defm SPV_AMD_gpu_shader_half_float_fetch : ExtensionOperand<28>;
-defm SPV_GOOGLE_decorate_string : ExtensionOperand<29>;
-defm SPV_GOOGLE_hlsl_functionality1 : ExtensionOperand<30>;
-defm SPV_NV_shader_subgroup_partitioned : ExtensionOperand<31>;
-defm SPV_EXT_descriptor_indexing : ExtensionOperand<32>;
-defm SPV_KHR_8bit_storage : ExtensionOperand<33>;
-defm SPV_KHR_vulkan_memory_model : ExtensionOperand<34>;
-defm SPV_NV_ray_tracing : ExtensionOperand<35>;
-defm SPV_NV_compute_shader_derivatives : ExtensionOperand<36>;
-defm SPV_NV_fragment_shader_barycentric : ExtensionOperand<37>;
-defm SPV_NV_mesh_shader : ExtensionOperand<38>;
-defm SPV_NV_shader_image_footprint : ExtensionOperand<39>;
-defm SPV_NV_shading_rate : ExtensionOperand<40>;
-defm SPV_INTEL_subgroups : ExtensionOperand<41>;
-defm SPV_INTEL_media_block_io : ExtensionOperand<42>;
-defm SPV_EXT_fragment_invocation_density : ExtensionOperand<44>;
-defm SPV_KHR_no_integer_wrap_decoration : ExtensionOperand<45>;
-defm SPV_KHR_float_controls : ExtensionOperand<46>;
-defm SPV_EXT_physical_storage_buffer : ExtensionOperand<47>;
-defm SPV_INTEL_fpga_memory_attributes : ExtensionOperand<48>;
-defm SPV_NV_cooperative_matrix : ExtensionOperand<49>;
-defm SPV_INTEL_shader_integer_functions2 : ExtensionOperand<50>;
-defm SPV_INTEL_fpga_loop_controls : ExtensionOperand<51>;
-defm SPV_EXT_fragment_shader_interlock : ExtensionOperand<52>;
-defm SPV_NV_shader_sm_builtins : ExtensionOperand<53>;
-defm SPV_KHR_shader_clock : ExtensionOperand<54>;
-defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55>;
-defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56>;
-defm SPV_INTEL_fpga_reg : ExtensionOperand<57>;
-defm SPV_INTEL_blocking_pipes : ExtensionOperand<58>;
-defm SPV_GOOGLE_user_type : ExtensionOperand<59>;
-defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60>;
-defm SPV_INTEL_kernel_attributes : ExtensionOperand<61>;
-defm SPV_KHR_non_semantic_info : ExtensionOperand<62>;
-defm SPV_INTEL_io_pipes : ExtensionOperand<63>;
-defm SPV_KHR_ray_tracing : ExtensionOperand<64>;
-defm SPV_KHR_ray_query : ExtensionOperand<65>;
-defm SPV_INTEL_fpga_memory_accesses : ExtensionOperand<66>;
-defm SPV_INTEL_arbitrary_precision_integers : ExtensionOperand<67>;
-defm SPV_EXT_shader_atomic_float_add : ExtensionOperand<68>;
-defm SPV_KHR_terminate_invocation : ExtensionOperand<69>;
-defm SPV_KHR_fragment_shading_rate : ExtensionOperand<70>;
-defm SPV_EXT_shader_image_int64 : ExtensionOperand<71>;
-defm SPV_INTEL_fp_fast_math_mode : ExtensionOperand<72>;
-defm SPV_INTEL_fpga_cluster_attributes : ExtensionOperand<73>;
-defm SPV_INTEL_loop_fuse : ExtensionOperand<74>;
-defm SPV_EXT_shader_atomic_float_min_max : ExtensionOperand<75>;
-defm SPV_KHR_workgroup_memory_explicit_layout : ExtensionOperand<76>;
-defm SPV_KHR_linkonce_odr : ExtensionOperand<77>;
-defm SPV_KHR_expect_assume : ExtensionOperand<78>;
-defm SPV_INTEL_fpga_dsp_control : ExtensionOperand<79>;
-defm SPV_NV_bindless_texture : ExtensionOperand<80>;
-defm SPV_INTEL_fpga_invocation_pipelining_attributes : ExtensionOperand<81>;
-defm SPV_KHR_subgroup_uniform_control_flow : ExtensionOperand<82>;
-defm SPV_HUAWEI_subpass_shading : ExtensionOperand<83>;
-defm SPV_KHR_integer_dot_product : ExtensionOperand<84>;
-defm SPV_EXT_shader_atomic_float16_add : ExtensionOperand<85>;
-defm SPV_INTEL_runtime_aligned : ExtensionOperand<86>;
-defm SPV_KHR_bit_instructions : ExtensionOperand<87>;
-defm SPV_NV_ray_tracing_motion_blur : ExtensionOperand<88>;
-defm SPV_KHR_uniform_group_instructions : ExtensionOperand<89>;
-defm SPV_KHR_subgroup_rotate : ExtensionOperand<90>;
-defm SPV_INTEL_split_barrier : ExtensionOperand<91>;
-defm SPV_KHR_ray_cull_mask : ExtensionOperand<92>;
-defm SPV_KHR_fragment_shader_barycentric : ExtensionOperand<93>;
-defm SPV_EXT_relaxed_printf_string_address_space : ExtensionOperand<94>;
-defm SPV_EXT_ycbcr_attachments : ExtensionOperand<95>;
-defm SPV_EXT_mesh_shader : ExtensionOperand<96>;
-defm SPV_ARM_core_builtins : ExtensionOperand<97>;
-defm SPV_EXT_opacity_micromap : ExtensionOperand<98>;
-defm SPV_NV_shader_invocation_reorder : ExtensionOperand<99>;
-defm SPV_INTEL_usm_storage_classes : ExtensionOperand<100>;
-defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101>;
-defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102>;
-defm SPV_INTEL_optnone : ExtensionOperand<103>;
-defm SPV_INTEL_function_pointers : ExtensionOperand<104>;
-defm SPV_INTEL_variable_length_array : ExtensionOperand<105>;
-defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106>;
-defm SPV_INTEL_inline_assembly : ExtensionOperand<107>;
-defm SPV_INTEL_cache_controls : ExtensionOperand<108>;
-defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109>;
-defm SPV_INTEL_global_variable_fpga_decorations : ExtensionOperand<110>;
-defm SPV_KHR_cooperative_matrix : ExtensionOperand<111>;
-defm SPV_EXT_arithmetic_fence : ExtensionOperand<112>;
-defm SPV_EXT_optnone : ExtensionOperand<113>;
-defm SPV_INTEL_joint_matrix : ExtensionOperand<114>;
-defm SPV_INTEL_float_controls2 : ExtensionOperand<115>;
-defm SPV_INTEL_bindless_images : ExtensionOperand<116>;
-defm SPV_INTEL_long_composites : ExtensionOperand<117>;
-defm SPV_INTEL_memory_access_aliasing : ExtensionOperand<118>;
-defm SPV_INTEL_fp_max_error : ExtensionOperand<119>;
-defm SPV_INTEL_ternary_bitwise_function : ExtensionOperand<120>;
-defm SPV_INTEL_subgroup_matrix_multiply_accumulate : ExtensionOperand<121>;
-defm SPV_INTEL_2d_block_io : ExtensionOperand<122>;
-defm SPV_INTEL_int4 : ExtensionOperand<123>;
-defm SPV_KHR_float_controls2 : ExtensionOperand<124>;
-defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125>;
+ defm : SymbolicOperandWithRequirements<ExtensionOperand, value, NAME, 0,
+ 0, [], [], allowedEnvironments>;
+}
+
+defm SPV_AMD_shader_explicit_vertex_parameter
+ : ExtensionOperand<1, [EnvVulkan]>;
+defm SPV_AMD_shader_trinary_minmax_extension : ExtensionOperand<2, [EnvVulkan]>;
+defm SPV_AMD_gcn_shader : ExtensionOperand<3, [EnvVulkan]>;
+defm SPV_KHR_shader_ballot : ExtensionOperand<4, [EnvVulkan]>;
+defm SPV_AMD_shader_ballot : ExtensionOperand<5, [EnvVulkan]>;
+defm SPV_AMD_gpu_shader_half_float : ExtensionOperand<6, [EnvVulkan]>;
+defm SPV_KHR_shader_draw_parameters : ExtensionOperand<7, [EnvVulkan]>;
+defm SPV_KHR_subgroup_vote : ExtensionOperand<8, [EnvVulkan]>;
+defm SPV_KHR_16bit_storage : ExtensionOperand<9, [EnvVulkan]>;
+defm SPV_KHR_device_group : ExtensionOperand<10, [EnvVulkan]>;
+defm SPV_KHR_multiview : ExtensionOperand<11, [EnvVulkan]>;
+defm SPV_NVX_multiview_per_view_attributes : ExtensionOperand<12, [EnvVulkan]>;
+defm SPV_NV_viewport_array2 : ExtensionOperand<13, [EnvVulkan]>;
+defm SPV_NV_stereo_view_rendering : ExtensionOperand<14, [EnvVulkan]>;
+defm SPV_NV_sample_mask_override_coverage : ExtensionOperand<15, [EnvVulkan]>;
+defm SPV_NV_geometry_shader_passthrough : ExtensionOperand<16, [EnvVulkan]>;
+defm SPV_AMD_texture_gather_bias_lod : ExtensionOperand<17, [EnvVulkan]>;
+defm SPV_KHR_storage_buffer_storage_class : ExtensionOperand<18, [EnvVulkan]>;
+defm SPV_KHR_variable_pointers : ExtensionOperand<19, [EnvVulkan]>;
+defm SPV_AMD_gpu_shader_int16 : ExtensionOperand<20, [EnvVulkan]>;
+defm SPV_KHR_post_depth_coverage : ExtensionOperand<21, [EnvVulkan]>;
+defm SPV_KHR_shader_atomic_counter_ops : ExtensionOperand<22, []>;
+defm SPV_EXT_shader_stencil_export : ExtensionOperand<23, [EnvVulkan]>;
+defm SPV_EXT_shader_viewport_index_layer : ExtensionOperand<24, [EnvVulkan]>;
+defm SPV_AMD_shader_image_load_store_lod : ExtensionOperand<25, [EnvVulkan]>;
+defm SPV_AMD_shader_fragment_mask : ExtensionOperand<26, [EnvVulkan]>;
+defm SPV_EXT_fragment_fully_covered : ExtensionOperand<27, [EnvVulkan]>;
+defm SPV_AMD_gpu_shader_half_float_fetch : ExtensionOperand<28, [EnvVulkan]>;
+defm SPV_GOOGLE_decorate_string : ExtensionOperand<29, [EnvVulkan]>;
+defm SPV_GOOGLE_hlsl_functionality1 : ExtensionOperand<30, [EnvVulkan]>;
+defm SPV_NV_shader_subgroup_partitioned : ExtensionOperand<31, [EnvVulkan]>;
+defm SPV_EXT_descriptor_indexing : ExtensionOperand<32, [EnvVulkan]>;
+defm SPV_KHR_8bit_storage : ExtensionOperand<33, [EnvVulkan]>;
+defm SPV_KHR_vulkan_memory_model : ExtensionOperand<34, [EnvVulkan]>;
+defm SPV_NV_ray_tracing : ExtensionOperand<35, [EnvVulkan]>;
+defm SPV_NV_compute_shader_derivatives : ExtensionOperand<36, [EnvVulkan]>;
+defm SPV_NV_fragment_shader_barycentric : ExtensionOperand<37, [EnvVulkan]>;
+defm SPV_NV_mesh_shader : ExtensionOperand<38, [EnvVulkan]>;
+defm SPV_NV_shader_image_footprint : ExtensionOperand<39, [EnvVulkan]>;
+defm SPV_NV_shading_rate : ExtensionOperand<40, [EnvVulkan]>;
+defm SPV_INTEL_subgroups : ExtensionOperand<41, [EnvOpenCL]>;
+defm SPV_INTEL_media_block_io : ExtensionOperand<42, [EnvOpenCL]>;
+defm SPV_EXT_fragment_invocation_density : ExtensionOperand<44, [EnvVulkan]>;
+defm SPV_KHR_no_integer_wrap_decoration : ExtensionOperand<45, [EnvOpenCL]>;
+defm SPV_KHR_float_controls : ExtensionOperand<46, [EnvVulkan, EnvOpenCL]>;
+defm SPV_EXT_physical_storage_buffer : ExtensionOperand<47, [EnvVulkan]>;
+defm SPV_INTEL_fpga_memory_attributes : ExtensionOperand<48, [EnvOpenCL]>;
+defm SPV_NV_cooperative_matrix : ExtensionOperand<49, [EnvVulkan]>;
+defm SPV_INTEL_shader_integer_functions2
+ : ExtensionOperand<50, [EnvVulkan, EnvOpenCL]>;
+defm SPV_INTEL_fpga_loop_controls : ExtensionOperand<51, [EnvOpenCL]>;
+defm SPV_EXT_fragment_shader_interlock : ExtensionOperand<52, [EnvVulkan]>;
+defm SPV_NV_shader_sm_builtins : ExtensionOperand<53, [EnvVulkan]>;
+defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>;
+defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>;
+defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>;
+defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>;
+defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>;
+defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>;
+defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>;
+defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>;
+defm SPV_KHR_non_semantic_info : ExtensionOperand<62, [EnvVulkan, EnvOpenCL]>;
+defm SPV_INTEL_io_pipes : ExtensionOperand<63, [EnvOpenCL]>;
+defm SPV_KHR_ray_tracing : ExtensionOperand<64, [EnvVulkan]>;
+defm SPV_KHR_ray_query : ExtensionOperand<65, [EnvVulkan]>;
+defm SPV_INTEL_fpga_memory_accesses : ExtensionOperand<66, [EnvOpenCL]>;
+defm SPV_INTEL_arbitrary_precision_integers : ExtensionOperand<67, [EnvOpenCL]>;
+defm SPV_EXT_shader_atomic_float_add
+ : ExtensionOperand<68, [EnvVulkan, EnvOpenCL]>;
+defm SPV_KHR_terminate_invocation : ExtensionOperand<69, [EnvVulkan]>;
+defm SPV_KHR_fragment_shading_rate : ExtensionOperand<70, [EnvVulkan]>;
+defm SPV_EXT_shader_image_int64 : ExtensionOperand<71, [EnvVulkan]>;
+defm SPV_INTEL_fp_fast_math_mode : ExtensionOperand<72, [EnvOpenCL]>;
+defm SPV_INTEL_fpga_cluster_attributes : ExtensionOperand<73, [EnvOpenCL]>;
+defm SPV_INTEL_loop_fuse : ExtensionOperand<74, [EnvOpenCL]>;
+defm SPV_EXT_shader_atomic_float_min_max
+ : ExtensionOperand<75, [EnvVulkan, EnvOpenCL]>;
+defm SPV_KHR_workgroup_memory_explicit_layout
+ : ExtensionOperand<76, [EnvVulkan]>;
+defm SPV_KHR_linkonce_odr : ExtensionOperand<77, [EnvOpenCL]>;
+defm SPV_KHR_expect_assume : ExtensionOperand<78, [EnvVulkan, EnvOpenCL]>;
+defm SPV_INTEL_fpga_dsp_control : ExtensionOperand<79, [EnvOpenCL]>;
+defm SPV_NV_bindless_texture : ExtensionOperand<80, [EnvVulkan]>;
+defm SPV_INTEL_fpga_invocation_pipelining_attributes
+ : ExtensionOperand<81, [EnvOpenCL]>;
+defm SPV_KHR_subgroup_uniform_control_flow : ExtensionOperand<82, [EnvVulkan]>;
+defm SPV_HUAWEI_subpass_shading : ExtensionOperand<83, [EnvVulkan]>;
+defm SPV_KHR_integer_dot_product : ExtensionOperand<84, [EnvVulkan, EnvOpenCL]>;
+defm SPV_EXT_shader_atomic_float16_add
+ : ExtensionOperand<85, [EnvVulkan, EnvOpenCL]>;
+defm SPV_INTEL_runtime_aligned : ExtensionOperand<86, [EnvOpenCL]>;
+defm SPV_KHR_bit_instructions : ExtensionOperand<87, [EnvOpenCL]>;
+defm SPV_NV_ray_tracing_motion_blur : ExtensionOperand<88, [EnvVulkan]>;
+defm SPV_KHR_uniform_group_instructions : ExtensionOperand<89, [EnvOpenCL]>;
+defm SPV_KHR_subgroup_rotate : ExtensionOperand<90, [EnvVulkan, EnvOpenCL]>;
+defm SPV_INTEL_split_barrier : ExtensionOperand<91, [EnvOpenCL]>;
+defm SPV_KHR_ray_cull_mask : ExtensionOperand<92, [EnvVulkan]>;
+defm SPV_KHR_fragment_shader_barycentric : ExtensionOperand<93, [EnvVulkan]>;
+defm SPV_EXT_relaxed_printf_string_address_space
+ : ExtensionOperand<94, [EnvOpenCL]>;
+defm SPV_EXT_mesh_shader : ExtensionOperand<96, [EnvVulkan]>;
+defm SPV_ARM_core_builtins : ExtensionOperand<97, [EnvVulkan]>;
+defm SPV_EXT_opacity_micromap : ExtensionOperand<98, [EnvVulkan]>;
+defm SPV_NV_shader_invocation_reorder : ExtensionOperand<99, [EnvVulkan]>;
+defm SPV_INTEL_usm_storage_classes : ExtensionOperand<100, [EnvOpenCL]>;
+defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101, [EnvOpenCL]>;
+defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102, [EnvOpenCL]>;
+defm SPV_INTEL_optnone : ExtensionOperand<103, [EnvOpenCL]>;
+defm SPV_INTEL_function_pointers : ExtensionOperand<104, [EnvOpenCL]>;
+defm SPV_INTEL_variable_length_array : ExtensionOperand<105, [EnvOpenCL]>;
+defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106, [EnvOpenCL]>;
+defm SPV_INTEL_inline_assembly : ExtensionOperand<107, [EnvOpenCL]>;
+defm SPV_INTEL_cache_controls : ExtensionOperand<108, [EnvOpenCL]>;
+defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109, [EnvOpenCL]>;
+defm SPV_INTEL_global_variable_fpga_decorations
+ : ExtensionOperand<110, [EnvOpenCL]>;
+defm SPV_KHR_cooperative_matrix : ExtensionOperand<111, [EnvVulkan, EnvOpenCL]>;
+defm SPV_EXT_arithmetic_fence : ExtensionOperand<112, [EnvOpenCL]>;
+defm SPV_EXT_optnone : ExtensionOperand<113, [EnvOpenCL]>;
+defm SPV_INTEL_joint_matrix : ExtensionOperand<114, [EnvOpenCL]>;
+defm SPV_INTEL_float_controls2 : ExtensionOperand<115, [EnvOpenCL]>;
+defm SPV_INTEL_bindless_images : ExtensionOperand<116, [EnvOpenCL]>;
+defm SPV_INTEL_long_composites : ExtensionOperand<117, [EnvOpenCL]>;
+defm SPV_INTEL_memory_access_aliasing : ExtensionOperand<118, [EnvOpenCL]>;
+defm SPV_INTEL_fp_max_error : ExtensionOperand<119, [EnvOpenCL]>;
+defm SPV_INTEL_ternary_bitwise_function : ExtensionOperand<120, [EnvOpenCL]>;
+defm SPV_INTEL_subgroup_matrix_multiply_accumulate
+ : ExtensionOperand<121, [EnvOpenCL]>;
+defm SPV_INTEL_2d_block_io : ExtensionOperand<122, [EnvOpenCL]>;
+defm SPV_INTEL_int4 : ExtensionOperand<123, [EnvOpenCL]>;
+defm SPV_KHR_float_controls2 : ExtensionOperand<124, [EnvVulkan, EnvOpenCL]>;
+defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Capabilities enum values and at the same time
@@ -342,7 +403,9 @@ class Capability<string name, bits<32> value> {
multiclass CapabilityOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def NAME : Capability<NAME, value>;
- defm : SymbolicOperandWithRequirements<CapabilityOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<CapabilityOperand, value, NAME,
+ minVersion, maxVersion, reqExtensions,
+ reqCapabilities, []>;
}
defm Matrix : CapabilityOperand<0, 0, 0, [], []>;
@@ -551,7 +614,8 @@ class SourceLanguage<string name, bits<32> value> {
multiclass SourceLanguageOperand<bits<32> value> {
def : SourceLanguage<NAME, value>;
- defm : SymbolicOperandWithRequirements<SourceLanguageOperand, value, NAME, 0, 0, [], []>;
+ defm : SymbolicOperandWithRequirements<SourceLanguageOperand, value, NAME, 0,
+ 0, [], [], []>;
}
defm Unknown : SourceLanguageOperand<0>;
@@ -580,7 +644,8 @@ class AddressingModel<string name, bits<32> value> {
multiclass AddressingModelOperand<bits<32> value, list<Capability> reqCapabilities> {
def : AddressingModel<NAME, value>;
- defm : SymbolicOperandWithRequirements<AddressingModelOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<AddressingModelOperand, value, NAME, 0,
+ 0, [], reqCapabilities, []>;
}
defm Logical : AddressingModelOperand<0, []>;
@@ -607,7 +672,8 @@ class ExecutionModel<string name, bits<32> value> {
multiclass ExecutionModelOperand<bits<32> value, list<Capability> reqCapabilities> {
def : ExecutionModel<NAME, value>;
- defm : SymbolicOperandWithRequirements<ExecutionModelOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<ExecutionModelOperand, value, NAME, 0,
+ 0, [], reqCapabilities, []>;
}
defm Vertex : ExecutionModelOperand<0, [Shader]>;
@@ -645,7 +711,8 @@ class MemoryModel<string name, bits<32> value> {
multiclass MemoryModelOperand<bits<32> value, list<Capability> reqCapabilities> {
def : MemoryModel<NAME, value>;
- defm : SymbolicOperandWithRequirements<MemoryModelOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<MemoryModelOperand, value, NAME, 0,
+ 0, [], reqCapabilities, []>;
}
defm Simple : MemoryModelOperand<0, [Shader]>;
@@ -672,7 +739,8 @@ class ExecutionMode<string name, bits<32> value> {
multiclass ExecutionModeOperand<bits<32> value, list<Capability> reqCapabilities> {
def : ExecutionMode<NAME, value>;
- defm : SymbolicOperandWithRequirements<ExecutionModeOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<ExecutionModeOperand, value, NAME, 0,
+ 0, [], reqCapabilities, []>;
}
defm Invocations : ExecutionModeOperand<0, [Geometry]>;
@@ -748,7 +816,8 @@ class StorageClass<string name, bits<32> value> {
multiclass StorageClassOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def : StorageClass<NAME, value>;
- defm : SymbolicOperandWithRequirements<StorageClassOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<StorageClassOperand, value, NAME, 0, 0,
+ reqExtensions, reqCapabilities, []>;
}
defm UniformConstant : StorageClassOperand<0, [], []>;
@@ -794,7 +863,8 @@ class Dim<string name, bits<32> value> {
multiclass DimOperand<bits<32> value, string mnemonic, list<Capability> reqCapabilities> {
def NAME : Dim<NAME, value>;
- defm : SymbolicOperandWithRequirements<DimOperand, value, mnemonic, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<DimOperand, value, mnemonic, 0, 0, [],
+ reqCapabilities, []>;
}
defm DIM_1D : DimOperand<0, "1D", [Sampled1D, Image1D]>;
@@ -824,7 +894,8 @@ class SamplerAddressingMode<string name, bits<32> value> {
multiclass SamplerAddressingModeOperand<bits<32> value, list<Capability> reqCapabilities> {
def : SamplerAddressingMode<NAME, value>;
- defm : SymbolicOperandWithRequirements<SamplerAddressingModeOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<SamplerAddressingModeOperand, value,
+ NAME, 0, 0, [], reqCapabilities, []>;
}
defm None : SamplerAddressingModeOperand<0, [Kernel]>;
@@ -852,7 +923,8 @@ class SamplerFilterMode<string name, bits<32> value> {
multiclass SamplerFilterModeOperand<bits<32> value, list<Capability> reqCapabilities> {
def : SamplerFilterMode<NAME, value>;
- defm : SymbolicOperandWithRequirements<SamplerFilterModeOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<SamplerFilterModeOperand, value, NAME,
+ 0, 0, [], reqCapabilities, []>;
}
defm Nearest : SamplerFilterModeOperand<0, [Kernel]>;
@@ -877,7 +949,8 @@ class ImageFormat<string name, bits<32> value> {
multiclass ImageFormatOperand<bits<32> value, list<Capability> reqCapabilities> {
def NAME : ImageFormat<NAME, value>;
- defm : SymbolicOperandWithRequirements<ImageFormatOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<ImageFormatOperand, value, NAME, 0,
+ 0, [], reqCapabilities, []>;
}
defm Unknown : ImageFormatOperand<0, []>;
@@ -940,7 +1013,8 @@ class ImageChannelOrder<string name, bits<32> value> {
multiclass ImageChannelOrderOperand<bits<32> value, list<Capability> reqCapabilities> {
def : ImageChannelOrder<NAME, value>;
- defm : SymbolicOperandWithRequirements<ImageChannelOrderOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<ImageChannelOrderOperand, value, NAME,
+ 0, 0, [], reqCapabilities, []>;
}
defm R : ImageChannelOrderOperand<0, [Kernel]>;
@@ -983,7 +1057,8 @@ class ImageChannelDataType<string name, bits<32> value> {
multiclass ImageChannelDataTypeOperand<bits<32> value, list<Capability> reqCapabilities> {
def : ImageChannelDataType<NAME, value>;
- defm : SymbolicOperandWithRequirements<ImageChannelDataTypeOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<ImageChannelDataTypeOperand, value,
+ NAME, 0, 0, [], reqCapabilities, []>;
}
defm SnormInt8 : ImageChannelDataTypeOperand<0, []>;
@@ -1023,7 +1098,8 @@ class ImageOperand<string name, bits<32> value> {
multiclass ImageOperandOperand<bits<32> value, list<Capability> reqCapabilities> {
def : ImageOperand<NAME, value>;
- defm : SymbolicOperandWithRequirements<ImageOperandOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<ImageOperandOperand, value, NAME, 0,
+ 0, [], reqCapabilities, []>;
}
defm None : ImageOperandOperand<0x0, []>;
@@ -1061,7 +1137,8 @@ class FPFastMathMode<string name, bits<32> value> {
multiclass FPFastMathModeOperand<bits<32> value, list<Capability> reqCapabilities> {
def : FPFastMathMode<NAME, value>;
- defm : SymbolicOperandWithRequirements<FPFastMathModeOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<FPFastMathModeOperand, value, NAME, 0,
+ 0, [], reqCapabilities, []>;
}
defm None : FPFastMathModeOperand<0x0, []>;
@@ -1090,7 +1167,8 @@ class FPRoundingMode<string name, bits<32> value> {
multiclass FPRoundingModeOperand<bits<32> value> {
def NAME : FPRoundingMode<NAME, value>;
- defm : SymbolicOperandWithRequirements<FPRoundingModeOperand, value, NAME, 0, 0, [], []>;
+ defm : SymbolicOperandWithRequirements<FPRoundingModeOperand, value, NAME, 0,
+ 0, [], [], []>;
}
defm RTE : FPRoundingModeOperand<0>;
@@ -1117,7 +1195,8 @@ class LinkageType<string name, bits<32> value> {
multiclass LinkageTypeOperand<bits<32> value, list<Capability> reqCapabilities> {
def : LinkageType<NAME, value>;
- defm : SymbolicOperandWithRequirements<LinkageTypeOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<LinkageTypeOperand, value, NAME, 0,
+ 0, [], reqCapabilities, []>;
}
defm Export : LinkageTypeOperand<0, [Linkage]>;
@@ -1143,7 +1222,8 @@ class AccessQualifier<string name, bits<32> value> {
multiclass AccessQualifierOperand<bits<32> value, list<Capability> reqCapabilities> {
def NAME : AccessQualifier<NAME, value>;
- defm : SymbolicOperandWithRequirements<AccessQualifierOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<AccessQualifierOperand, value, NAME, 0,
+ 0, [], reqCapabilities, []>;
}
defm ReadOnly : AccessQualifierOperand<0, [Kernel]>;
@@ -1170,7 +1250,9 @@ class FunctionParameterAttribute<string name, bits<32> value> {
multiclass FunctionParameterAttributeOperand<bits<32> value, list<Capability> reqCapabilities> {
def : FunctionParameterAttribute<NAME, value>;
- defm : SymbolicOperandWithRequirements<FunctionParameterAttributeOperand, value, NAME, 0, 0, [], reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<FunctionParameterAttributeOperand,
+ value, NAME, 0, 0, [],
+ reqCapabilities, []>;
}
defm Zext : FunctionParameterAttributeOperand<0, [Kernel]>;
@@ -1202,7 +1284,9 @@ class Decoration<string name, bits<32> value> {
multiclass DecorationOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def : Decoration<NAME, value>;
- defm : SymbolicOperandWithRequirements<DecorationOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<DecorationOperand, value, NAME,
+ minVersion, maxVersion, reqExtensions,
+ reqCapabilities, []>;
}
defm RelaxedPrecision : DecorationOperand<0, 0, 0, [], [Shader]>;
@@ -1303,7 +1387,9 @@ class BuiltIn<string name, bits<32> value> {
multiclass BuiltInOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def NAME : BuiltIn<NAME, value>;
- defm : SymbolicOperandWithRequirements<BuiltInOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<BuiltInOperand, value, NAME,
+ minVersion, maxVersion, reqExtensions,
+ reqCapabilities, []>;
}
defm Position : BuiltInOperand<0, 0, 0, [], [Shader]>;
@@ -1417,7 +1503,8 @@ class SelectionControl<string name, bits<32> value> {
multiclass SelectionControlOperand<bits<32> value> {
def : SelectionControl<NAME, value>;
- defm : SymbolicOperandWithRequirements<SelectionControlOperand, value, NAME, 0, 0, [], []>;
+ defm : SymbolicOperandWithRequirements<SelectionControlOperand, value, NAME,
+ 0, 0, [], [], []>;
}
defm None : SelectionControlOperand<0x0>;
@@ -1443,7 +1530,8 @@ class LoopControl<string name, bits<32> value> {
multiclass LoopControlOperand<bits<32> value> {
def : LoopControl<NAME, value>;
- defm : SymbolicOperandWithRequirements<LoopControlOperand, value, NAME, 0, 0, [], []>;
+ defm : SymbolicOperandWithRequirements<LoopControlOperand, value, NAME, 0,
+ 0, [], [], []>;
}
defm None : LoopControlOperand<0x0>;
@@ -1476,7 +1564,8 @@ class FunctionControl<string name, bits<32> value> {
multiclass FunctionControlOperand<bits<32> value> {
def : FunctionControl<NAME, value>;
- defm : SymbolicOperandWithRequirements<FunctionControlOperand, value, NAME, 0, 0, [], []>;
+ defm : SymbolicOperandWithRequirements<FunctionControlOperand, value, NAME, 0,
+ 0, [], [], []>;
}
defm None : FunctionControlOperand<0x0>;
@@ -1506,7 +1595,9 @@ class MemorySemantics<string name, bits<32> value> {
multiclass MemorySemanticsOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def : MemorySemantics<NAME, value>;
- defm : SymbolicOperandWithRequirements<MemorySemanticsOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<MemorySemanticsOperand, value, NAME,
+ minVersion, maxVersion, reqExtensions,
+ reqCapabilities, []>;
}
defm None : MemorySemanticsOperand<0x0, 0, 0, [], []>;
@@ -1544,7 +1635,9 @@ class MemoryOperand<string name, bits<32> value> {
multiclass MemoryOperandOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def : MemoryOperand<NAME, value>;
- defm : SymbolicOperandWithRequirements<MemoryOperandOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<MemoryOperandOperand, value, NAME,
+ minVersion, maxVersion, reqExtensions,
+ reqCapabilities, []>;
}
defm None : MemoryOperandOperand<0x0, 0, 0, [], []>;
@@ -1577,7 +1670,9 @@ class Scope<string name, bits<32> value> {
multiclass ScopeOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def : Scope<NAME, value>;
- defm : SymbolicOperandWithRequirements<ScopeOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<ScopeOperand, value, NAME, minVersion,
+ maxVersion, reqExtensions,
+ reqCapabilities, []>;
}
defm CrossDevice : ScopeOperand<0, 0, 0, [], []>;
@@ -1607,7 +1702,9 @@ class GroupOperation<string name, bits<32> value> {
multiclass GroupOperationOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def NAME : GroupOperation<NAME, value>;
- defm : SymbolicOperandWithRequirements<GroupOperationOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<GroupOperationOperand, value, NAME,
+ minVersion, maxVersion, reqExtensions,
+ reqCapabilities, []>;
}
defm Reduce : GroupOperationOperand<0, 0, 0, [], [Kernel, GroupNonUniformArithmetic, GroupNonUniformBallot]>;
@@ -1638,7 +1735,9 @@ class KernelEnqueueFlags<string name, bits<32> value> {
multiclass KernelEnqueueFlagsOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def : KernelEnqueueFlags<NAME, value>;
- defm : SymbolicOperandWithRequirements<KernelEnqueueFlagsOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<KernelEnqueueFlagsOperand, value, NAME,
+ minVersion, maxVersion, reqExtensions,
+ reqCapabilities, []>;
}
defm NoWait : KernelEnqueueFlagsOperand<0, 0, 0, [], [Kernel]>;
@@ -1665,7 +1764,9 @@ class KernelProfilingInfo<string name, bits<32> value> {
multiclass KernelProfilingInfoOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def : KernelProfilingInfo<NAME, value>;
- defm : SymbolicOperandWithRequirements<KernelProfilingInfoOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<KernelProfilingInfoOperand, value,
+ NAME, minVersion, maxVersion,
+ reqExtensions, reqCapabilities, []>;
}
defm None : KernelProfilingInfoOperand<0x0, 0, 0, [], []>;
@@ -1690,7 +1791,8 @@ class Opcode<string name, bits<32> value> {
multiclass OpcodeOperand<bits<32> value> {
def : Opcode<NAME, value>;
- defm : SymbolicOperandWithRequirements<OpcodeOperand, value, NAME, 0, 0, [], []>;
+ defm : SymbolicOperandWithRequirements<OpcodeOperand, value, NAME, 0,
+ 0, [], [], []>;
}
// TODO: implement other mnemonics.
defm InBoundsAccessChain : OpcodeOperand<66>;
@@ -1720,7 +1822,9 @@ class CooperativeMatrixLayout<string name, bits<32> value> {
multiclass CooperativeMatrixLayoutOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def : CooperativeMatrixLayout<NAME, value>;
- defm : SymbolicOperandWithRequirements<CooperativeMatrixLayoutOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<CooperativeMatrixLayoutOperand, value,
+ NAME, 0, 0, reqExtensions,
+ reqCapabilities, []>;
}
defm RowMajorKHR : CooperativeMatrixLayoutOperand<0x0, [SPV_KHR_cooperative_matrix], [CooperativeMatrixKHR]>;
@@ -1747,7 +1851,9 @@ class CooperativeMatrixOperands<string name, bits<32> value> {
multiclass CooperativeMatrixOperandsOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def : CooperativeMatrixOperands<NAME, value>;
- defm : SymbolicOperandWithRequirements<CooperativeMatrixOperandsOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<CooperativeMatrixOperandsOperand,
+ value, NAME, 0, 0, reqExtensions,
+ reqCapabilities, []>;
}
defm NoneKHR : CooperativeMatrixOperandsOperand<0x0, [SPV_KHR_cooperative_matrix], [CooperativeMatrixKHR]>;
@@ -1780,7 +1886,9 @@ class SpecConstantOpOperands<string name, bits<32> value> {
multiclass SpecConstantOpOperandsOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
def : SpecConstantOpOperands<NAME, value>;
- defm : SymbolicOperandWithRequirements<SpecConstantOpOperandsOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>;
+ defm : SymbolicOperandWithRequirements<SpecConstantOpOperandsOperand, value,
+ NAME, 0, 0, reqExtensions,
+ reqCapabilities, []>;
}
// Conversion
@@ -1868,7 +1976,9 @@ class MatrixMultiplyAccumulateOperands<string name, bits<32> value> {
multiclass MatrixMultiplyAccumulateOperandsOperand<bits<32> value, list<Extension> reqExtensions> {
def : MatrixMultiplyAccumulateOperands<NAME, value>;
- defm : SymbolicOperandWithRequirements<MatrixMultiplyAccumulateOperandsOperand, value, NAME, 0, 0, reqExtensions, []>;
+ defm : SymbolicOperandWithRequirements<
+ MatrixMultiplyAccumulateOperandsOperand, value, NAME, 0, 0,
+ reqExtensions, [], []>;
}
defm None : MatrixMultiplyAccumulateOperandsOperand<0x0, [SPV_INTEL_subgroup_matrix_multiply_accumulate]>;
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index c0fc3a6..dd22132 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1799,12 +1799,14 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FREM , MVT::f64, Expand);
- setOperationAction(ISD::FMA , MVT::f64, Expand);
+ setOperationAction(ISD::FMA, MVT::f64,
+ Subtarget->isUA2007() ? Legal : Expand);
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FREM , MVT::f32, Expand);
- setOperationAction(ISD::FMA, MVT::f32, Expand);
+ setOperationAction(ISD::FMA, MVT::f32,
+ Subtarget->isUA2007() ? Legal : Expand);
setOperationAction(ISD::ROTL , MVT::i32, Expand);
setOperationAction(ISD::ROTR , MVT::i32, Expand);
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
@@ -2278,21 +2280,15 @@ SDValue SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain,
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- ArgListEntry Entry;
- Entry.Node = Arg;
- Entry.Ty = ArgTy;
-
if (ArgTy->isFP128Ty()) {
// Create a stack object and pass the pointer to the library function.
int FI = MFI.CreateStackObject(16, Align(8), false);
SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- Chain = DAG.getStore(Chain, DL, Entry.Node, FIPtr, MachinePointerInfo(),
- Align(8));
-
- Entry.Node = FIPtr;
- Entry.Ty = PointerType::getUnqual(ArgTy->getContext());
+ Chain = DAG.getStore(Chain, DL, Arg, FIPtr, MachinePointerInfo(), Align(8));
+ Args.emplace_back(FIPtr, PointerType::getUnqual(ArgTy->getContext()));
+ } else {
+ Args.emplace_back(Arg, ArgTy);
}
- Args.push_back(Entry);
return Chain;
}
@@ -2314,11 +2310,9 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
if (RetTy->isFP128Ty()) {
// Create a Stack Object to receive the return value of type f128.
- ArgListEntry Entry;
int RetFI = MFI.CreateStackObject(16, Align(8), false);
RetPtr = DAG.getFrameIndex(RetFI, PtrVT);
- Entry.Node = RetPtr;
- Entry.Ty = PointerType::getUnqual(RetTy->getContext());
+ ArgListEntry Entry(RetPtr, PointerType::getUnqual(RetTy->getContext()));
if (!Subtarget->is64Bit()) {
Entry.IsSRet = true;
Entry.IndirectType = RetTy;
@@ -3550,6 +3544,11 @@ bool SparcTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
return isCheapToSpeculateCtlz(Ty);
}
+bool SparcTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const {
+ return Subtarget->isUA2007() && !Subtarget->useSoftFloat();
+}
+
// Override to disable global variable loading on Linux.
void SparcTargetLowering::insertSSPDeclarations(Module &M) const {
if (!Subtarget->isTargetLinux())
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h
index 0d220f8..7fffb7c 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.h
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -28,6 +28,8 @@ namespace llvm {
bool useSoftFloat() const override;
+ bool softPromoteHalfType() const override { return true; }
+
/// computeKnownBitsForTargetNode - Determine which of the bits specified
/// in Mask are known to be either zero or one and return them in the
/// KnownZero/KnownOne bitsets.
@@ -177,6 +179,11 @@ namespace llvm {
bool isCheapToSpeculateCttz(Type *Ty) const override;
+ bool enableAggressiveFMAFusion(EVT VT) const override { return true; };
+
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const override;
+
bool shouldInsertFencesForAtomic(const Instruction *I) const override {
// FIXME: We insert fences for each atomics and generate
// sub-optimal code for PSO/TSO. (Approximately nobody uses any
diff --git a/llvm/lib/Target/Sparc/SparcInstrUAOSA.td b/llvm/lib/Target/Sparc/SparcInstrUAOSA.td
index 3a30e55..ffd4423 100644
--- a/llvm/lib/Target/Sparc/SparcInstrUAOSA.td
+++ b/llvm/lib/Target/Sparc/SparcInstrUAOSA.td
@@ -66,3 +66,15 @@ defm CXBCOND : F2_56<"cxb", 1>;
def FPMADDX : FourOp<"fpmaddx", 0b110111, 0b0000, DFPRegs>;
def FPMADDXHI : FourOp<"fpmaddxhi", 0b110111, 0b0100, DFPRegs>;
} // Predicates = [HasOSA2011]
+
+// UA2007 instruction patterns.
+let Predicates = [HasUA2007] in {
+def : Pat<(f32 (any_fma f32:$rs1, f32:$rs2, f32:$add)), (FMADDS $rs1, $rs2, $add)>;
+def : Pat<(f64 (any_fma f64:$rs1, f64:$rs2, f64:$add)), (FMADDD $rs1, $rs2, $add)>;
+def : Pat<(f32 (any_fma f32:$rs1, f32:$rs2, (fneg f32:$sub))), (FMSUBS $rs1, $rs2, $sub)>;
+def : Pat<(f64 (any_fma f64:$rs1, f64:$rs2, (fneg f64:$sub))), (FMSUBD $rs1, $rs2, $sub)>;
+def : Pat<(f32 (fneg (any_fma f32:$rs1, f32:$rs2, f32:$add))), (FNMADDS $rs1, $rs2, $add)>;
+def : Pat<(f64 (fneg (any_fma f64:$rs1, f64:$rs2, f64:$add))), (FNMADDD $rs1, $rs2, $add)>;
+def : Pat<(f32 (fneg (any_fma f32:$rs1, f32:$rs2, (fneg f32:$sub)))), (FNMSUBS $rs1, $rs2, $sub)>;
+def : Pat<(f64 (fneg (any_fma f64:$rs1, f64:$rs2, (fneg f64:$sub)))), (FNMSUBD $rs1, $rs2, $sub)>;
+} // Predicates = [HasUA2007]
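
A minimal source-level sketch of the kind of code the new UA2007 patterns and the isFMAFasterThanFMulAndFAdd hook affect; this assumes a SPARC subtarget with the UA2007 feature and hard float, and is an illustration, not part of the patch:

// Sketch: with UA2007 available, the fused multiply-add formed from the
// expression below can now select a single fmaddd instead of separate
// fmuld and faddd instructions.
double mac(double a, double b, double c) {
  return a * b + c; // contraction / fmuladd candidate
}
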
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
index fbb98ff..f5ffbf5 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -29,50 +29,6 @@ namespace SystemZ {
extern const MCPhysReg XPLINK64ArgFPRs[XPLINK64NumArgFPRs];
} // end namespace SystemZ
-class SystemZCCState : public CCState {
-private:
- /// Records whether the value was widened from a short vector type.
- SmallVector<bool, 4> ArgIsShortVector;
-
- // Check whether ArgVT is a short vector type.
- bool IsShortVectorType(EVT ArgVT) {
- return ArgVT.isVector() && ArgVT.getStoreSize() <= 8;
- }
-
-public:
- SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
- SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
- : CCState(CC, isVarArg, MF, locs, C) {}
-
- void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
- CCAssignFn Fn) {
- // Record whether the call operand was a short vector.
- ArgIsShortVector.clear();
- for (unsigned i = 0; i < Ins.size(); ++i)
- ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT));
-
- CCState::AnalyzeFormalArguments(Ins, Fn);
- }
-
- void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
- CCAssignFn Fn) {
- // Record whether the call operand was a short vector.
- ArgIsShortVector.clear();
- for (unsigned i = 0; i < Outs.size(); ++i)
- ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT));
-
- CCState::AnalyzeCallOperands(Outs, Fn);
- }
-
- // This version of AnalyzeCallOperands in the base class is not usable
- // since we must provide a means of accessing ISD::OutputArg::IsShortVector.
- void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
- SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
- CCAssignFn Fn) = delete;
-
- bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; }
-};
-
// Handle i128 argument types. These need to be passed by implicit
// reference. This could be as simple as the following .td line:
// CCIfType<[i128], CCPassIndirect<i64>>,
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 059f31f..2795de5 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -18,7 +18,7 @@ class CCIfSubtarget<string F, CCAction A>
// Match if this specific argument was widened from a short vector type.
class CCIfShortVector<CCAction A>
- : CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;
+ : CCIf<"OrigTy->isVectorTy() && OrigTy->getPrimitiveSizeInBits() <= 64", A>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 5ee66e3..dcefff9 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index fb0a47d..c73dc30 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1941,7 +1941,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
FuncInfo->setSizeOfFnParams(CCInfo.getStackSize());
@@ -2251,7 +2251,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, Ctx);
+ CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, Ctx);
ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
// We don't support GuaranteedTailCallOpt, only automatically-detected
@@ -2460,10 +2460,9 @@ std::pair<SDValue, SDValue> SystemZTargetLowering::makeExternalCall(
TargetLowering::ArgListTy Args;
Args.reserve(Ops.size());
- TargetLowering::ArgListEntry Entry;
for (SDValue Op : Ops) {
- Entry.Node = Op;
- Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+ TargetLowering::ArgListEntry Entry(
+ Op, Op.getValueType().getTypeForEVT(*DAG.getContext()));
Entry.IsSExt = shouldSignExtendTypeInLibCall(Entry.Ty, IsSigned);
Entry.IsZExt = !Entry.IsSExt;
Args.push_back(Entry);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 1866962..707887c 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -523,7 +523,7 @@ public:
bool MathUsed) const override {
// Form add and sub with overflow intrinsics regardless of any extra
// users of the math result.
- return VT == MVT::i32 || VT == MVT::i64;
+ return VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i128;
}
bool shouldConsiderGEPOffsetSplit() const override { return true; }
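
A quick sketch of the overflow idiom that the widened shouldFormOverflowOp hook now lets CodeGenPrepare convert on SystemZ; unsigned __int128 is used here as a stand-in for an IR-level i128 (illustration only, not from the patch):

// Sketch: the add-then-compare wrap check below is the pattern that can now
// be folded into llvm.uadd.with.overflow.i128 rather than staying as a
// separate add and compare.
bool add_wraps(unsigned __int128 a, unsigned __int128 b,
               unsigned __int128 *sum) {
  unsigned __int128 s = a + b;
  *sum = s;
  return s < a; // unsigned overflow check
}
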
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index f32c9bd..2611c29 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -436,20 +436,6 @@ bool SystemZTTIImpl::isLSRCostLess(
C2.ScaleCost, C2.SetupCost);
}
-bool SystemZTTIImpl::areInlineCompatible(const Function *Caller,
- const Function *Callee) const {
- const TargetMachine &TM = getTLI()->getTargetMachine();
-
- const FeatureBitset &CallerBits =
- TM.getSubtargetImpl(*Caller)->getFeatureBits();
- const FeatureBitset &CalleeBits =
- TM.getSubtargetImpl(*Callee)->getFeatureBits();
-
- // Support only equal feature bitsets. Restriction should be relaxed in the
- // future to allow inlining when callee's bits are subset of the caller's.
- return CallerBits == CalleeBits;
-}
-
unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
bool Vector = (ClassID == 1);
if (!Vector)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index dc5736e..fc681de 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -65,9 +65,6 @@ public:
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
const TargetTransformInfo::LSRCost &C2) const override;
- bool areInlineCompatible(const Function *Caller,
- const Function *Callee) const override;
-
/// @}
/// \name Vector TTI Implementations
diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp
index 9b03e85..28495e7 100644
--- a/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -151,17 +151,17 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer,
SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
M.getModuleFlagsMetadata(ModuleFlags);
- MDNode *CFGProfile = nullptr;
+ MDNode *CGProfile = nullptr;
for (const auto &MFE : ModuleFlags) {
StringRef Key = MFE.Key->getString();
if (Key == "CG Profile") {
- CFGProfile = cast<MDNode>(MFE.Val);
+ CGProfile = cast<MDNode>(MFE.Val);
break;
}
}
- if (!CFGProfile)
+ if (!CGProfile)
return;
auto GetSym = [this](const MDOperand &MDO) -> MCSymbol * {
@@ -174,7 +174,7 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer,
return TM->getSymbol(F);
};
- for (const auto &Edge : CFGProfile->operands()) {
+ for (const auto &Edge : CGProfile->operands()) {
MDNode *E = cast<MDNode>(Edge);
const MCSymbol *From = GetSym(E->getOperand(0));
const MCSymbol *To = GetSym(E->getOperand(1));
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 9e8f400..2cfdc75 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -1649,14 +1649,11 @@ SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
// Prepare arguments
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Node = Size;
- Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
- Args.push_back(Entry);
+ Args.emplace_back(Size, Size.getValueType().getTypeForEVT(*DAG.getContext()));
if (NeedsAlign) {
- Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT);
- Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
- Args.push_back(Entry);
+ SDValue Align = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT);
+ Args.emplace_back(Align,
+ Align.getValueType().getTypeForEVT(*DAG.getContext()));
}
Type *RetTy = Type::getVoidTy(*DAG.getContext());
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 80df4ed..45bbf12 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -220,7 +220,6 @@ static MCSymbolWasm *getOrCreateFunctionTableSymbol(MCContext &Ctx,
Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name));
Sym->setFunctionTable(Is64);
// The default function table is synthesized by the linker.
- Sym->setUndefined();
}
return Sym;
}
diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 2a398d4..fa6086c 100644
--- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -26,7 +26,6 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/LEB128.h"
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index ec95e86..2666342 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -912,6 +912,8 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
if (!IsVoid)
updateValueMap(Call, ResultReg);
+
+ diagnoseDontCall(*Call);
return true;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index f9eba4b..35d5c3e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1320,18 +1320,21 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
// signature They are necessary to match callee and caller signature for
// indirect call.
if (CallConv == CallingConv::Swift) {
+ Type *PtrTy = PointerType::getUnqual(*DAG.getContext());
if (!HasSwiftSelfArg) {
NumFixedArgs++;
- ISD::OutputArg Arg;
- Arg.Flags.setSwiftSelf();
+ ISD::ArgFlagsTy Flags;
+ Flags.setSwiftSelf();
+ ISD::OutputArg Arg(Flags, PtrVT, EVT(PtrVT), PtrTy, 0, 0);
CLI.Outs.push_back(Arg);
SDValue ArgVal = DAG.getUNDEF(PtrVT);
CLI.OutVals.push_back(ArgVal);
}
if (!HasSwiftErrorArg) {
NumFixedArgs++;
- ISD::OutputArg Arg;
- Arg.Flags.setSwiftError();
+ ISD::ArgFlagsTy Flags;
+ Flags.setSwiftError();
+ ISD::OutputArg Arg(Flags, PtrVT, EVT(PtrVT), PtrTy, 0, 0);
CLI.Outs.push_back(Arg);
SDValue ArgVal = DAG.getUNDEF(PtrVT);
CLI.OutVals.push_back(ArgVal);
@@ -3383,8 +3386,56 @@ static SDValue TryMatchTrue(SDNode *N, EVT VecVT, SelectionDAG &DAG) {
return DAG.getZExtOrTrunc(Ret, DL, N->getValueType(0));
}
+/// Try to convert an i128 comparison to a v16i8 comparison before type
+/// legalization splits it up into chunks.
+static SDValue
+combineVectorSizedSetCCEquality(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const WebAssemblySubtarget *Subtarget) {
+
+ SDLoc DL(N);
+ SDValue X = N->getOperand(0);
+ SDValue Y = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT OpVT = X.getValueType();
+
+ SelectionDAG &DAG = DCI.DAG;
+ if (DCI.DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat))
+ return SDValue();
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ // We're looking for an oversized integer equality comparison with SIMD
+ if (!OpVT.isScalarInteger() || !OpVT.isByteSized() || OpVT != MVT::i128 ||
+ !Subtarget->hasSIMD128() || !isIntEqualitySetCC(CC))
+ return SDValue();
+
+ // Don't perform this combine if constructing the vector will be expensive.
+ auto IsVectorBitCastCheap = [](SDValue X) {
+ X = peekThroughBitcasts(X);
+ return isa<ConstantSDNode>(X) || X.getOpcode() == ISD::LOAD;
+ };
+
+ if (!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y))
+ return SDValue();
+
+ SDValue VecX = DAG.getBitcast(MVT::v16i8, X);
+ SDValue VecY = DAG.getBitcast(MVT::v16i8, Y);
+ SDValue Cmp = DAG.getSetCC(DL, MVT::v16i8, VecX, VecY, CC);
+
+ SDValue Intr =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+ {DAG.getConstant(CC == ISD::SETEQ ? Intrinsic::wasm_alltrue
+ : Intrinsic::wasm_anytrue,
+ DL, MVT::i32),
+ Cmp});
+
+ return DAG.getSetCC(DL, VT, Intr, DAG.getConstant(0, DL, MVT::i32),
+ ISD::SETNE);
+}
+
static SDValue performSETCCCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const WebAssemblySubtarget *Subtarget) {
if (!DCI.isBeforeLegalize())
return SDValue();
@@ -3392,6 +3443,9 @@ static SDValue performSETCCCombine(SDNode *N,
if (!VT.isScalarInteger())
return SDValue();
+ if (SDValue V = combineVectorSizedSetCCEquality(N, DCI, Subtarget))
+ return V;
+
SDValue LHS = N->getOperand(0);
if (LHS->getOpcode() != ISD::BITCAST)
return SDValue();
@@ -3571,7 +3625,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BITCAST:
return performBitcastCombine(N, DCI);
case ISD::SETCC:
- return performSETCCCombine(N, DCI);
+ return performSETCCCombine(N, DCI, Subtarget);
case ISD::VECTOR_SHUFFLE:
return performVECTOR_SHUFFLECombine(N, DCI);
case ISD::SIGN_EXTEND:
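
To make the new i128 setcc combine above concrete, a hedged C++-level sketch of the code shape it targets, assuming SIMD128 is enabled and both operands are plain loads (so the bitcast to v16i8 is considered cheap); this is illustrative and not part of the patch:

#include <cstring>

// Sketch: the 128-bit equality below compares two freshly loaded values, so
// it can lower to one i8x16 compare feeding all_true / any_true instead of
// being split into scalar i64 pieces during type legalization.
bool eq128(const unsigned char *a, const unsigned char *b) {
  unsigned __int128 x, y;
  std::memcpy(&x, a, sizeof x);
  std::memcpy(&y, b, sizeof y);
  return x == y;
}
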
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 4548a75..45b0e7d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -533,8 +533,8 @@ struct StaticLibcallNameMap {
// different libcalls.
RTLIB::RuntimeLibcallsInfo RTCI(TT);
for (RTLIB::Libcall LC : RTLIB::libcalls()) {
- const char *NameLibcall = RTCI.getLibcallName(LC);
- if (NameLibcall != nullptr &&
+ StringRef NameLibcall = RTCI.getLibcallName(LC);
+ if (!NameLibcall.empty() &&
getRuntimeLibcallSignatures().Table[LC] != unsupported) {
assert(!Map.contains(NameLibcall) &&
"duplicate libcall names in name map");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 52e7065..08fb758 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -147,7 +147,8 @@ WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
Options.AllowOverlappingLoads = true;
- // TODO: Teach WebAssembly backend about load v128.
+ if (ST->hasSIMD128())
+ Options.LoadSizes.push_back(16);
Options.LoadSizes.append({8, 4, 2, 1});
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
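
A usage-level sketch of what the added 16-byte load size enables for memcmp expansion, assuming SIMD128 and a constant-length call whose result is only tested against zero (illustrative, not part of the patch):

#include <cstring>

// Sketch: this fixed 16-byte equality check can now expand to a single
// 128-bit load per operand plus a vector compare, rather than two pairs of
// 8-byte loads.
bool same16(const void *a, const void *b) {
  return std::memcmp(a, b, 16) == 0;
}
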
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index 42d1271..8904867 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -113,7 +113,6 @@ MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol(
Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name));
Sym->setFunctionTable(is64);
// The default function table is synthesized by the linker.
- Sym->setUndefined();
}
// MVP object files can't have symtab entries for tables.
if (!(Subtarget && Subtarget->hasCallIndirectOverlong()))
diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
index d9f4405..c0b9339 100644
--- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
+++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
@@ -69,7 +69,7 @@ public:
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
CCState &State) override {
- bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
+ bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, Info.Ty, State);
StackSize = State.getStackSize();
static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2,
diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
index 817e88d..e2a1bbf3 100644
--- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
@@ -36,11 +36,31 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst,
}
}
+void X86InstrPostProcess::useStackEngine(std::unique_ptr<Instruction> &Inst,
+ const MCInst &MCI) {
+ // TODO(boomanaiden154): We currently do not handle PUSHF/POPF because we
+ // have not done the necessary benchmarking to see if they are also
+ // optimized by the stack engine.
+ // TODO: We currently just remove all RSP writes from stack operations. This
+ // is not fully correct because we do not model sync uops which will
+  //       delay subsequent rsp-using non-stack instructions.
+ if (X86::isPOP(MCI.getOpcode()) || X86::isPUSH(MCI.getOpcode())) {
+ auto *StackRegisterDef =
+ llvm::find_if(Inst->getDefs(), [](const WriteState &State) {
+ return State.getRegisterID() == X86::RSP;
+ });
+ assert(
+ StackRegisterDef != Inst->getDefs().end() &&
+ "Expected push instruction to implicitly use stack pointer register.");
+ Inst->getDefs().erase(StackRegisterDef);
+ }
+}
+
void X86InstrPostProcess::postProcessInstruction(
std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
- // Currently, we only modify certain instructions' IsALoadBarrier and
- // IsAStoreBarrier flags.
+ // Set IsALoadBarrier and IsAStoreBarrier flags.
setMemBarriers(Inst, MCI);
+ useStackEngine(Inst, MCI);
}
} // namespace mca
diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
index 4a83ba8..c5459e4 100644
--- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
@@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess {
/// as load and store barriers.
void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+  /// Called within X86InstrPostProcess to remove the rsp write operands
+ /// on stack instructions to better simulate the stack engine. We currently
+ /// do not model features of the stack engine like sync uops.
+ void useStackEngine(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+
public:
X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
: InstrPostProcess(STI, MCII) {}
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 990b381..3d34ea3 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1291,7 +1291,9 @@ def ProcessorFeatures {
list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps,
TuningPreferMovmskOverVTest,
TuningFastImmVectorShift];
- list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning);
+ list<SubtargetFeature> ADLRemoveTuning = [TuningPOPCNTFalseDeps];
+ list<SubtargetFeature> ADLTuning =
+ !listremove(!listconcat(SKLTuning, ADLAdditionalTuning), ADLRemoveTuning);
list<SubtargetFeature> ADLFeatures =
!listconcat(TRMFeatures, ADLAdditionalFeatures);
diff --git a/llvm/lib/Target/X86/X86CallingConv.h b/llvm/lib/Target/X86/X86CallingConv.h
index 191e0fa..8e37f34 100644
--- a/llvm/lib/Target/X86/X86CallingConv.h
+++ b/llvm/lib/Target/X86/X86CallingConv.h
@@ -22,10 +22,10 @@ namespace llvm {
bool RetCC_X86(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State);
+ Type *OrigTy, CCState &State);
bool CC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State);
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State);
} // End llvm namespace
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 067bd43..f007886 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -3323,6 +3323,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
return false;
SmallVector<MVT, 16> OutVTs;
+ SmallVector<Type *, 16> ArgTys;
SmallVector<Register, 16> ArgRegs;
// If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
@@ -3369,6 +3370,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
ArgRegs.push_back(ResultReg);
OutVTs.push_back(VT);
+ ArgTys.push_back(Val->getType());
}
// Analyze operands of the call, assigning locations to each operand.
@@ -3379,7 +3381,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (IsWin64)
CCInfo.AllocateStack(32, Align(8));
- CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
+ CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, ArgTys, CC_X86);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 95ed590..cba7843 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/EHPersonalities.h"
@@ -2678,7 +2679,7 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
// object.
// We need to factor in additional offsets applied during the prologue to the
// frame, base, and stack pointer depending on which is used.
- int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
+ int64_t Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
unsigned CSSize = X86FI->getCalleeSavedFrameSize();
uint64_t StackSize = MFI.getStackSize();
@@ -4212,6 +4213,14 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
// emitPrologue if it gets called and emits CFI.
MF.setHasWinCFI(false);
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ // If the frame is big enough that we might need to scavenge a register to
+ // handle huge offsets, reserve a stack slot for that now.
+ if (!isInt<32>(MFI.estimateStackSize(MF))) {
+ int FI = MFI.CreateStackObject(SlotSize, Align(SlotSize), false);
+ RS->addScavengingFrameIndex(FI);
+ }
+
// If we are using Windows x64 CFI, ensure that the stack is always 8 byte
// aligned. The format doesn't support misaligned stack adjustments.
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f366094..8c3380b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2756,8 +2756,10 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
!Subtarget.hasBWI())
return TypeSplitVector;
+ // Since v8f16 is legal, widen anything over v4f16.
if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
- !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
+ VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
+ VT.getVectorElementType() == MVT::f16)
return TypeSplitVector;
if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
@@ -15419,18 +15421,18 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
return SDValue();
}
- // Avoid returning the same shuffle operation. For example,
- // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
- // undef:v16i16
- if (CrossLaneMask == Mask || InLaneMask == Mask)
- return SDValue();
-
// Simplify CrossLaneMask based on the actual demanded elements.
if (V1.hasOneUse())
for (int i = 0; i != NumElts; ++i)
if (!DemandedCrossLane[i])
CrossLaneMask[i] = SM_SentinelUndef;
+ // Avoid returning the same shuffle operation. For example,
+ // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
+ // undef:v16i16
+ if (CrossLaneMask == Mask || InLaneMask == Mask)
+ return SDValue();
+
SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
InLaneMask);
@@ -22219,9 +22221,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
In = DAG.getBitcast(MVT::i16, In);
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Node = In;
- Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
+ TargetLowering::ArgListEntry Entry(
+ In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
Entry.IsSExt = false;
Entry.IsZExt = true;
Args.push_back(Entry);
@@ -22318,9 +22319,8 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Node = In;
- Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
+ TargetLowering::ArgListEntry Entry(
+ In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
Entry.IsSExt = false;
Entry.IsZExt = true;
Args.push_back(Entry);
@@ -30049,7 +30049,6 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
SDValue InChain = DAG.getEntryNode();
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
EVT ArgVT = Op->getOperand(i).getValueType();
assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
@@ -30058,13 +30057,9 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
- Entry.Node = StackPtr;
InChain =
DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
- Entry.Ty = PointerType::get(*DAG.getContext(), 0);
- Entry.IsSExt = false;
- Entry.IsZExt = false;
- Args.push_back(Entry);
+ Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
}
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
@@ -33087,13 +33082,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
-
- Entry.Node = Arg;
- Entry.Ty = ArgTy;
- Entry.IsSExt = false;
- Entry.IsZExt = false;
- Args.push_back(Entry);
+ Args.emplace_back(Arg, ArgTy);
bool isF64 = ArgVT == MVT::f64;
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
@@ -45163,6 +45152,9 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT:
return false;
+ // SSE signbit extraction.
+ case X86ISD::MOVMSK:
+ return false;
case ISD::INTRINSIC_WO_CHAIN:
switch (Op->getConstantOperandVal(0)) {
case Intrinsic::x86_sse2_pmadd_wd:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 547b221..3dd79b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1663,8 +1663,8 @@ namespace llvm {
/// instructions/intrinsics.
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices,
- unsigned Factor) const override;
+ ArrayRef<unsigned> Indices, unsigned Factor,
+ const APInt &GapMask) const override;
/// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 636b072..632db7e 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices, unsigned Factor) const {
+ ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
@@ -812,7 +812,7 @@ bool X86TargetLowering::lowerInterleavedLoad(
auto *LI = dyn_cast<LoadInst>(Load);
if (!LI)
return false;
- assert(!Mask && "Unexpected mask on a load");
+ assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
// Create an interleaved access group.
IRBuilder<> Builder(LI);
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
index cf055cf..090060e 100644
--- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -491,7 +491,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph(
NumGadgets += GadgetCount;
// Traverse CFG to build the rest of the graph
- SmallSet<MachineBasicBlock *, 8> BlocksVisited;
+ SmallPtrSet<MachineBasicBlock *, 8> BlocksVisited;
std::function<void(MachineBasicBlock *, GraphIter, unsigned)> TraverseCFG =
[&](MachineBasicBlock *MBB, GraphIter GI, unsigned ParentDepth) {
unsigned LoopDepth = MLI.getLoopDepth(MBB);
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index 3b4e531..2a1c499 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -100,7 +100,7 @@ struct BBInfo {
class X86PreTileConfig : public MachineFunctionPass {
MachineRegisterInfo *MRI = nullptr;
const MachineLoopInfo *MLI = nullptr;
- SmallSet<MachineInstr *, 8> DefVisited;
+ SmallPtrSet<MachineInstr *, 8> DefVisited;
DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;
DenseMap<MachineBasicBlock *, SmallVector<MIRef, 8>> ShapeBBs;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 83b11ee..595ad32 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -21,8 +21,8 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TileShapeInfo.h"
@@ -907,7 +907,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
// Determine base register and offset.
- int FIOffset;
+ int64_t FIOffset;
Register BasePtr;
if (MI.isReturn()) {
assert((!hasStackRealignment(MF) ||
@@ -958,11 +958,41 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
if (MI.getOperand(FIOperandNum+3).isImm()) {
- // Offset is a 32-bit integer.
- int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
- int Offset = FIOffset + Imm;
- assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
- "Requesting 64-bit offset in 32-bit immediate!");
+ const X86InstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ int64_t Imm = MI.getOperand(FIOperandNum + 3).getImm();
+ int64_t Offset = FIOffset + Imm;
+ bool FitsIn32Bits = isInt<32>(Offset);
+ // If the offset will not fit in a 32-bit displacement, then for 64-bit
+ // targets, scavenge a register to hold it. Otherwise...
+ if (Is64Bit && !FitsIn32Bits) {
+ assert(RS && "RegisterScavenger was NULL");
+
+ RS->enterBasicBlockEnd(MBB);
+ RS->backward(std::next(II));
+
+ Register ScratchReg = RS->scavengeRegisterBackwards(
+ X86::GR64RegClass, II, /*RestoreAfter=*/false, /*SPAdj=*/0,
+ /*AllowSpill=*/true);
+ assert(ScratchReg != 0 && "scratch reg was 0");
+ RS->setRegUsed(ScratchReg);
+
+ BuildMI(MBB, II, DL, TII->get(X86::MOV64ri), ScratchReg).addImm(Offset);
+
+ MI.getOperand(FIOperandNum + 3).setImm(0);
+ MI.getOperand(FIOperandNum + 2).setReg(ScratchReg);
+
+ return false;
+ }
+
+ // ... for 32-bit targets, this is a bug!
+ if (!Is64Bit && !FitsIn32Bits) {
+ MI.emitGenericError("64-bit offset calculated but target is 32-bit");
+ // Trap so that the instruction verification pass does not fail if run.
+ BuildMI(MBB, MBBI, DL, TII->get(X86::TRAP));
+ return false;
+ }
+
if (Offset != 0 || !tryOptimizeLEAtoMOV(II))
MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
} else {
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h
index 19b409a..2f4c55c 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
#define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#define GET_REGINFO_HEADER
@@ -180,6 +181,10 @@ public:
constrainRegClassToNonRex2(const TargetRegisterClass *RC) const;
bool isNonRex2RegClass(const TargetRegisterClass *RC) const;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
};
} // End llvm namespace
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 8cd52e2..f15a7c7 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -70,6 +70,12 @@ def SKLPortAny : ProcResGroup<[SKLPort0, SKLPort1, SKLPort2, SKLPort3, SKLPort4,
let BufferSize=60;
}
+// Skylake can retire up to four (potentially fused) uops per cycle. Set the
+// limit to twice that, since we do not model fused uops as taking up only one
+// retirement slot. There do not appear to be any documented sources on how
+// many in-flight micro-ops can be tracked.
+def SKRCU : RetireControlUnit<0, 8>;
+
// Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 5>;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 14a51d1e..2a793d0 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -70,6 +70,12 @@ def SKXPortAny : ProcResGroup<[SKXPort0, SKXPort1, SKXPort2, SKXPort3, SKXPort4,
let BufferSize=60;
}
+// Skylake can retire up to four (potentially fused) uops per cycle. Set the
+// limit to twice that, since we do not model fused uops as taking up only one
+// retirement slot. There do not appear to be any documented sources on how
+// many in-flight micro-ops can be tracked.
+def SKXRCU : RetireControlUnit<0, 8>;
+
// Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 5>;
diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td
index c92bc97..133c1a4 100644
--- a/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -562,14 +562,7 @@ def AtomWrite0_1_7_4 : SchedWriteRes<[AtomPort0,AtomPort1]> {
let ReleaseAtCycles = [8,8];
let NumMicroOps = 4;
}
-def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrr(_Int)?")>;
-
-def AtomWrite0_1_8_4 : SchedWriteRes<[AtomPort0,AtomPort1]> {
- let Latency = 8;
- let ReleaseAtCycles = [8,8];
- let NumMicroOps = 4;
-}
-def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrm(_Int)?")>;
+def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSr(r|m)(_Int)?")>;
def AtomWrite0_1_9 : SchedWriteRes<[AtomPort0,AtomPort1]> {
let Latency = 9;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 90791fc..62f9527 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -161,19 +161,26 @@ std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}
+enum ClassIDEnum { GPRClass = 0, VectorClass = 1, ScalarFPClass = 2 };
+
+unsigned X86TTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
+ return Vector ? VectorClass
+ : Ty && Ty->isFloatingPointTy() ? ScalarFPClass
+ : GPRClass;
+}
+
unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
- bool Vector = (ClassID == 1);
- if (Vector && !ST->hasSSE1())
+ if (ClassID == VectorClass && !ST->hasSSE1())
return 0;
- if (ST->is64Bit()) {
- if (Vector && ST->hasAVX512())
- return 32;
- if (!Vector && ST->hasEGPR())
- return 32;
- return 16;
- }
- return 8;
+ if (!ST->is64Bit())
+ return 8;
+
+ if ((ClassID == GPRClass && ST->hasEGPR()) ||
+ (ClassID != GPRClass && ST->hasAVX512()))
+ return 32;
+
+ return 16;
}
bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const {
@@ -5488,9 +5495,10 @@ InstructionCost X86TTIImpl::getPointersChainCost(
return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
}
-InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
- ScalarEvolution *SE,
- const SCEV *Ptr) const {
+InstructionCost
+X86TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+ const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -5504,7 +5512,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
// Even in the case of (loop invariant) stride whose value is not known at
// compile time, the address computation will not incur more than one extra
// ADD instruction.
- if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
+ if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) {
// TODO: AVX2 is the current cut-off because we don't have correct
// interleaving costs for prior ISA's.
if (!BaseT::isStridedAccess(Ptr))
@@ -5513,7 +5521,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
return 1;
}
- return BaseT::getAddressComputationCost(Ty, SE, Ptr);
+ return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
}
InstructionCost
@@ -6525,8 +6533,8 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
for (const Instruction &I : instructions(Callee)) {
if (const auto *CB = dyn_cast<CallBase>(&I)) {
- // Having more target features is fine for inline ASM.
- if (CB->isInlineAsm())
+ // Having more target features is fine for inline ASM and intrinsics.
+ if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic)
continue;
SmallVector<Type *, 8> Types;
@@ -6542,19 +6550,9 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
if (all_of(Types, IsSimpleTy))
continue;
- if (Function *NestedCallee = CB->getCalledFunction()) {
- // Assume that intrinsics are always ABI compatible.
- if (NestedCallee->isIntrinsic())
- continue;
-
- // Do a precise compatibility check.
- if (!areTypesABICompatible(Caller, NestedCallee, Types))
- return false;
- } else {
- // We don't know the target features of the callee,
- // assume it is incompatible.
+ // Do a precise compatibility check.
+ if (!areTypesABICompatible(Caller, Callee, Types))
return false;
- }
}
}
return true;
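As a hedged illustration of the register-class split introduced above, the helper below combines the two TTI calls touched in this file; the helper name and the choice to derive the vector flag from the type are assumptions, only getRegisterClassForType and getNumberOfRegisters come from the interface being changed:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// How many registers does the target expose for values of this type? With the
// change above, scalar float/double now selects ScalarFPClass, whose count
// follows the XMM register file rather than the GPR count.
static unsigned registersAvailableFor(const TargetTransformInfo &TTI, Type *Ty) {
  bool IsVector = Ty->isVectorTy();
  unsigned ClassID = TTI.getRegisterClassForType(IsVector, Ty);
  return TTI.getNumberOfRegisters(ClassID);
}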
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index bc06c47..133b366 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -132,6 +132,7 @@ public:
/// @{
unsigned getNumberOfRegisters(unsigned ClassID) const override;
+ unsigned getRegisterClassForType(bool Vector, Type *Ty) const override;
bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override;
TypeSize
getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override;
@@ -194,8 +195,9 @@ public:
getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
const TTI::PointersChainInfo &Info, Type *AccessTy,
TTI::TargetCostKind CostKind) const override;
- InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
- const SCEV *Ptr) const override;
+ InstructionCost
+ getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const override;
std::optional<Instruction *>
instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;
diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
index e9081a4..ea8b88f 100644
--- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
+++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
@@ -190,6 +190,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
State = FunctionState::FinishedEpilog;
break;
+ case X86::LEA64r:
case X86::MOV64rr:
case X86::ADD64ri32:
if (State == FunctionState::InEpilog) {
@@ -201,51 +202,56 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
"The epilog is deallocating a stack "
"allocation, but the prolog did "
"not allocate one");
- if (HasStackDealloc)
+ if (PoppedRegCount > 0)
return rejectCurrentFunctionInternalError(
MF, Mode,
- "The epilog is deallocating the stack "
- "allocation more than once");
- if (PoppedRegCount > 0)
- llvm_unreachable(
- "Should have raised an error: either popping before "
- "deallocating or deallocating without an allocation");
+ "The epilog is deallocating a stack allocation after popping "
+ "registers");
HasStackDealloc = true;
} else if (State == FunctionState::FinishedEpilog)
return rejectCurrentFunctionInternalError(
- MF, Mode, "Unexpected mov or add instruction after the epilog");
+ MF, Mode,
+ "Unexpected lea, mov or add instruction after the epilog");
break;
case X86::POP64r:
if (State == FunctionState::InEpilog) {
- // After the stack pointer has been adjusted, the epilog must
- // POP each register in reverse order of the PUSHes in the prolog.
- PoppedRegCount++;
- if (HasStackAlloc != HasStackDealloc)
- return rejectCurrentFunctionInternalError(
- MF, Mode,
- "Cannot pop registers before the stack "
- "allocation has been deallocated");
- if (PoppedRegCount > PushedRegs.size())
- return rejectCurrentFunctionInternalError(
- MF, Mode,
- "The epilog is popping more registers than the prolog pushed");
- if (PushedRegs[PushedRegs.size() - PoppedRegCount] !=
- MI.getOperand(0).getReg())
- return rejectCurrentFunctionInternalError(
- MF, Mode,
- "The epilog is popping a registers in "
- "a different order than the "
- "prolog pushed them");
-
- // Unwind v2 records the size of the epilog not from where we place
- // SEH_BeginEpilogue (as that contains the instruction to adjust the
- // stack pointer) but from the first POP instruction (if there is
- // one).
- if (!UnwindV2StartLocation) {
- assert(PoppedRegCount == 1);
- UnwindV2StartLocation = &MI;
+ Register Reg = MI.getOperand(0).getReg();
+ if (HasStackAlloc && (PoppedRegCount == 0) &&
+ !llvm::is_contained(PushedRegs, Reg)) {
+ // If this is a pop that doesn't correspond to the set of pushed
+ // registers, then assume it was used to adjust the stack pointer.
+ HasStackDealloc = true;
+ } else {
+ // After the stack pointer has been adjusted, the epilog must
+ // POP each register in reverse order of the PUSHes in the prolog.
+ PoppedRegCount++;
+ if (HasStackAlloc != HasStackDealloc)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "Cannot pop registers before the stack "
+ "allocation has been deallocated");
+ if (PoppedRegCount > PushedRegs.size())
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "The epilog is popping more registers than the prolog "
+ "pushed");
+ if (PushedRegs[PushedRegs.size() - PoppedRegCount] != Reg.id())
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "The epilog is popping a registers in "
+ "a different order than the "
+ "prolog pushed them");
+
+ // Unwind v2 records the size of the epilog not from where we place
+ // SEH_BeginEpilogue (as that contains the instruction to adjust the
+ // stack pointer) but from the first POP instruction (if there is
+ // one).
+ if (!UnwindV2StartLocation) {
+ assert(PoppedRegCount == 1);
+ UnwindV2StartLocation = &MI;
+ }
}
} else if (State == FunctionState::FinishedEpilog)
// Unexpected instruction after the epilog.
diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/llvm/lib/Target/XCore/XCoreISelLowering.cpp
index ef4cfcd..0a96ab2 100644
--- a/llvm/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -429,11 +429,7 @@ SDValue XCoreTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// Lower to a call to __misaligned_load(BasePtr).
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(Context);
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
-
- Entry.Ty = IntPtrTy;
- Entry.Node = BasePtr;
- Args.push_back(Entry);
+ Args.emplace_back(BasePtr, IntPtrTy);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
@@ -480,14 +476,8 @@ SDValue XCoreTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// Lower to a call to __misaligned_store(BasePtr, Value).
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(Context);
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
-
- Entry.Ty = IntPtrTy;
- Entry.Node = BasePtr;
- Args.push_back(Entry);
-
- Entry.Node = Value;
- Args.push_back(Entry);
+ Args.emplace_back(BasePtr, IntPtrTy);
+ Args.emplace_back(Value, IntPtrTy);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain).setCallee(
diff --git a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index 1bd92a2..f61115e 100644
--- a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -33,11 +33,10 @@ SDValue XCoreSelectionDAGInfo::EmitTargetCodeForMemcpy(
DAG.MaskedValueIsZero(Size, APInt(SizeBitWidth, 3))) {
const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering();
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
- Entry.Node = Dst; Args.push_back(Entry);
- Entry.Node = Src; Args.push_back(Entry);
- Entry.Node = Size; Args.push_back(Entry);
+ Type *ArgTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ Args.emplace_back(Dst, ArgTy);
+ Args.emplace_back(Src, ArgTy);
+ Args.emplace_back(Size, ArgTy);
const char *MemcpyAlign4Name = TLI.getLibcallName(RTLIB::MEMCPY_ALIGN_4);
CallingConv::ID CC = TLI.getLibcallCallingConv(RTLIB::MEMCPY_ALIGN_4);
diff --git a/llvm/lib/Target/Xtensa/Xtensa.td b/llvm/lib/Target/Xtensa/Xtensa.td
index 2c4bacb..4ef885e1 100644
--- a/llvm/lib/Target/Xtensa/Xtensa.td
+++ b/llvm/lib/Target/Xtensa/Xtensa.td
@@ -23,10 +23,8 @@ include "XtensaFeatures.td"
//===----------------------------------------------------------------------===//
// Xtensa supported processors.
//===----------------------------------------------------------------------===//
-class Proc<string Name, list<SubtargetFeature> Features>
- : Processor<Name, NoItineraries, Features>;
-def : Proc<"generic", []>;
+include "XtensaProcessors.td"
//===----------------------------------------------------------------------===//
// Register File Description
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
index 6a07bd8..f136703 100644
--- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
@@ -353,7 +353,8 @@ static const MCPhysReg IntRegs[] = {Xtensa::A2, Xtensa::A3, Xtensa::A4,
static bool CC_Xtensa_Custom(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State) {
if (ArgFlags.isByVal()) {
Align ByValAlign = ArgFlags.getNonZeroByValAlign();
unsigned ByValSize = ArgFlags.getByValSize();
diff --git a/llvm/lib/Target/Xtensa/XtensaProcessors.td b/llvm/lib/Target/Xtensa/XtensaProcessors.td
new file mode 100644
index 0000000..0faf07d
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaProcessors.td
@@ -0,0 +1,25 @@
+//===- XtensaProcessors.td - Xtensa Processors -------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Xtensa supported processors.
+//===----------------------------------------------------------------------===//
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+
+def : Proc<"esp32", [FeatureDensity, FeatureSingleFloat, FeatureLoop, FeatureMAC16, FeatureWindowed, FeatureBoolean, FeatureSEXT,
+ FeatureNSA, FeatureMul16, FeatureMul32, FeatureMul32High, FeatureDFPAccel, FeatureS32C1I, FeatureTHREADPTR, FeatureDiv32,
+ FeatureDebug, FeatureException, FeatureHighPriInterrupts, FeatureHighPriInterruptsLevel7, FeatureCoprocessor,
+ FeatureInterrupt, FeatureDataCache, FeatureRelocatableVector, FeatureTimers3, FeaturePRID, FeatureRegionProtection, FeatureMiscSR,
+ FeatureMINMAX, FeatureCLAMPS]>;
+
+def : Proc<"esp8266", [FeatureDensity, FeatureNSA, FeatureMul16, FeatureMul32, FeatureExtendedL32R, FeatureDebug, FeatureException,
+ FeatureHighPriInterrupts, FeatureHighPriInterruptsLevel3, FeatureInterrupt, FeatureRelocatableVector, FeatureTimers1,
+ FeatureRegionProtection, FeaturePRID]>;
diff --git a/llvm/lib/TargetParser/CMakeLists.txt b/llvm/lib/TargetParser/CMakeLists.txt
index 8f8b3a5..62e97bf 100644
--- a/llvm/lib/TargetParser/CMakeLists.txt
+++ b/llvm/lib/TargetParser/CMakeLists.txt
@@ -27,6 +27,7 @@ add_llvm_component_library(LLVMTargetParser
TargetParser.cpp
Triple.cpp
X86TargetParser.cpp
+ XtensaTargetParser.cpp
ADDITIONAL_HEADER_DIRS
Unix
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 22192e1f..2482753 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -759,20 +759,20 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family,
StringRef CPU;
switch (Family) {
- case 3:
+ case 0x3:
CPU = "i386";
break;
- case 4:
+ case 0x4:
CPU = "i486";
break;
- case 5:
+ case 0x5:
if (testFeature(X86::FEATURE_MMX)) {
CPU = "pentium-mmx";
break;
}
CPU = "pentium";
break;
- case 6:
+ case 0x6:
switch (Model) {
case 0x0f: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile
// processor, Intel Core 2 Quad processor, Intel Core 2 Quad
@@ -1120,7 +1120,7 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family,
break;
}
break;
- case 15: {
+ case 0xf: {
if (testFeature(X86::FEATURE_64BIT)) {
CPU = "nocona";
break;
@@ -1132,7 +1132,7 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family,
CPU = "pentium4";
break;
}
- case 19:
+ case 0x13:
switch (Model) {
// Diamond Rapids:
case 0x01:
diff --git a/llvm/lib/TargetParser/XtensaTargetParser.cpp b/llvm/lib/TargetParser/XtensaTargetParser.cpp
new file mode 100644
index 0000000..25725f2
--- /dev/null
+++ b/llvm/lib/TargetParser/XtensaTargetParser.cpp
@@ -0,0 +1,93 @@
+//===-- XtensaTargetParser - Parser for Xtensa features ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a target parser to recognise Xtensa hardware features
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TargetParser/XtensaTargetParser.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+
+namespace llvm {
+
+namespace Xtensa {
+struct CPUInfo {
+ StringLiteral Name;
+ CPUKind Kind;
+ uint64_t Features;
+};
+
+struct FeatureName {
+ uint64_t ID;
+ const char *NameCStr;
+ size_t NameLength;
+
+ StringRef getName() const { return StringRef(NameCStr, NameLength); }
+};
+
+const FeatureName XtensaFeatureNames[] = {
+#define XTENSA_FEATURE(ID, NAME) {ID, "+" NAME, sizeof(NAME)},
+#include "llvm/TargetParser/XtensaTargetParser.def"
+};
+
+constexpr CPUInfo XtensaCPUInfo[] = {
+#define XTENSA_CPU(ENUM, NAME, FEATURES) {NAME, CK_##ENUM, FEATURES},
+#include "llvm/TargetParser/XtensaTargetParser.def"
+};
+
+StringRef getBaseName(StringRef CPU) {
+ return llvm::StringSwitch<StringRef>(CPU)
+#define XTENSA_CPU_ALIAS(NAME, ANAME) .Case(ANAME, NAME)
+#include "llvm/TargetParser/XtensaTargetParser.def"
+ .Default(CPU);
+}
+
+StringRef getAliasName(StringRef CPU) {
+ return llvm::StringSwitch<StringRef>(CPU)
+#define XTENSA_CPU_ALIAS(NAME, ANAME) .Case(NAME, ANAME)
+#include "llvm/TargetParser/XtensaTargetParser.def"
+ .Default(CPU);
+}
+
+CPUKind parseCPUKind(StringRef CPU) {
+ CPU = getBaseName(CPU);
+ return llvm::StringSwitch<CPUKind>(CPU)
+#define XTENSA_CPU(ENUM, NAME, FEATURES) .Case(NAME, CK_##ENUM)
+#include "llvm/TargetParser/XtensaTargetParser.def"
+ .Default(CK_INVALID);
+}
+
+// Get all features for the CPU
+void getCPUFeatures(StringRef CPU, std::vector<StringRef> &Features) {
+ CPU = getBaseName(CPU);
+ auto I = llvm::find_if(XtensaCPUInfo,
+ [&](const CPUInfo &CI) { return CI.Name == CPU; });
+ assert(I != std::end(XtensaCPUInfo) && "CPU not found!");
+ uint64_t Bits = I->Features;
+
+ for (const auto &F : XtensaFeatureNames) {
+ if ((Bits & F.ID) == F.ID)
+ Features.push_back(F.getName());
+ }
+}
+
+// Find all valid CPUs
+void fillValidCPUList(std::vector<StringRef> &Values) {
+ for (const auto &C : XtensaCPUInfo) {
+ if (C.Kind != CK_INVALID) {
+ Values.emplace_back(C.Name);
+ StringRef Name = getAliasName(C.Name);
+ if (Name != C.Name)
+ Values.emplace_back(Name);
+ }
+ }
+}
+
+} // namespace Xtensa
+} // namespace llvm
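A short usage sketch for the new parser follows; it assumes the CPUKind enum and declarations live in the XtensaTargetParser.h header referenced above (not shown in this diff), and the printing helper itself is hypothetical:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetParser/XtensaTargetParser.h"
#include <vector>

// Hypothetical helper: list the subtarget features implied by a CPU name.
static void printXtensaCPUFeatures(llvm::StringRef CPU) {
  if (llvm::Xtensa::parseCPUKind(CPU) == llvm::Xtensa::CK_INVALID) {
    llvm::errs() << "unknown Xtensa CPU: " << CPU << "\n";
    return;
  }
  std::vector<llvm::StringRef> Features;
  llvm::Xtensa::getCPUFeatures(CPU, Features);
  for (llvm::StringRef F : Features)
    llvm::outs() << F << "\n"; // each entry is a "+<feature>" string
}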
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 3320508..b775c43 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -1821,7 +1821,7 @@ static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
// only used outside the region.
if (Valid && Lifetimes.size() != 0) {
auto *NewLifetime = Lifetimes[0]->clone();
- NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), AI);
+ NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(0), AI);
NewLifetime->insertBefore(DomBB->getTerminator()->getIterator());
// All the outsided lifetime.start markers are no longer necessary.
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index ab906f9..180ac9c 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -2252,6 +2252,10 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C,
UR.CWorklist.insert(CurrentSCC);
for (Function *Clone : Clones)
UR.CWorklist.insert(CG.lookupSCC(CG.get(*Clone)));
+ } else if (Shape.ABI == coro::ABI::Async) {
+ // Reprocess the function to inline the tail called return function of
+ // coro.async.end.
+ UR.CWorklist.insert(&C);
}
}
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
index 4e71768..d5d60a3 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
@@ -264,11 +264,6 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
}
void visitIntrinsicInst(IntrinsicInst &II) {
- // When we found the lifetime markers refers to a
- // subrange of the original alloca, ignore the lifetime
- // markers to avoid misleading the analysis.
- if (!IsOffsetKnown || !Offset.isZero())
- return Base::visitIntrinsicInst(II);
switch (II.getIntrinsicID()) {
default:
return Base::visitIntrinsicInst(II);
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index da60f52..042578d 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -53,7 +53,6 @@
#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
@@ -226,13 +225,6 @@ public:
/*IsVarArgs=*/false);
}
- static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL,
- AllocaInst *Alloced) {
- std::optional<TypeSize> AllocaTypeSize = Alloced->getAllocationSize(DL);
- uint64_t AsInt = AllocaTypeSize ? AllocaTypeSize->getFixedValue() : 0;
- return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt);
- }
-
bool expansionApplicableToFunction(Module &M, Function *F) {
if (F->isIntrinsic() || !F->isVarArg() ||
F->hasFnAttribute(Attribute::Naked))
@@ -577,8 +569,7 @@ ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder,
AllocaInst *VaListInstance =
Builder.CreateAlloca(VaListTy, nullptr, "va_start");
- Builder.CreateLifetimeStart(VaListInstance,
- sizeOfAlloca(Ctx, DL, VaListInstance));
+ Builder.CreateLifetimeStart(VaListInstance);
Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)},
{VaListInstance});
@@ -595,8 +586,7 @@ ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder,
Builder.CreateIntrinsic(Intrinsic::vaend, {DL.getAllocaPtrType(Ctx)},
{VaListInstance});
- Builder.CreateLifetimeEnd(VaListInstance,
- sizeOfAlloca(Ctx, DL, VaListInstance));
+ Builder.CreateLifetimeEnd(VaListInstance);
if (Result->getType()->isVoidTy())
Builder.CreateRetVoid();
@@ -746,7 +736,7 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB,
// Initialize the fields in the struct
Builder.SetInsertPoint(CB);
- Builder.CreateLifetimeStart(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
+ Builder.CreateLifetimeStart(Alloced);
Frame.initializeStructAlloca(DL, Builder, Alloced);
const unsigned NumArgs = FuncType->getNumParams();
@@ -762,7 +752,7 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB,
Builder.SetCurrentDebugLocation(CB->getStableDebugLoc());
VaList = Builder.CreateAlloca(VaListTy, nullptr, "va_argument");
Builder.SetInsertPoint(CB);
- Builder.CreateLifetimeStart(VaList, sizeOfAlloca(Ctx, DL, VaList));
+ Builder.CreateLifetimeStart(VaList);
}
Builder.SetInsertPoint(CB);
Args.push_back(ABI->initializeVaList(M, Ctx, Builder, VaList, Alloced));
@@ -802,9 +792,9 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB,
}
if (VaList)
- Builder.CreateLifetimeEnd(VaList, sizeOfAlloca(Ctx, DL, VaList));
+ Builder.CreateLifetimeEnd(VaList);
- Builder.CreateLifetimeEnd(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
+ Builder.CreateLifetimeEnd(Alloced);
NewCB->setAttributes(PAL);
NewCB->takeName(CB);
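The ExpandVariadics changes above lean on the size-less lifetime builders this patch already calls. A minimal sketch of that pattern, assuming those CreateLifetimeStart/CreateLifetimeEnd overloads; the bracketAlloca helper is illustrative:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Bracket an alloca's live range. The size is implied by the alloca itself,
// which is why sizeOfAlloca could be deleted above.
static void bracketAlloca(IRBuilder<> &Builder, AllocaInst *Slot) {
  Builder.CreateLifetimeStart(Slot);
  // ... emit the code that uses Slot here ...
  Builder.CreateLifetimeEnd(Slot);
}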
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 8262c8c..44394f6 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -273,7 +273,7 @@ MemoryEffects llvm::computeFunctionBodyMemoryAccess(Function &F,
/// Deduce readonly/readnone/writeonly attributes for the SCC.
template <typename AARGetterT>
static void addMemoryAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter,
- SmallSet<Function *, 8> &Changed) {
+ SmallPtrSet<Function *, 8> &Changed) {
MemoryEffects ME = MemoryEffects::none();
MemoryEffects RecursiveArgME = MemoryEffects::none();
for (Function *F : SCCNodes) {
@@ -1002,7 +1002,7 @@ determinePointerAccessAttrs(Argument *A,
/// Deduce returned attributes for the SCC.
static void addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes,
- SmallSet<Function *, 8> &Changed) {
+ SmallPtrSet<Function *, 8> &Changed) {
// Check each function in turn, determining if an argument is always returned.
for (Function *F : SCCNodes) {
// We can infer and propagate function attributes only when we know that the
@@ -1238,7 +1238,7 @@ static bool inferInitializes(Argument &A, Function &F) {
/// Deduce nocapture attributes for the SCC.
static void addArgumentAttrs(const SCCNodeSet &SCCNodes,
- SmallSet<Function *, 8> &Changed,
+ SmallPtrSet<Function *, 8> &Changed,
bool SkipInitializes) {
ArgumentGraph AG;
@@ -1510,7 +1510,7 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
/// Deduce noalias attributes for the SCC.
static void addNoAliasAttrs(const SCCNodeSet &SCCNodes,
- SmallSet<Function *, 8> &Changed) {
+ SmallPtrSet<Function *, 8> &Changed) {
// Check each function in turn, determining which functions return noalias
// pointers.
for (Function *F : SCCNodes) {
@@ -1623,7 +1623,7 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
/// Deduce nonnull attributes for the SCC.
static void addNonNullAttrs(const SCCNodeSet &SCCNodes,
- SmallSet<Function *, 8> &Changed) {
+ SmallPtrSet<Function *, 8> &Changed) {
// Speculative that all functions in the SCC return only nonnull
// pointers. We may refute this as we analyze functions.
bool SCCReturnsNonNull = true;
@@ -1680,7 +1680,7 @@ static void addNonNullAttrs(const SCCNodeSet &SCCNodes,
/// Deduce noundef attributes for the SCC.
static void addNoUndefAttrs(const SCCNodeSet &SCCNodes,
- SmallSet<Function *, 8> &Changed) {
+ SmallPtrSet<Function *, 8> &Changed) {
// Check each function in turn, determining which functions return noundef
// values.
for (Function *F : SCCNodes) {
@@ -1788,13 +1788,13 @@ public:
InferenceDescriptors.push_back(AttrInference);
}
- void run(const SCCNodeSet &SCCNodes, SmallSet<Function *, 8> &Changed);
+ void run(const SCCNodeSet &SCCNodes, SmallPtrSet<Function *, 8> &Changed);
};
/// Perform all the requested attribute inference actions according to the
/// attribute predicates stored before.
void AttributeInferer::run(const SCCNodeSet &SCCNodes,
- SmallSet<Function *, 8> &Changed) {
+ SmallPtrSet<Function *, 8> &Changed) {
SmallVector<InferenceDescriptor, 4> InferInSCC = InferenceDescriptors;
// Go through all the functions in SCC and check corresponding attribute
// assumptions for each of them. Attributes that are invalid for this SCC
@@ -1969,7 +1969,7 @@ static bool InstrBreaksNoSync(Instruction &I, const SCCNodeSet &SCCNodes) {
///
/// Returns true if any changes to function attributes were made.
static void inferConvergent(const SCCNodeSet &SCCNodes,
- SmallSet<Function *, 8> &Changed) {
+ SmallPtrSet<Function *, 8> &Changed) {
AttributeInferer AI;
// Request to remove the convergent attribute from all functions in the SCC
@@ -2000,7 +2000,7 @@ static void inferConvergent(const SCCNodeSet &SCCNodes,
///
/// Returns true if any changes to function attributes were made.
static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes,
- SmallSet<Function *, 8> &Changed) {
+ SmallPtrSet<Function *, 8> &Changed) {
AttributeInferer AI;
if (!DisableNoUnwindInference)
@@ -2069,7 +2069,7 @@ static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes,
}
static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes,
- SmallSet<Function *, 8> &Changed) {
+ SmallPtrSet<Function *, 8> &Changed) {
// Try and identify functions that do not recurse.
// If the SCC contains multiple nodes we know for sure there is recursion.
@@ -2105,7 +2105,7 @@ static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes,
// Set the noreturn function attribute if possible.
static void addNoReturnAttrs(const SCCNodeSet &SCCNodes,
- SmallSet<Function *, 8> &Changed) {
+ SmallPtrSet<Function *, 8> &Changed) {
for (Function *F : SCCNodes) {
if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) ||
F->doesNotReturn())
@@ -2166,7 +2166,7 @@ static bool allPathsGoThroughCold(Function &F) {
// Set the cold function attribute if possible.
static void addColdAttrs(const SCCNodeSet &SCCNodes,
- SmallSet<Function *, 8> &Changed) {
+ SmallPtrSet<Function *, 8> &Changed) {
for (Function *F : SCCNodes) {
if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) ||
F->hasFnAttribute(Attribute::Cold) || F->hasFnAttribute(Attribute::Hot))
@@ -2213,7 +2213,7 @@ static bool functionWillReturn(const Function &F) {
// Set the willreturn function attribute if possible.
static void addWillReturn(const SCCNodeSet &SCCNodes,
- SmallSet<Function *, 8> &Changed) {
+ SmallPtrSet<Function *, 8> &Changed) {
for (Function *F : SCCNodes) {
if (!F || F->willReturn() || !functionWillReturn(*F))
continue;
@@ -2239,7 +2239,7 @@ static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) {
}
template <typename AARGetterT>
-static SmallSet<Function *, 8>
+static SmallPtrSet<Function *, 8>
deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter,
bool ArgAttrsOnly) {
SCCNodesResult Nodes = createSCCNodeSet(Functions);
@@ -2248,7 +2248,7 @@ deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter,
if (Nodes.SCCNodes.empty())
return {};
- SmallSet<Function *, 8> Changed;
+ SmallPtrSet<Function *, 8> Changed;
if (ArgAttrsOnly) {
// ArgAttrsOnly means to only infer attributes that may aid optimizations
// on the *current* function. "initializes" attribute is to aid
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 45fa9d5..9196a01 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -16,7 +16,6 @@
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/SCCPSolver.h"
@@ -400,12 +399,6 @@ Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) {
Constant *InstCostVisitor::visitCallBase(CallBase &I) {
assert(LastVisited != KnownConstants.end() && "Invalid iterator!");
- // Look through calls to ssa_copy intrinsics.
- if (auto *II = dyn_cast<IntrinsicInst>(&I);
- II && II->getIntrinsicID() == Intrinsic::ssa_copy) {
- return LastVisited->second;
- }
-
Function *F = I.getCalledFunction();
if (!F || !canConstantFoldCallTo(&I, F))
return nullptr;
@@ -611,17 +604,15 @@ void FunctionSpecializer::promoteConstantStackValues(Function *F) {
}
}
-// ssa_copy intrinsics are introduced by the SCCP solver. These intrinsics
-// interfere with the promoteConstantStackValues() optimization.
+// The SCCP solver inserts bitcasts for PredicateInfo. These interfere with the
+// promoteConstantStackValues() optimization.
static void removeSSACopy(Function &F) {
for (BasicBlock &BB : F) {
for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
- auto *II = dyn_cast<IntrinsicInst>(&Inst);
- if (!II)
- continue;
- if (II->getIntrinsicID() != Intrinsic::ssa_copy)
+ auto *BC = dyn_cast<BitCastInst>(&Inst);
+ if (!BC || BC->getType() != BC->getOperand(0)->getType())
continue;
- Inst.replaceAllUsesWith(II->getOperand(0));
+ Inst.replaceAllUsesWith(BC->getOperand(0));
Inst.eraseFromParent();
}
}
diff --git a/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/llvm/lib/Transforms/IPO/GlobalDCE.cpp
index 45fb1f5..c576fbc 100644
--- a/llvm/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -21,6 +21,8 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CtorUtils.h"
@@ -30,6 +32,35 @@ using namespace llvm;
#define DEBUG_TYPE "globaldce"
+namespace {
+class GlobalDCELegacyPass : public ModulePass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ GlobalDCELegacyPass() : ModulePass(ID) {
+ initializeGlobalDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ // Note: GlobalDCEPass does not use any analyses, so it is safe to call the
+ // new-PM pass with a default-initialized analysis manager here.
+ ModuleAnalysisManager MAM;
+ auto PA = Impl.run(M, MAM);
+ return !PA.areAllPreserved();
+ }
+
+private:
+ GlobalDCEPass Impl;
+};
+} // namespace
+
+char GlobalDCELegacyPass::ID = 0;
+INITIALIZE_PASS(GlobalDCELegacyPass, "globaldce", "Dead Global Elimination",
+ false, false)
+
+// Public interface to the GlobalDCEPass.
+ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCELegacyPass(); }
+
static cl::opt<bool>
ClEnableVFE("enable-vfe", cl::Hidden, cl::init(true),
cl::desc("Enable virtual function elimination"));
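For context, a hedged sketch of how the restored legacy wrapper would typically be driven; it assumes createGlobalDCEPass is declared in llvm/Transforms/IPO.h, as the existing include above suggests:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"

// Run dead-global elimination through the legacy pass manager; the wrapper
// forwards to the new-PM GlobalDCEPass as shown above.
static bool runLegacyGlobalDCE(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createGlobalDCEPass());
  return PM.run(M); // true if the module was modified
}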
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index bdda498..d7edd12 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1133,9 +1133,6 @@ static bool
optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
const DataLayout &DL,
function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- // Ignore no-op GEPs and bitcasts.
- StoredOnceVal = StoredOnceVal->stripPointerCasts();
-
// If we are dealing with a pointer global that is initialized to null and
// only has one (non-null) value stored into it, then we can optimize any
// users of the loaded value (often calls and loads) that would trap if the
diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
index 6554377..88f5ca0 100644
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -459,6 +459,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
}),
Calls.end());
+ // Report inlining decision BEFORE deleting function contents, so we
+ // can still access e.g. the DebugLoc.
+ Advice->recordInliningWithCalleeDeleted();
// Clear the body and queue the function itself for call graph
// updating when we finish inlining.
makeFunctionBodyUnreachable(Callee);
@@ -470,9 +473,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
DeadFunctionsInComdats.push_back(&Callee);
}
}
- if (CalleeWasDeleted)
- Advice->recordInliningWithCalleeDeleted();
- else
+ if (!CalleeWasDeleted)
Advice->recordInlining();
}
diff --git a/llvm/lib/Transforms/IPO/ModuleInliner.cpp b/llvm/lib/Transforms/IPO/ModuleInliner.cpp
index 844e275..1185e63 100644
--- a/llvm/lib/Transforms/IPO/ModuleInliner.cpp
+++ b/llvm/lib/Transforms/IPO/ModuleInliner.cpp
@@ -284,6 +284,10 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M,
Calls->erase_if([&](const std::pair<CallBase *, int> &Call) {
return Call.first->getCaller() == &Callee;
});
+
+ // Report inlining decision BEFORE deleting function contents, so we
+ // can still access e.g. the DebugLoc.
+ Advice->recordInliningWithCalleeDeleted();
// Clear the body and queue the function itself for deletion when we
// finish inlining.
// Note that after this point, it is an error to do anything other
@@ -295,9 +299,7 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M,
CalleeWasDeleted = true;
}
}
- if (CalleeWasDeleted)
- Advice->recordInliningWithCalleeDeleted();
- else
+ if (!CalleeWasDeleted)
Advice->recordInlining();
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index d7971e8..6e46898 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3740,6 +3740,82 @@ static Instruction *foldIntegerPackFromVector(Instruction &I,
return CastInst::Create(Instruction::BitCast, MaskedVec, I.getType());
}
+/// Match \p V as "lshr -> mask -> zext -> shl".
+///
+/// \p Int is the underlying integer that the bits are extracted from.
+/// \p Mask is a bitmask identifying which bits of that integer are being
+/// extracted. \p Offset identifies which bit of the result \p V corresponds to
+/// the least significant bit of \p Int.
+static bool matchZExtedSubInteger(Value *V, Value *&Int, APInt &Mask,
+ uint64_t &Offset, bool &IsShlNUW,
+ bool &IsShlNSW) {
+ Value *ShlOp0;
+ uint64_t ShlAmt = 0;
+ if (!match(V, m_OneUse(m_Shl(m_Value(ShlOp0), m_ConstantInt(ShlAmt)))))
+ return false;
+
+ IsShlNUW = cast<BinaryOperator>(V)->hasNoUnsignedWrap();
+ IsShlNSW = cast<BinaryOperator>(V)->hasNoSignedWrap();
+
+ Value *ZExtOp0;
+ if (!match(ShlOp0, m_OneUse(m_ZExt(m_Value(ZExtOp0)))))
+ return false;
+
+ Value *MaskedOp0;
+ const APInt *ShiftedMaskConst = nullptr;
+ if (!match(ZExtOp0, m_CombineOr(m_OneUse(m_And(m_Value(MaskedOp0),
+ m_APInt(ShiftedMaskConst))),
+ m_Value(MaskedOp0))))
+ return false;
+
+ uint64_t LShrAmt = 0;
+ if (!match(MaskedOp0,
+ m_CombineOr(m_OneUse(m_LShr(m_Value(Int), m_ConstantInt(LShrAmt))),
+ m_Value(Int))))
+ return false;
+
+ if (LShrAmt > ShlAmt)
+ return false;
+ Offset = ShlAmt - LShrAmt;
+
+ Mask = ShiftedMaskConst ? ShiftedMaskConst->shl(LShrAmt)
+ : APInt::getBitsSetFrom(
+ Int->getType()->getScalarSizeInBits(), LShrAmt);
+
+ return true;
+}
+
+/// Try to fold the join of two scalar integers whose bits are unpacked and
+/// zexted from the same source integer.
+static Value *foldIntegerRepackThroughZExt(Value *Lhs, Value *Rhs,
+ InstCombiner::BuilderTy &Builder) {
+
+ Value *LhsInt, *RhsInt;
+ APInt LhsMask, RhsMask;
+ uint64_t LhsOffset, RhsOffset;
+ bool IsLhsShlNUW, IsLhsShlNSW, IsRhsShlNUW, IsRhsShlNSW;
+ if (!matchZExtedSubInteger(Lhs, LhsInt, LhsMask, LhsOffset, IsLhsShlNUW,
+ IsLhsShlNSW))
+ return nullptr;
+ if (!matchZExtedSubInteger(Rhs, RhsInt, RhsMask, RhsOffset, IsRhsShlNUW,
+ IsRhsShlNSW))
+ return nullptr;
+ if (LhsInt != RhsInt || LhsOffset != RhsOffset)
+ return nullptr;
+
+ APInt Mask = LhsMask | RhsMask;
+
+ Type *DestTy = Lhs->getType();
+ Value *Res = Builder.CreateShl(
+ Builder.CreateZExt(
+ Builder.CreateAnd(LhsInt, Mask, LhsInt->getName() + ".mask"), DestTy,
+ LhsInt->getName() + ".zext"),
+ ConstantInt::get(DestTy, LhsOffset), "", IsLhsShlNUW && IsRhsShlNUW,
+ IsLhsShlNSW && IsRhsShlNSW);
+ Res->takeName(Lhs);
+ return Res;
+}
+
// A decomposition of ((X & Mask) * Factor). The NUW / NSW bools
// track these properities for preservation. Note that we can decompose
// equivalent select form of this expression (e.g. (!(X & Mask) ? 0 : Mask *
@@ -3841,6 +3917,8 @@ static Value *foldBitmaskMul(Value *Op0, Value *Op1,
Value *InstCombinerImpl::foldDisjointOr(Value *LHS, Value *RHS) {
if (Value *Res = foldBitmaskMul(LHS, RHS, Builder))
return Res;
+ if (Value *Res = foldIntegerRepackThroughZExt(LHS, RHS, Builder))
+ return Res;
return nullptr;
}
@@ -3973,7 +4051,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
/*NSW=*/true, /*NUW=*/true))
return R;
- if (Value *Res = foldBitmaskMul(I.getOperand(0), I.getOperand(1), Builder))
+ if (Value *Res = foldDisjointOr(I.getOperand(0), I.getOperand(1)))
return replaceInstUsesWith(I, Res);
if (Value *Res = reassociateDisjointOr(I.getOperand(0), I.getOperand(1)))
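To see what foldIntegerRepackThroughZExt buys, here is a hedged scalar model of the transform in plain C++ arithmetic; the 16-to-32-bit widths and the 8-bit field split are arbitrary, and the nuw/nsw bookkeeping from the patch is omitted:

#include <cassert>
#include <cstdint>

// Two fields of the same 16-bit source, each extracted as lshr -> mask ->
// zext -> shl. Both end up with the same net offset (shl minus lshr = 8).
static uint32_t repackSeparately(uint16_t X) {
  uint32_t Lo = (uint32_t)(X & 0x00ffu) << 8;         // lshr 0, mask 0x00ff, shl 8
  uint32_t Hi = (uint32_t)((X >> 8) & 0x00ffu) << 16; // lshr 8, mask 0x00ff, shl 16
  return Lo | Hi;                                     // disjoint or
}

// The folded form: one combined mask on the source, one zext, one shl by the
// shared offset.
static uint32_t repackFolded(uint16_t X) {
  return (uint32_t)(X & 0xffffu) << 8;
}

int main() {
  for (uint32_t V = 0; V <= 0xffff; ++V)
    assert(repackSeparately((uint16_t)V) == repackFolded((uint16_t)V));
  return 0;
}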
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 47e017e..2433534 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -267,12 +267,10 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) {
MI->getContext(), APInt::getSplat(Len * 8, FillC->getValue()));
StoreInst *S = Builder.CreateStore(FillVal, Dest, MI->isVolatile());
S->copyMetadata(*MI, LLVMContext::MD_DIAssignID);
- auto replaceOpForAssignmentMarkers = [FillC, FillVal](auto *DbgAssign) {
+ for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(S)) {
if (llvm::is_contained(DbgAssign->location_ops(), FillC))
DbgAssign->replaceVariableLocationOp(FillC, FillVal);
- };
- for_each(at::getAssignmentMarkers(S), replaceOpForAssignmentMarkers);
- for_each(at::getDVRAssignmentMarkers(S), replaceOpForAssignmentMarkers);
+ }
S->setAlignment(Alignment);
if (MI->isAtomic())
@@ -1532,6 +1530,51 @@ static Instruction *foldBitOrderCrossLogicOp(Value *V,
return nullptr;
}
+/// Helper to match idempotent binary intrinsics, namely, intrinsics where
+/// `f(f(x, y), y) == f(x, y)` holds.
+static bool isIdempotentBinaryIntrinsic(Intrinsic::ID IID) {
+ switch (IID) {
+ case Intrinsic::smax:
+ case Intrinsic::smin:
+ case Intrinsic::umax:
+ case Intrinsic::umin:
+ case Intrinsic::maximum:
+ case Intrinsic::minimum:
+ case Intrinsic::maximumnum:
+ case Intrinsic::minimumnum:
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// Attempt to simplify value-accumulating recurrences of kind:
+/// %umax.acc = phi i8 [ %umax, %backedge ], [ %a, %entry ]
+/// %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+/// And let the idempotent binary intrinsic be hoisted, when the operands are
+/// known to be loop-invariant.
+static Value *foldIdempotentBinaryIntrinsicRecurrence(InstCombinerImpl &IC,
+ IntrinsicInst *II) {
+ PHINode *PN;
+ Value *Init, *OtherOp;
+
+ // A binary intrinsic recurrence with loop-invariant operands is equivalent to
+ // `call @llvm.binary.intrinsic(Init, OtherOp)`.
+ auto IID = II->getIntrinsicID();
+ if (!isIdempotentBinaryIntrinsic(IID) ||
+ !matchSimpleBinaryIntrinsicRecurrence(II, PN, Init, OtherOp) ||
+ !IC.getDominatorTree().dominates(OtherOp, PN))
+ return nullptr;
+
+ auto *InvariantBinaryInst =
+ IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp);
+ if (isa<FPMathOperator>(InvariantBinaryInst))
+ cast<Instruction>(InvariantBinaryInst)->copyFastMathFlags(II);
+ return InvariantBinaryInst;
+}
+
static Value *simplifyReductionOperand(Value *Arg, bool CanReorderLanes) {
if (!CanReorderLanes)
return nullptr;
@@ -3912,6 +3955,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
if (Value *Reverse = foldReversedIntrinsicOperands(II))
return replaceInstUsesWith(*II, Reverse);
+ if (Value *Res = foldIdempotentBinaryIntrinsicRecurrence(*this, II))
+ return replaceInstUsesWith(*II, Res);
+
// Some intrinsics (like experimental_gc_statepoint) can be used in invoke
// context, so it is handled in visitCallBase and we should trigger it.
return visitCallBase(*II);
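The recurrence fold above is easiest to sanity-check with a scalar model; the sketch below mirrors the umax example from the comment, using std::max as a stand-in for @llvm.umax, with arbitrary trip counts and 8-bit values:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Accumulate umax of a loop-invariant B, starting from Init, over Trips >= 1
// iterations, as in the %umax.acc / %umax recurrence shown in the comment.
static uint8_t umaxAccumulated(uint8_t Init, uint8_t B, unsigned Trips) {
  uint8_t Acc = Init;
  for (unsigned I = 0; I < Trips; ++I)
    Acc = std::max(Acc, B);
  return Acc;
}

int main() {
  // Because umax is idempotent, i.e. f(f(x, y), y) == f(x, y), the in-loop
  // value is always umax(Init, B) once the loop has run at least once.
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      for (unsigned Trips = 1; Trips <= 3; ++Trips)
        assert(umaxAccumulated((uint8_t)A, (uint8_t)B, Trips) ==
               std::max((uint8_t)A, (uint8_t)B));
  return 0;
}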
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index a43a6ee..801ac00 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1131,11 +1131,10 @@ static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
case Instruction::Shl: {
// We can promote shl(x, cst) if we can promote x. Since shl overwrites the
// upper bits we can reduce BitsToClear by the shift amount.
- const APInt *Amt;
- if (match(I->getOperand(1), m_APInt(Amt))) {
+ uint64_t ShiftAmt;
+ if (match(I->getOperand(1), m_ConstantInt(ShiftAmt))) {
if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
return false;
- uint64_t ShiftAmt = Amt->getZExtValue();
BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0;
return true;
}
@@ -1144,11 +1143,11 @@ static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
case Instruction::LShr: {
// We can promote lshr(x, cst) if we can promote x. This requires the
// ultimate 'and' to clear out the high zero bits we're clearing out though.
- const APInt *Amt;
- if (match(I->getOperand(1), m_APInt(Amt))) {
+ uint64_t ShiftAmt;
+ if (match(I->getOperand(1), m_ConstantInt(ShiftAmt))) {
if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
return false;
- BitsToClear += Amt->getZExtValue();
+ BitsToClear += ShiftAmt;
if (BitsToClear > V->getType()->getScalarSizeInBits())
BitsToClear = V->getType()->getScalarSizeInBits();
return true;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index cf94d28..2386e7a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1320,6 +1320,35 @@ Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) {
return nullptr;
}
+/// Fold icmp eq (num + mask) & ~mask, num
+/// to
+/// icmp eq (and num, mask), 0
+/// where mask is a low-bit mask.
+Instruction *InstCombinerImpl::foldIsMultipleOfAPowerOfTwo(ICmpInst &Cmp) {
+ Value *Num;
+ CmpPredicate Pred;
+ const APInt *Mask, *Neg;
+
+ if (!match(&Cmp,
+ m_c_ICmp(Pred, m_Value(Num),
+ m_OneUse(m_c_And(m_OneUse(m_c_Add(m_Deferred(Num),
+ m_LowBitMask(Mask))),
+ m_APInt(Neg))))))
+ return nullptr;
+
+ if (*Neg != ~*Mask)
+ return nullptr;
+
+ if (!ICmpInst::isEquality(Pred))
+ return nullptr;
+
+ // Create new icmp eq (num & mask), 0
+ auto *NewAnd = Builder.CreateAnd(Num, *Mask);
+ auto *Zero = Constant::getNullValue(Num->getType());
+
+ return new ICmpInst(Pred, NewAnd, Zero);
+}
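+// As a sketch, with mask = 7 (so ~mask = -8) this rewrites
+//   %t = add i32 %num, 7
+//   %m = and i32 %t, -8
+//   %c = icmp eq i32 %m, %num
+// into
+//   %a = and i32 %num, 7
+//   %c = icmp eq i32 %a, 0
+// i.e. a direct check that %num is a multiple of 8.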
+
/// Fold icmp Pred X, C.
/// TODO: This code structure does not make sense. The saturating add fold
/// should be moved to some other helper and extended as noted below (it is also
@@ -1521,11 +1550,11 @@ Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp,
// trunc iN (ShOp >> ShAmtC) to i[N - ShAmtC] < 0 --> ShOp < 0
// trunc iN (ShOp >> ShAmtC) to i[N - ShAmtC] > -1 --> ShOp > -1
Value *ShOp;
- const APInt *ShAmtC;
+ uint64_t ShAmt;
bool TrueIfSigned;
if (isSignBitCheck(Pred, C, TrueIfSigned) &&
- match(X, m_Shr(m_Value(ShOp), m_APInt(ShAmtC))) &&
- DstBits == SrcBits - ShAmtC->getZExtValue()) {
+ match(X, m_Shr(m_Value(ShOp), m_ConstantInt(ShAmt))) &&
+ DstBits == SrcBits - ShAmt) {
return TrueIfSigned ? new ICmpInst(ICmpInst::ICMP_SLT, ShOp,
ConstantInt::getNullValue(SrcTy))
: new ICmpInst(ICmpInst::ICMP_SGT, ShOp,
@@ -7644,6 +7673,9 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
if (Instruction *Res = foldICmpUsingKnownBits(I))
return Res;
+ if (Instruction *Res = foldIsMultipleOfAPowerOfTwo(I))
+ return Res;
+
// Test if the ICmpInst instruction is used exclusively by a select as
// part of a minimum or maximum operation. If so, refrain from doing
// any other folding. This helps out other analyses which understand
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index c67e27e..2340028 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -721,6 +721,7 @@ public:
Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp);
Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp);
Instruction *foldICmpWithConstant(ICmpInst &Cmp);
+ Instruction *foldIsMultipleOfAPowerOfTwo(ICmpInst &Cmp);
Instruction *foldICmpUsingBoolRange(ICmpInst &I);
Instruction *foldICmpInstWithConstant(ICmpInst &Cmp);
Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 0be1034..4b10586 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -737,6 +737,8 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U),
".unpack");
NewLoad->setAAMetadata(LI.getAAMetadata());
+ // Copy invariant metadata from parent load.
+ NewLoad->copyMetadata(LI, LLVMContext::MD_invariant_load);
return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
PoisonValue::get(T), NewLoad, 0, Name));
}
@@ -764,6 +766,8 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
Name + ".unpack");
// Propagate AA metadata. It'll still be valid on the narrowed load.
L->setAAMetadata(LI.getAAMetadata());
+ // Copy invariant metadata from parent load.
+ L->copyMetadata(LI, LLVMContext::MD_invariant_load);
V = IC.Builder.CreateInsertValue(V, L, i);
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index fe0f308..b17cf17 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -3042,7 +3042,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
Value *V = LHS;
unsigned MaskElems = Mask.size();
auto *SrcTy = cast<FixedVectorType>(V->getType());
- unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedValue();
+ unsigned VecBitWidth = DL.getTypeSizeInBits(SrcTy);
unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
assert(SrcElemBitWidth && "vector elements must have a bitwidth");
unsigned SrcNumElems = SrcTy->getNumElements();
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 8da65c5..50258af 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1211,23 +1211,19 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
return;
if (!II.isLifetimeStartOrEnd())
return;
- // Found lifetime intrinsic, add ASan instrumentation if necessary.
- auto *Size = cast<ConstantInt>(II.getArgOperand(0));
- // If size argument is undefined, don't do anything.
- if (Size->isMinusOne()) return;
- // Check that size doesn't saturate uint64_t and can
- // be stored in IntptrTy.
- const uint64_t SizeValue = Size->getValue().getLimitedValue();
- if (SizeValue == ~0ULL ||
- !ConstantInt::isValueValidForType(IntptrTy, SizeValue))
- return;
// Find alloca instruction that corresponds to llvm.lifetime argument.
- AllocaInst *AI = dyn_cast<AllocaInst>(II.getArgOperand(1));
+ AllocaInst *AI = dyn_cast<AllocaInst>(II.getArgOperand(0));
// We're interested only in allocas we can handle.
if (!AI || !ASan.isInterestingAlloca(*AI))
return;
+
+ std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout());
+ // Check that size is known and can be stored in IntptrTy.
+ if (!Size || !ConstantInt::isValueValidForType(IntptrTy, *Size))
+ return;
+
bool DoPoison = (ID == Intrinsic::lifetime_end);
- AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
+ AllocaPoisonCall APC = {&II, AI, *Size, DoPoison};
if (AI->isStaticAlloca())
StaticAllocaPoisonCallVec.push_back(APC);
else if (ClInstrumentDynamicAllocas)
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index bcb90d6..fc34d14 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1469,22 +1469,6 @@ void HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
size_t Size = memtag::getAllocaSizeInBytes(*AI);
size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
- auto HandleLifetime = [&](IntrinsicInst *II) {
- // Set the lifetime intrinsic to cover the whole alloca. This reduces the
- // set of assumptions we need to make about the lifetime. Without this we
- // would need to ensure that we can track the lifetime pointer to a
- // constant offset from the alloca, and would still need to change the
- // size to include the extra alignment we use for the untagging to make
- // the size consistent.
- //
- // The check for standard lifetime below makes sure that we have exactly
- // one set of start / end in any execution (i.e. the ends are not
- // reachable from each other), so this will not cause any problems.
- II->setArgOperand(0, ConstantInt::get(Int64Ty, AlignedSize));
- };
- llvm::for_each(Info.LifetimeStart, HandleLifetime);
- llvm::for_each(Info.LifetimeEnd, HandleLifetime);
-
AI->replaceUsesWithIf(Replacement, [AILong](const Use &U) {
auto *User = U.getUser();
return User != AILong && !isa<LifetimeIntrinsic>(User);
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 7d3c940..948e2c6 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2690,6 +2690,54 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
SC.Done(&I);
}
+ // Perform a bitwise OR on the horizontal pairs (or other specified grouping)
+ // of elements.
+ //
+ // For example, suppose we have:
+ // VectorA: <a1, a2, a3, a4, a5, a6>
+ // VectorB: <b1, b2, b3, b4, b5, b6>
+ // ReductionFactor: 3.
+ // The output would be:
+ // <a1|a2|a3, a4|a5|a6, b1|b2|b3, b4|b5|b6>
+ //
+ // This is convenient for instrumenting horizontal add/sub.
+ // For bitwise OR on "vertical" pairs, see maybeHandleSimpleNomemIntrinsic().
+ Value *horizontalReduce(IntrinsicInst &I, unsigned ReductionFactor,
+ Value *VectorA, Value *VectorB) {
+ assert(isa<FixedVectorType>(VectorA->getType()));
+ unsigned TotalNumElems =
+ cast<FixedVectorType>(VectorA->getType())->getNumElements();
+
+ if (VectorB) {
+ assert(VectorA->getType() == VectorB->getType());
+ TotalNumElems = TotalNumElems * 2;
+ }
+
+ assert(TotalNumElems % ReductionFactor == 0);
+
+ Value *Or = nullptr;
+
+ IRBuilder<> IRB(&I);
+ for (unsigned i = 0; i < ReductionFactor; i++) {
+ SmallVector<int, 16> Mask;
+ for (unsigned X = 0; X < TotalNumElems; X += ReductionFactor)
+ Mask.push_back(X + i);
+
+ Value *Masked;
+ if (VectorB)
+ Masked = IRB.CreateShuffleVector(VectorA, VectorB, Mask);
+ else
+ Masked = IRB.CreateShuffleVector(VectorA, Mask);
+
+ if (Or)
+ Or = IRB.CreateOr(Or, Masked);
+ else
+ Or = Masked;
+ }
+
+ return Or;
+ }
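+ // horizontalReduce is used below with ReductionFactor 2 by the pairwise
+ // ("horizontal") add/sub shadow handlers: each output shadow element is the
+ // OR of the two adjacent input shadow elements that feed the corresponding
+ // result element.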
+
/// Propagate shadow for 1- or 2-vector intrinsics that combine adjacent
/// fields.
///
@@ -2701,7 +2749,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
assert(I.getType()->isVectorTy());
assert(I.getArgOperand(0)->getType()->isVectorTy());
- FixedVectorType *ParamType =
+ [[maybe_unused]] FixedVectorType *ParamType =
cast<FixedVectorType>(I.getArgOperand(0)->getType());
assert((I.arg_size() != 2) ||
(ParamType == cast<FixedVectorType>(I.getArgOperand(1)->getType())));
@@ -2711,31 +2759,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
2 * ReturnType->getNumElements());
IRBuilder<> IRB(&I);
- unsigned Width = ParamType->getNumElements() * I.arg_size();
// Horizontal OR of shadow
- SmallVector<int, 8> EvenMask;
- SmallVector<int, 8> OddMask;
- for (unsigned X = 0; X < Width; X += 2) {
- EvenMask.push_back(X);
- OddMask.push_back(X + 1);
- }
-
Value *FirstArgShadow = getShadow(&I, 0);
- Value *EvenShadow;
- Value *OddShadow;
- if (I.arg_size() == 2) {
- Value *SecondArgShadow = getShadow(&I, 1);
- EvenShadow =
- IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, EvenMask);
- OddShadow =
- IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, OddMask);
- } else {
- EvenShadow = IRB.CreateShuffleVector(FirstArgShadow, EvenMask);
- OddShadow = IRB.CreateShuffleVector(FirstArgShadow, OddMask);
- }
+ Value *SecondArgShadow = nullptr;
+ if (I.arg_size() == 2)
+ SecondArgShadow = getShadow(&I, 1);
+
+ Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, FirstArgShadow,
+ SecondArgShadow);
- Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow);
OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
setShadow(&I, OrShadow);
@@ -2768,23 +2801,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
- unsigned TotalNumElems = ParamType->getNumElements() * I.arg_size();
FixedVectorType *ReinterpretShadowTy = nullptr;
assert(isAligned(Align(ReinterpretElemWidth),
ParamType->getPrimitiveSizeInBits()));
ReinterpretShadowTy = FixedVectorType::get(
IRB.getIntNTy(ReinterpretElemWidth),
ParamType->getPrimitiveSizeInBits() / ReinterpretElemWidth);
- TotalNumElems = ReinterpretShadowTy->getNumElements() * I.arg_size();
// Horizontal OR of shadow
- SmallVector<int, 8> EvenMask;
- SmallVector<int, 8> OddMask;
- for (unsigned X = 0; X < TotalNumElems - 1; X += 2) {
- EvenMask.push_back(X);
- OddMask.push_back(X + 1);
- }
-
Value *FirstArgShadow = getShadow(&I, 0);
FirstArgShadow = IRB.CreateBitCast(FirstArgShadow, ReinterpretShadowTy);
@@ -2796,22 +2820,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Align(2),
cast<FixedVectorType>(FirstArgShadow->getType())->getNumElements()));
- Value *EvenShadow;
- Value *OddShadow;
+ Value *SecondArgShadow = nullptr;
if (I.arg_size() == 2) {
- Value *SecondArgShadow = getShadow(&I, 1);
+ SecondArgShadow = getShadow(&I, 1);
SecondArgShadow = IRB.CreateBitCast(SecondArgShadow, ReinterpretShadowTy);
-
- EvenShadow =
- IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, EvenMask);
- OddShadow =
- IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, OddMask);
- } else {
- EvenShadow = IRB.CreateShuffleVector(FirstArgShadow, EvenMask);
- OddShadow = IRB.CreateShuffleVector(FirstArgShadow, OddMask);
}
- Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow);
+ Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, FirstArgShadow,
+ SecondArgShadow);
+
OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
setShadow(&I, OrShadow);
@@ -3219,7 +3236,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// Caller guarantees that this intrinsic does not access memory.
///
/// TODO: "horizontal"/"pairwise" intrinsics are often incorrectly matched by
- /// by this handler.
+ /// this handler. See horizontalReduce().
+ ///
+ /// TODO: permutation intrinsics are also often incorrectly matched.
[[maybe_unused]] bool
maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I,
unsigned int trailingFlags) {
@@ -3301,7 +3320,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void handleLifetimeStart(IntrinsicInst &I) {
if (!PoisonStack)
return;
- AllocaInst *AI = dyn_cast<AllocaInst>(I.getArgOperand(1));
+ AllocaInst *AI = dyn_cast<AllocaInst>(I.getArgOperand(0));
if (AI)
LifetimeStartList.push_back(std::make_pair(&I, AI));
}
@@ -3624,9 +3643,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // Get an MMX-sized vector type.
- Type *getMMXVectorTy(unsigned EltSizeInBits) {
- const unsigned X86_MMXSizeInBits = 64;
+ // Get an MMX-sized (64-bit) vector type or, optionally, a vector of another
+ // total size.
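+ // For example, getMMXVectorTy(16) yields <4 x i16>, while
+ // getMMXVectorTy(8, 128) yields <16 x i8>.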
+ Type *getMMXVectorTy(unsigned EltSizeInBits,
+ unsigned X86_MMXSizeInBits = 64) {
assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
"Illegal MMX vector element size");
return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
@@ -3826,20 +3846,133 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // Instrument multiply-add intrinsic.
- void handleVectorPmaddIntrinsic(IntrinsicInst &I,
- unsigned MMXEltSizeInBits = 0) {
- Type *ResTy =
- MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType();
+ // Instrument multiply-add(-accumulate)? intrinsics.
+ //
+ // e.g., Two operands:
+ // <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
+ //
+ // Two operands which require an EltSizeInBits override:
+ // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
+ //
+ // Three operands:
+ // <4 x i32> @llvm.x86.avx512.vpdpbusd.128
+ // (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
+ // (this is equivalent to multiply-add on %a and %b, followed by
+ // adding/"accumulating" %s. "Accumulation" stores the result in one
+ // of the source registers, but this accumulate vs. add distinction
+ // is lost when dealing with LLVM intrinsics.)
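+ //
+ // With ReductionFactor 2 (e.g. pmadd.wd), each result element is
+ //   r[i] = a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1]
+ // and with ReductionFactor 4 (the VNNI byte forms), four adjacent products
+ // are summed per result element. For the three-operand forms, the
+ // accumulator operand is then added on top.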
+ void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
+ unsigned EltSizeInBits = 0) {
IRBuilder<> IRB(&I);
- auto *Shadow0 = getShadow(&I, 0);
- auto *Shadow1 = getShadow(&I, 1);
- Value *S = IRB.CreateOr(Shadow0, Shadow1);
- S = IRB.CreateBitCast(S, ResTy);
- S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
- ResTy);
- S = IRB.CreateBitCast(S, getShadowTy(&I));
- setShadow(&I, S);
+
+ [[maybe_unused]] FixedVectorType *ReturnType =
+ cast<FixedVectorType>(I.getType());
+ assert(isa<FixedVectorType>(ReturnType));
+
+ // Vectors A and B, and shadows
+ Value *Va = nullptr;
+ Value *Vb = nullptr;
+ Value *Sa = nullptr;
+ Value *Sb = nullptr;
+
+ assert(I.arg_size() == 2 || I.arg_size() == 3);
+ if (I.arg_size() == 2) {
+ Va = I.getOperand(0);
+ Vb = I.getOperand(1);
+
+ Sa = getShadow(&I, 0);
+ Sb = getShadow(&I, 1);
+ } else if (I.arg_size() == 3) {
+ // Operand 0 is the accumulator. We will deal with that below.
+ Va = I.getOperand(1);
+ Vb = I.getOperand(2);
+
+ Sa = getShadow(&I, 1);
+ Sb = getShadow(&I, 2);
+ }
+
+ FixedVectorType *ParamType = cast<FixedVectorType>(Va->getType());
+ assert(ParamType == Vb->getType());
+
+ assert(ParamType->getPrimitiveSizeInBits() ==
+ ReturnType->getPrimitiveSizeInBits());
+
+ if (I.arg_size() == 3) {
+ assert(ParamType == ReturnType);
+ assert(ParamType == I.getArgOperand(0)->getType());
+ }
+
+ FixedVectorType *ImplicitReturnType = ReturnType;
+ // Step 1: instrument multiplication of corresponding vector elements
+ if (EltSizeInBits) {
+ ImplicitReturnType = cast<FixedVectorType>(getMMXVectorTy(
+ EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits()));
+ ParamType = cast<FixedVectorType>(
+ getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
+
+ Va = IRB.CreateBitCast(Va, ParamType);
+ Vb = IRB.CreateBitCast(Vb, ParamType);
+
+ Sa = IRB.CreateBitCast(Sa, getShadowTy(ParamType));
+ Sb = IRB.CreateBitCast(Sb, getShadowTy(ParamType));
+ } else {
+ assert(ParamType->getNumElements() ==
+ ReturnType->getNumElements() * ReductionFactor);
+ }
+
+ // Multiplying an *initialized* zero by an uninitialized element results in
+ // an initialized zero element.
+ //
+ // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value
+ // results in an unpoisoned value. We can therefore adapt the visitAnd()
+ // instrumentation:
+ // OutShadow = (SaNonZero & SbNonZero)
+ // | (VaNonZero & SbNonZero)
+ // | (SaNonZero & VbNonZero)
+ // where non-zero is checked on a per-element basis (not per bit).
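+ //
+ // For example, take lanes where Sa = <0, -1>, Sb = <0, 0>, Va = <3, 7>,
+ // Vb = <0, 5> (with -1 denoting a fully uninitialized element): only the
+ // second lane is flagged, because its uninitialized element is multiplied
+ // by a non-zero concrete value (SaNonZero & VbNonZero), so the poison can
+ // affect the product.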
+ Value *SZero = Constant::getNullValue(Va->getType());
+ Value *VZero = Constant::getNullValue(Sa->getType());
+ Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero);
+ Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero);
+ Value *VaNonZero = IRB.CreateICmpNE(Va, VZero);
+ Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero);
+
+ Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero);
+ Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero);
+ Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero);
+
+ // Each element of the vector is represented by a single bit (poisoned or
+ // not), e.g. <8 x i1>.
+ Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
+
+ // Extend <8 x i1> to <8 x i16>.
+ // (The real pmadd intrinsic would have computed intermediate values of
+ // <8 x i32>, but that is irrelevant for our shadow purposes because we
+ // consider each element to be either fully initialized or fully
+ // uninitialized.)
+ And = IRB.CreateSExt(And, Sa->getType());
+
+ // Step 2: instrument horizontal add
+ // We don't need bit-precise horizontalReduce because we only want to check
+ // if each pair of elements is fully zero.
+ // Cast to <4 x i32>.
+ Value *Horizontal = IRB.CreateBitCast(And, ImplicitReturnType);
+
+ // Compute <4 x i1>, then extend back to <4 x i32>.
+ Value *OutShadow = IRB.CreateSExt(
+ IRB.CreateICmpNE(Horizontal,
+ Constant::getNullValue(Horizontal->getType())),
+ ImplicitReturnType);
+
+ // Cast it back to the required fake return type (<1 x i64>).
+ if (EltSizeInBits)
+ OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
+
+ // Step 3 (if applicable): instrument accumulator
+ if (I.arg_size() == 3)
+ OutShadow = IRB.CreateOr(OutShadow, getShadow(&I, 0));
+
+ setShadow(&I, OutShadow);
setOriginForNaryOp(I);
}
@@ -5374,21 +5507,185 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleVectorSadIntrinsic(I);
break;
+ // Multiply and Add Packed Words
+ // < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
+ // < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
+ // <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
+ //
+ // Multiply and Add Packed Signed and Unsigned Bytes
+ // < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
+ // <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
+ // <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
+ //
+ // These intrinsics are auto-upgraded into non-masked forms:
+ // < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128
+ // (<8 x i16>, <8 x i16>, <4 x i32>, i8)
+ // < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256
+ // (<16 x i16>, <16 x i16>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512
+ // (<32 x i16>, <32 x i16>, <16 x i32>, i16)
+ // < 8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128
+ // (<16 x i8>, <16 x i8>, <8 x i16>, i8)
+ // <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256
+ // (<32 x i8>, <32 x i8>, <16 x i16>, i16)
+ // <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512
+ // (<64 x i8>, <64 x i8>, <32 x i16>, i32)
case Intrinsic::x86_sse2_pmadd_wd:
case Intrinsic::x86_avx2_pmadd_wd:
+ case Intrinsic::x86_avx512_pmaddw_d_512:
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
case Intrinsic::x86_avx2_pmadd_ub_sw:
- handleVectorPmaddIntrinsic(I);
+ case Intrinsic::x86_avx512_pmaddubs_w_512:
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
break;
+ // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
case Intrinsic::x86_ssse3_pmadd_ub_sw:
- handleVectorPmaddIntrinsic(I, 8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
break;
+ // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
case Intrinsic::x86_mmx_pmadd_wd:
- handleVectorPmaddIntrinsic(I, 16);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
break;
+ // AVX Vector Neural Network Instructions: bytes
+ //
+ // Multiply and Add Packed Signed and Unsigned Bytes
+ // < 4 x i32> @llvm.x86.avx512.vpdpbusd.128
+ // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx512.vpdpbusd.256
+ // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // <16 x i32> @llvm.x86.avx512.vpdpbusd.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>)
+ //
+ // Multiply and Add Unsigned and Signed Bytes With Saturation
+ // < 4 x i32> @llvm.x86.avx512.vpdpbusds.128
+ // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx512.vpdpbusds.256
+ // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // <16 x i32> @llvm.x86.avx512.vpdpbusds.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>)
+ //
+ // < 4 x i32> @llvm.x86.avx2.vpdpbssd.128
+ // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx2.vpdpbssd.256
+ // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ //
+ // < 4 x i32> @llvm.x86.avx2.vpdpbssds.128
+ // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx2.vpdpbssds.256
+ // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ //
+ // <16 x i32> @llvm.x86.avx10.vpdpbssd.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>)
+ // <16 x i32> @llvm.x86.avx10.vpdpbssds.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>)
+ //
+ // These intrinsics are auto-upgraded into non-masked forms:
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128
+ // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128
+ // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256
+ // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256
+ // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ //
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128
+ // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128
+ // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256
+ // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256
+ // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ case Intrinsic::x86_avx512_vpdpbusd_128:
+ case Intrinsic::x86_avx512_vpdpbusd_256:
+ case Intrinsic::x86_avx512_vpdpbusd_512:
+ case Intrinsic::x86_avx512_vpdpbusds_128:
+ case Intrinsic::x86_avx512_vpdpbusds_256:
+ case Intrinsic::x86_avx512_vpdpbusds_512:
+ case Intrinsic::x86_avx2_vpdpbssd_128:
+ case Intrinsic::x86_avx2_vpdpbssd_256:
+ case Intrinsic::x86_avx2_vpdpbssds_128:
+ case Intrinsic::x86_avx2_vpdpbssds_256:
+ case Intrinsic::x86_avx10_vpdpbssd_512:
+ case Intrinsic::x86_avx10_vpdpbssds_512:
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
+ break;
+
+ // AVX Vector Neural Network Instructions: words
+ //
+ // Multiply and Add Signed Word Integers
+ // < 4 x i32> @llvm.x86.avx512.vpdpwssd.128
+ // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx512.vpdpwssd.256
+ // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // <16 x i32> @llvm.x86.avx512.vpdpwssd.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>)
+ //
+ // Multiply and Add Signed Word Integers With Saturation
+ // < 4 x i32> @llvm.x86.avx512.vpdpwssds.128
+ // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx512.vpdpwssds.256
+ // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // <16 x i32> @llvm.x86.avx512.vpdpwssds.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>)
+ //
+ // These intrinsics are auto-upgraded into non-masked forms:
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128
+ // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128
+ // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256
+ // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256
+ // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ //
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128
+ // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128
+ // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256
+ // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256
+ // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512
+ // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ case Intrinsic::x86_avx512_vpdpwssd_128:
+ case Intrinsic::x86_avx512_vpdpwssd_256:
+ case Intrinsic::x86_avx512_vpdpwssd_512:
+ case Intrinsic::x86_avx512_vpdpwssds_128:
+ case Intrinsic::x86_avx512_vpdpwssds_256:
+ case Intrinsic::x86_avx512_vpdpwssds_512:
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+ break;
+
+ // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single
+ // Precision
+ // <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128
+ // (<4 x float>, <8 x bfloat>, <8 x bfloat>)
+ // <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256
+ // (<8 x float>, <16 x bfloat>, <16 x bfloat>)
+ // <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512
+ // (<16 x float>, <32 x bfloat>, <32 x bfloat>)
+ // handleVectorPmaddIntrinsic() currently only handles integer types.
+
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse2_cmp_sd:
case Intrinsic::x86_sse_comieq_ss:
@@ -5603,6 +5900,26 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleAVXVpermi2var(I);
break;
+ // Packed Shuffle
+ // llvm.x86.sse.pshuf.w(<1 x i64>, i8)
+ // llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>)
+ // llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
+ // llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
+ // llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
+ //
+ // The following intrinsics are auto-upgraded:
+ // llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
+ // llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
+ // llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
+ case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_sse_pshuf_w:
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_ssse3_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512:
+ handleIntrinsicByApplyingToShadow(I, I.getIntrinsicID(),
+ /*trailingVerbatimArgs=*/1);
+ break;
+
case Intrinsic::x86_avx512_mask_cvtps2dq_512: {
handleAVX512VectorConvertFPToInt(I);
break;
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp
index 6128581..f5b6686 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp
@@ -58,7 +58,7 @@ void assignProfileData(Function &F, ArrayRef<uint64_t> RawCounters) {
uint64_t TrueCount, FalseCount = 0;
if (!PA.getSelectInstrProfile(*SI, TrueCount, FalseCount))
continue;
- setProfMetadata(F.getParent(), SI, {TrueCount, FalseCount},
+ setProfMetadata(SI, {TrueCount, FalseCount},
std::max(TrueCount, FalseCount));
}
if (succ_size(&BB) < 2)
@@ -67,7 +67,7 @@ void assignProfileData(Function &F, ArrayRef<uint64_t> RawCounters) {
if (!PA.getOutgoingBranchWeights(BB, ProfileHolder, MaxCount))
continue;
assert(MaxCount > 0);
- setProfMetadata(F.getParent(), BB.getTerminator(), ProfileHolder, MaxCount);
+ setProfMetadata(BB.getTerminator(), ProfileHolder, MaxCount);
}
}
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 6f06a26..d9e850e 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1727,7 +1727,7 @@ void PGOUseFunc::setBranchWeights() {
}
if (MaxCount)
- setProfMetadata(M, TI, EdgeCounts, MaxCount);
+ setProfMetadata(TI, EdgeCounts, MaxCount);
else {
// A zero MaxCount can come about when we have a BB with a positive
// count, and whose successor blocks all have 0 count. This can happen
@@ -1801,7 +1801,7 @@ void SelectInstVisitor::annotateOneSelectInst(SelectInst &SI) {
SCounts[1] = (TotalCount > SCounts[0] ? TotalCount - SCounts[0] : 0);
uint64_t MaxCount = std::max(SCounts[0], SCounts[1]);
if (MaxCount)
- setProfMetadata(F.getParent(), &SI, SCounts, MaxCount);
+ setProfMetadata(&SI, SCounts, MaxCount);
}
void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
@@ -2407,13 +2407,9 @@ static std::string getSimpleNodeName(const BasicBlock *Node) {
return SimpleNodeName;
}
-void llvm::setProfMetadata(Module *M, Instruction *TI,
- ArrayRef<uint64_t> EdgeCounts, uint64_t MaxCount) {
- assert(MaxCount > 0 && "Bad max count");
- uint64_t Scale = calculateCountScale(MaxCount);
- SmallVector<unsigned, 4> Weights;
- for (const auto &ECI : EdgeCounts)
- Weights.push_back(scaleBranchCount(ECI, Scale));
+void llvm::setProfMetadata(Instruction *TI, ArrayRef<uint64_t> EdgeCounts,
+ uint64_t MaxCount) {
+ auto Weights = downscaleWeights(EdgeCounts, MaxCount);
LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W
: Weights) {
@@ -2434,7 +2430,7 @@ void llvm::setProfMetadata(Module *M, Instruction *TI,
uint64_t TotalCount =
std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), (uint64_t)0,
[](uint64_t c1, uint64_t c2) { return c1 + c2; });
- Scale = calculateCountScale(WSum);
+ uint64_t Scale = calculateCountScale(WSum);
BranchProbability BP(scaleBranchCount(Weights[0], Scale),
scaleBranchCount(WSum, Scale));
std::string BranchProbStr;
diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index ce1d9f1..343bec3 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -432,7 +432,7 @@ bool MemOPSizeOpt::perform(MemOp MO) {
Updates.clear();
if (MaxCount)
- setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
+ setProfMetadata(SI, CaseCounts, MaxCount);
LLVM_DEBUG(dbgs() << *BB << "\n");
LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index 4edf25c..9471ae3 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -818,12 +818,12 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase,
}
}
} else if (auto *II = dyn_cast<LifetimeIntrinsic>(I)) {
- auto *AI = dyn_cast<AllocaInst>(II->getArgOperand(1));
+ auto *AI = dyn_cast<AllocaInst>(II->getArgOperand(0));
if (!AI)
return false;
Size = GetAllocaSize(AI);
- Dest = II->getArgOperand(1);
+ Dest = II->getArgOperand(0);
} else if (auto *AI = dyn_cast<AllocaInst>(I)) {
// We need to clear the types for new stack allocations (or else we might
// read stale type information from a previous function execution).
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 84a5b02..765059d 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -95,6 +95,7 @@ add_llvm_component_library(LLVMScalarOpts
Analysis
Core
InstCombine
+ ProfileData
Support
TransformUtils
)
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index a7ba54f..ac59ae1 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -447,7 +447,7 @@ private:
/// Also, collect select instructions to unfold.
bool isCandidate(const SwitchInst *SI) {
std::deque<std::pair<Value *, BasicBlock *>> Q;
- SmallSet<Value *, 16> SeenValues;
+ SmallPtrSet<Value *, 16> SeenValues;
SelectInsts.clear();
Value *SICond = SI->getCondition();
@@ -511,7 +511,7 @@ private:
void addToQueue(Value *Val, BasicBlock *BB,
std::deque<std::pair<Value *, BasicBlock *>> &Q,
- SmallSet<Value *, 16> &SeenValues) {
+ SmallPtrSet<Value *, 16> &SeenValues) {
if (SeenValues.insert(Val).second)
Q.push_back({Val, BB});
}
@@ -582,17 +582,15 @@ struct AllSwitchPaths {
VisitedBlocks VB;
// Get paths from the determinator BBs to SwitchPhiDefBB
std::vector<ThreadingPath> PathsToPhiDef =
- getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths);
+ getPathsFromStateDefMap(StateDef, SwitchPhi, VB);
if (SwitchPhiDefBB == SwitchBlock) {
TPaths = std::move(PathsToPhiDef);
return;
}
- assert(MaxNumPaths >= PathsToPhiDef.size());
- auto PathsLimit = MaxNumPaths / PathsToPhiDef.size();
// Find and append paths from SwitchPhiDefBB to SwitchBlock.
PathsType PathsToSwitchBB =
- paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit);
+ paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1);
if (PathsToSwitchBB.empty())
return;
@@ -613,16 +611,13 @@ private:
typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap;
std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef,
PHINode *Phi,
- VisitedBlocks &VB,
- unsigned PathsLimit) {
+ VisitedBlocks &VB) {
std::vector<ThreadingPath> Res;
auto *PhiBB = Phi->getParent();
VB.insert(PhiBB);
VisitedBlocks UniqueBlocks;
for (auto *IncomingBB : Phi->blocks()) {
- if (Res.size() >= PathsLimit)
- break;
if (!UniqueBlocks.insert(IncomingBB).second)
continue;
if (!SwitchOuterLoop->contains(IncomingBB))
@@ -658,9 +653,8 @@ private:
// Direct predecessor, just add to the path.
if (IncomingPhiDefBB == IncomingBB) {
- assert(PathsLimit > Res.size());
- std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap(
- StateDef, IncomingPhi, VB, PathsLimit - Res.size());
+ std::vector<ThreadingPath> PredPaths =
+ getPathsFromStateDefMap(StateDef, IncomingPhi, VB);
for (ThreadingPath &Path : PredPaths) {
Path.push_back(PhiBB);
Res.push_back(std::move(Path));
@@ -673,17 +667,13 @@ private:
continue;
PathsType IntermediatePaths;
- assert(PathsLimit > Res.size());
- auto InterPathLimit = PathsLimit - Res.size();
- IntermediatePaths = paths(IncomingPhiDefBB, IncomingBB, VB,
- /* PathDepth = */ 1, InterPathLimit);
+ IntermediatePaths =
+ paths(IncomingPhiDefBB, IncomingBB, VB, /* PathDepth = */ 1);
if (IntermediatePaths.empty())
continue;
- assert(InterPathLimit >= IntermediatePaths.size());
- auto PredPathLimit = InterPathLimit / IntermediatePaths.size();
std::vector<ThreadingPath> PredPaths =
- getPathsFromStateDefMap(StateDef, IncomingPhi, VB, PredPathLimit);
+ getPathsFromStateDefMap(StateDef, IncomingPhi, VB);
for (const ThreadingPath &Path : PredPaths) {
for (const PathType &IPath : IntermediatePaths) {
ThreadingPath NewPath(Path);
@@ -698,7 +688,7 @@ private:
}
PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited,
- unsigned PathDepth, unsigned PathsLimit) {
+ unsigned PathDepth) {
PathsType Res;
// Stop exploring paths after visiting MaxPathLength blocks
@@ -723,10 +713,8 @@ private:
// Some blocks have multiple edges to the same successor, and this set
// is used to prevent a duplicate path from being generated
- SmallSet<BasicBlock *, 4> Successors;
+ SmallPtrSet<BasicBlock *, 4> Successors;
for (BasicBlock *Succ : successors(BB)) {
- if (Res.size() >= PathsLimit)
- break;
if (!Successors.insert(Succ).second)
continue;
@@ -748,12 +736,14 @@ private:
// coverage and compile time.
if (LI->getLoopFor(Succ) != CurrLoop)
continue;
- assert(PathsLimit > Res.size());
- PathsType SuccPaths =
- paths(Succ, ToBB, Visited, PathDepth + 1, PathsLimit - Res.size());
+
+ PathsType SuccPaths = paths(Succ, ToBB, Visited, PathDepth + 1);
for (PathType &Path : SuccPaths) {
Path.push_front(BB);
Res.push_back(Path);
+ if (Res.size() >= MaxNumPaths) {
+ return Res;
+ }
}
}
// This block could now be visited again from a different predecessor. Note
@@ -772,7 +762,7 @@ private:
SmallVector<PHINode *, 8> Stack;
Stack.push_back(FirstDef);
- SmallSet<Value *, 16> SeenValues;
+ SmallPtrSet<Value *, 16> SeenValues;
while (!Stack.empty()) {
PHINode *CurPhi = Stack.pop_back_val();
@@ -965,7 +955,7 @@ private:
DuplicateBlockMap DuplicateMap;
DefMap NewDefs;
- SmallSet<BasicBlock *, 16> BlocksToClean;
+ SmallPtrSet<BasicBlock *, 16> BlocksToClean;
BlocksToClean.insert_range(successors(SwitchBlock));
for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
@@ -994,7 +984,7 @@ private:
/// the predecessors, and phis in the successor blocks.
void createExitPath(DefMap &NewDefs, ThreadingPath &Path,
DuplicateBlockMap &DuplicateMap,
- SmallSet<BasicBlock *, 16> &BlocksToClean,
+ SmallPtrSet<BasicBlock *, 16> &BlocksToClean,
DomTreeUpdater *DTU) {
APInt NextState = Path.getExitValue();
const BasicBlock *Determinator = Path.getDeterminatorBB();
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 9b87180..37004b9 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -38,6 +38,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -69,6 +70,7 @@
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -543,15 +545,8 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
};
// Insert an unlinked dbg.assign intrinsic for the dead fragment after each
- // overlapping dbg.assign intrinsic. The loop invalidates the iterators
- // returned by getAssignmentMarkers so save a copy of the markers to iterate
- // over.
- auto LinkedRange = at::getAssignmentMarkers(Inst);
- SmallVector<DbgVariableRecord *> LinkedDVRAssigns =
- at::getDVRAssignmentMarkers(Inst);
- SmallVector<DbgAssignIntrinsic *> Linked(LinkedRange.begin(),
- LinkedRange.end());
- auto InsertAssignForOverlap = [&](auto *Assign) {
+ // overlapping dbg.assign intrinsic.
+ for (DbgVariableRecord *Assign : at::getDVRAssignmentMarkers(Inst)) {
std::optional<DIExpression::FragmentInfo> NewFragment;
if (!at::calculateFragmentIntersect(DL, OriginalDest, DeadSliceOffsetInBits,
DeadSliceSizeInBits, Assign,
@@ -561,11 +556,11 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
// cautious and unlink the whole assignment from the store.
Assign->setKillAddress();
Assign->setAssignId(GetDeadLink());
- return;
+ continue;
}
// No intersect.
if (NewFragment->SizeInBits == 0)
- return;
+ continue;
// Fragments overlap: insert a new dbg.assign for this dead part.
auto *NewAssign = static_cast<decltype(Assign)>(Assign->clone());
@@ -574,9 +569,7 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
if (NewFragment)
SetDeadFragExpr(NewAssign, *NewFragment);
NewAssign->setKillAddress();
- };
- for_each(Linked, InsertAssignForOverlap);
- for_each(LinkedDVRAssigns, InsertAssignForOverlap);
+ }
}
/// Update the attributes given that a memory access is updated (the
@@ -1363,7 +1356,7 @@ struct DSEState {
if (auto *CB = dyn_cast<CallBase>(I)) {
if (CB->getIntrinsicID() == Intrinsic::lifetime_end)
return {
- std::make_pair(MemoryLocation::getForArgument(CB, 1, &TLI), false)};
+ std::make_pair(MemoryLocation::getForArgument(CB, 0, &TLI), false)};
if (Value *FreedOp = getFreedOperand(CB, &TLI))
return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)};
}
@@ -2666,3 +2659,79 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
PA.preserve<LoopAnalysis>();
return PA;
}
+
+namespace {
+
+/// A legacy pass for the legacy pass manager that wraps \c DSEPass.
+class DSELegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ DSELegacyPass() : FunctionPass(ID) {
+ initializeDSELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ PostDominatorTree &PDT =
+ getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
+
+#ifdef LLVM_ENABLE_STATS
+ if (AreStatisticsEnabled())
+ for (auto &I : instructions(F))
+ NumRemainingStores += isa<StoreInst>(&I);
+#endif
+
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ }
+};
+
+} // end anonymous namespace
+
+char DSELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
+ false)
+
+namespace llvm {
+LLVM_ABI FunctionPass *createDeadStoreEliminationPass() {
+ return new DSELegacyPass();
+}
+} // namespace llvm
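+// A legacy pass manager client would typically schedule this as, for
+// example:
+//   legacy::PassManager PM;
+//   PM.add(createDeadStoreEliminationPass());
+//   PM.run(M);  // M: an llvm::Module built elsewhere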
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 7704e49..4baa3b3 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -978,7 +978,7 @@ static bool IsValueFullyAvailableInBlock(
unsigned NumNewNewSpeculativelyAvailableBBs = 0;
#ifndef NDEBUG
- SmallSet<BasicBlock *, 32> NewSpeculativelyAvailableBBs;
+ SmallPtrSet<BasicBlock *, 32> NewSpeculativelyAvailableBBs;
SmallVector<BasicBlock *, 32> AvailableBBs;
#endif
@@ -1222,7 +1222,7 @@ static bool liesBetween(const Instruction *From, Instruction *Between,
const Instruction *To, const DominatorTree *DT) {
if (From->getParent() == Between->getParent())
return DT->dominates(From, Between);
- SmallSet<BasicBlock *, 1> Exclusion;
+ SmallPtrSet<BasicBlock *, 1> Exclusion;
Exclusion.insert(Between->getParent());
return !isPotentiallyReachable(From, To, &Exclusion, DT);
}
diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index 3ba5b79..d99f1eb 100644
--- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -642,9 +642,9 @@ Value *GuardWideningImpl::freezeAndPush(Value *Orig,
return FI;
}
- SmallSet<Value *, 16> Visited;
+ SmallPtrSet<Value *, 16> Visited;
SmallVector<Value *, 16> Worklist;
- SmallSet<Instruction *, 16> DropPoisonFlags;
+ SmallPtrSet<Instruction *, 16> DropPoisonFlags;
SmallVector<Value *, 16> NeedFreeze;
DenseMap<Value *, FreezeInst *> CacheOfFreezes;
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 334c911..6720cb1 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1613,7 +1613,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
if (CurrMaxExit == MaxBECount)
SkipLastIter = true;
};
- SmallSet<const SCEV *, 8> DominatingExactExitCounts;
+ SmallPtrSet<const SCEV *, 8> DominatingExactExitCounts;
for (BasicBlock *ExitingBB : ExitingBlocks) {
const SCEV *ExactExitCount = SE->getExitCount(L, ExitingBB);
const SCEV *MaxExitCount = SE->getExitCount(
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 85ee824..a097d33 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -434,7 +434,7 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II,
NewV = NewV->stripPointerCasts();
Function *NewDecl = Intrinsic::getOrInsertDeclaration(
M, II->getIntrinsicID(), {NewV->getType()});
- II->setArgOperand(1, NewV);
+ II->setArgOperand(0, NewV);
II->setCalledFunction(NewDecl);
return true;
}
@@ -491,7 +491,7 @@ void InferAddressSpacesImpl::collectRewritableIntrinsicOperands(
}
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end: {
- appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(1),
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
PostorderStack, Visited);
break;
}
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 0ddc231..e9bf59c 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -58,14 +58,55 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
}
// Compute alignment from known bits.
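+ // E.g. if the three low bits of the pointer are known to be zero, the
+ // access is at least 8-byte aligned.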
+ auto InferFromKnownBits = [&](Instruction &I, Value *PtrOp) {
+ KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT);
+ unsigned TrailZ =
+ std::min(Known.countMinTrailingZeros(), +Value::MaxAlignmentExponent);
+ return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+ };
+
+ // Propagate alignment between loads and stores that originate from the
+ // same base pointer.
+ DenseMap<Value *, Align> BestBasePointerAligns;
+ auto InferFromBasePointer = [&](Value *PtrOp, Align LoadStoreAlign) {
+ APInt OffsetFromBase(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
+ PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase, true);
+ // Derive the base pointer alignment from the load/store alignment
+ // and the offset from the base pointer.
+ Align BasePointerAlign =
+ commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue());
+
+ auto [It, Inserted] =
+ BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign);
+ if (!Inserted) {
+ // If the stored base pointer alignment is better than the
+ // base pointer alignment we derived, we may be able to use it
+ // to improve the load/store alignment. If not, store the
+ // improved base pointer alignment for future iterations.
+ if (It->second > BasePointerAlign) {
+ Align BetterLoadStoreAlign =
+ commonAlignment(It->second, OffsetFromBase.getLimitedValue());
+ return BetterLoadStoreAlign;
+ }
+ It->second = BasePointerAlign;
+ }
+ return LoadStoreAlign;
+ };
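+ // For example, assuming a load with align 8 at (%base + 4) is later
+ // followed by a load with align 2 at (%base + 8): the first access proves
+ // %base is at least 4-aligned, so the second access can be raised to
+ // align 4.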
+
for (BasicBlock &BB : F) {
+ // We need to reset the map for each block because alignment information
+ // can only be propagated from instruction A to B if A dominates B.
+ // This is because control flow (and exception throwing) could be dependent
+ // on the address (and its alignment) at runtime. Some sort of dominator
+ // tree approach could be better, but doing a simple forward pass through a
+ // single basic block is correct too.
+ BestBasePointerAligns.clear();
+
for (Instruction &I : BB) {
Changed |= tryToImproveAlign(
DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) {
- KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT);
- unsigned TrailZ = std::min(Known.countMinTrailingZeros(),
- +Value::MaxAlignmentExponent);
- return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+ return std::max(InferFromKnownBits(I, PtrOp),
+ InferFromBasePointer(PtrOp, OldAlign));
});
}
}
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
index 7f99cd2..9d915d0 100644
--- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -7,14 +7,23 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/JumpTableToSwitch.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/CtxProfAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/ProfDataUtils.h"
+#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <limits>
using namespace llvm;
@@ -33,6 +42,8 @@ static cl::opt<unsigned> FunctionSizeThreshold(
"or equal than this threshold."),
cl::init(50));
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
#define DEBUG_TYPE "jump-table-to-switch"
namespace {
@@ -90,9 +101,11 @@ static std::optional<JumpTableTy> parseJumpTable(GetElementPtrInst *GEP,
return JumpTable;
}
-static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT,
- DomTreeUpdater &DTU,
- OptimizationRemarkEmitter &ORE) {
+static BasicBlock *
+expandToSwitch(CallBase *CB, const JumpTableTy &JT, DomTreeUpdater &DTU,
+ OptimizationRemarkEmitter &ORE,
+ llvm::function_ref<GlobalValue::GUID(const Function &)>
+ GetGuidForFunction) {
const bool IsVoid = CB->getType() == Type::getVoidTy(CB->getContext());
SmallVector<DominatorTree::UpdateType, 8> DTUpdates;
@@ -115,7 +128,30 @@ static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT,
IRBuilder<> BuilderTail(CB);
PHINode *PHI =
IsVoid ? nullptr : BuilderTail.CreatePHI(CB->getType(), JT.Funcs.size());
+ const auto *ProfMD = CB->getMetadata(LLVMContext::MD_prof);
+
+ SmallVector<uint64_t> BranchWeights;
+ DenseMap<GlobalValue::GUID, uint64_t> GuidToCounter;
+ const bool HadProfile = isValueProfileMD(ProfMD);
+ if (HadProfile) {
+ // The assumptions, coming in, are that the functions in JT.Funcs are
+ // defined in this module (from parseJumpTable).
+ assert(llvm::all_of(
+ JT.Funcs, [](const Function *F) { return F && !F->isDeclaration(); }));
+ BranchWeights.reserve(JT.Funcs.size() + 1);
+ // The first is the default target, which is the unreachable block created
+ // above.
+ BranchWeights.push_back(0U);
+ uint64_t TotalCount = 0;
+ auto Targets = getValueProfDataFromInst(
+ *CB, InstrProfValueKind::IPVK_IndirectCallTarget,
+ std::numeric_limits<uint32_t>::max(), TotalCount);
+ for (const auto &[G, C] : Targets) {
+ [[maybe_unused]] auto It = GuidToCounter.insert({G, C});
+ assert(It.second);
+ }
+ }
for (auto [Index, Func] : llvm::enumerate(JT.Funcs)) {
BasicBlock *B = BasicBlock::Create(Func->getContext(),
"call." + Twine(Index), &F, Tail);
@@ -123,10 +159,19 @@ static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT,
DTUpdates.push_back({DominatorTree::Insert, B, Tail});
CallBase *Call = cast<CallBase>(CB->clone());
+ // The MD_prof metadata (VP kind), if it existed, can be dropped; it doesn't
+ // make sense on a direct call. Note that the values are used for the branch
+ // weights of the switch.
+ Call->setMetadata(LLVMContext::MD_prof, nullptr);
Call->setCalledFunction(Func);
Call->insertInto(B, B->end());
Switch->addCase(
cast<ConstantInt>(ConstantInt::get(JT.Index->getType(), Index)), B);
+ GlobalValue::GUID FctID = GetGuidForFunction(*Func);
+ // It is OK not to find a target function in GuidToCounter, e.g. when only
+ // some of the jump targets are taken in the given profile.
+ BranchWeights.push_back(FctID == 0U ? 0U
+ : GuidToCounter.lookup_or(FctID, 0U));
BranchInst::Create(Tail, B);
if (PHI)
PHI->addIncoming(Call, B);
@@ -136,6 +181,13 @@ static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT,
return OptimizationRemark(DEBUG_TYPE, "ReplacedJumpTableWithSwitch", CB)
<< "expanded indirect call into switch";
});
+ if (HadProfile && !ProfcheckDisableMetadataFixes) {
+ // At least one of the targets must've been taken.
+ assert(llvm::any_of(BranchWeights, [](uint64_t V) { return V != 0; }));
+ setBranchWeights(*Switch, downscaleWeights(BranchWeights),
+ /*IsExpected=*/false);
+ } else
+ setExplicitlyUnknownBranchWeights(*Switch);
if (PHI)
CB->replaceAllUsesWith(PHI);
CB->eraseFromParent();
@@ -150,6 +202,15 @@ PreservedAnalyses JumpTableToSwitchPass::run(Function &F,
PostDominatorTree *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy);
bool Changed = false;
+ InstrProfSymtab Symtab;
+ if (auto E = Symtab.create(*F.getParent()))
+ F.getContext().emitError(
+ "Could not create indirect call table, likely corrupted IR" +
+ toString(std::move(E)));
+ DenseMap<const Function *, GlobalValue::GUID> FToGuid;
+ for (const auto &[G, FPtr] : Symtab.getIDToNameMap())
+ FToGuid.insert({FPtr, G});
+
for (BasicBlock &BB : make_early_inc_range(F)) {
BasicBlock *CurrentBB = &BB;
while (CurrentBB) {
@@ -170,7 +231,12 @@ PreservedAnalyses JumpTableToSwitchPass::run(Function &F,
std::optional<JumpTableTy> JumpTable = parseJumpTable(GEP, PtrTy);
if (!JumpTable)
continue;
- SplittedOutTail = expandToSwitch(Call, *JumpTable, DTU, ORE);
+ SplittedOutTail = expandToSwitch(
+ Call, *JumpTable, DTU, ORE, [&](const Function &Fct) {
+ if (Fct.getMetadata(AssignGUIDPass::GUIDMetadataName))
+ return AssignGUIDPass::getGUID(Fct);
+ return FToGuid.lookup_or(&Fct, 0U);
+ });
Changed = true;
break;
}
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index c3f80f9..e157cc9 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -169,6 +169,8 @@ cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
"number of accesses allowed to be present in a loop in order to "
"enable memory promotion."));
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
static bool isNotUsedOrFoldableInLoop(const Instruction &I, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
@@ -472,7 +474,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI,
if (Preheader)
Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L,
MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode,
- LicmAllowSpeculation);
+ LicmAllowSpeculation, HasCoroSuspendInst);
// Now that all loop invariants have been removed from the loop, promote any
// memory references to scalars that we can.
@@ -857,9 +859,18 @@ public:
}
// Now finally clone BI.
- ReplaceInstWithInst(
- HoistTarget->getTerminator(),
- BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition()));
+ auto *NewBI =
+ BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition(),
+ HoistTarget->getTerminator()->getIterator());
+ // MD_prof should also come from the original branch: since the
+ // condition was hoisted, the branch probabilities shouldn't change.
+ if (!ProfcheckDisableMetadataFixes)
+ NewBI->copyMetadata(*BI, {LLVMContext::MD_prof});
+ // FIXME: Issue #152767: debug info should also be the same as the
+ // original branch, **if** the user explicitly indicated that.
+ NewBI->setDebugLoc(HoistTarget->getTerminator()->getDebugLoc());
+ HoistTarget->getTerminator()->eraseFromParent();
+
++NumClonedBranches;
assert(CurLoop->getLoopPreheader() &&
@@ -881,7 +892,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
ICFLoopSafetyInfo *SafetyInfo,
SinkAndHoistLICMFlags &Flags,
OptimizationRemarkEmitter *ORE, bool LoopNestMode,
- bool AllowSpeculation) {
+ bool AllowSpeculation, bool HasCoroSuspendInst) {
// Verify inputs.
assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
CurLoop != nullptr && SafetyInfo != nullptr &&
@@ -914,11 +925,11 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
// TODO: It may be safe to hoist if we are hoisting to a conditional block
// and we have accurately duplicated the control flow from the loop header
// to that block.
- if (CurLoop->hasLoopInvariantOperands(&I) &&
+ if (CurLoop->hasLoopInvariantOperands(&I, HasCoroSuspendInst) &&
canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) &&
- isSafeToExecuteUnconditionally(
- I, DT, TLI, CurLoop, SafetyInfo, ORE,
- Preheader->getTerminator(), AC, AllowSpeculation)) {
+ isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo, ORE,
+ Preheader->getTerminator(), AC,
+ AllowSpeculation)) {
hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
MSSAU, SE, ORE);
HoistedInstructions.push_back(&I);
@@ -964,7 +975,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop);
};
if ((IsInvariantStart(I) || isGuard(&I)) &&
- CurLoop->hasLoopInvariantOperands(&I) &&
+ CurLoop->hasLoopInvariantOperands(&I, HasCoroSuspendInst) &&
MustExecuteWithoutWritesBefore(I)) {
hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
MSSAU, SE, ORE);
@@ -1230,11 +1241,16 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
if (Behavior.doesNotAccessMemory())
return true;
if (Behavior.onlyReadsMemory()) {
+ // Might have a stale MemoryDef for a call that was later inferred to be
+ // read-only.
+ auto *MU = dyn_cast<MemoryUse>(MSSA->getMemoryAccess(CI));
+ if (!MU)
+ return false;
+
// If we can prove there are no writes to the memory read by the call, we
// can hoist or sink.
return !pointerInvalidatedByLoop(
- MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(CI)), CurLoop, I, Flags,
- /*InvariantGroup=*/false);
+ MSSA, MU, CurLoop, I, Flags, /*InvariantGroup=*/false);
}
if (Behavior.onlyWritesMemory()) {
@@ -1688,8 +1704,12 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
// The check on hasMetadataOtherThanDebugLoc is to prevent us from burning
// time in isGuaranteedToExecute if we don't actually have anything to
// drop. It is a compile time optimization, not required for correctness.
- !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop))
- I.dropUBImplyingAttrsAndMetadata();
+ !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) {
+ if (ProfcheckDisableMetadataFixes)
+ I.dropUBImplyingAttrsAndMetadata();
+ else
+ I.dropUBImplyingAttrsAndMetadata({LLVMContext::MD_prof});
+ }
if (isa<PHINode>(I))
// Move the new node to the end of the phi list in the destination block.
@@ -2856,7 +2876,7 @@ static bool hoistBOAssociation(Instruction &I, Loop &L,
bool LVInRHS = L.isLoopInvariant(BO->getOperand(0));
auto *BO0 = dyn_cast<BinaryOperator>(BO->getOperand(LVInRHS));
if (!BO0 || BO0->getOpcode() != Opcode || !BO0->isAssociative() ||
- BO0->hasNUsesOrMore(3))
+ BO0->hasNUsesOrMore(BO0->getType()->isIntegerTy() ? 2 : 3))
return false;
Value *LV = BO0->getOperand(0);
diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 0ac1a15..27d3004 100644
--- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -502,8 +502,10 @@ public:
SmallVector<int, 8> PtrToPartitions(N);
for (unsigned I = 0; I < N; ++I) {
Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
- auto Instructions =
- LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
+ auto Instructions = LAI.getInstructionsForAccess(Ptr, /* IsWrite */ true);
+ auto ReadInstructions =
+ LAI.getInstructionsForAccess(Ptr, /* IsWrite */ false);
+ Instructions.append(ReadInstructions.begin(), ReadInstructions.end());
int &Partition = PtrToPartitions[I];
// First set it to uninitialized.
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index b3bffeb..5795c76 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -263,6 +263,7 @@ static bool isUniformShape(Value *V) {
case llvm::Instruction::FPExt:
return true;
case llvm::Instruction::AddrSpaceCast:
+ case CastInst::PtrToAddr:
case CastInst::PtrToInt:
case CastInst::IntToPtr:
return false;
@@ -1208,7 +1209,7 @@ public:
//
// For verification, we keep track of where we changed uses to poison in
// PoisonedInsts and then check that we in fact remove them.
- SmallSet<Instruction *, 16> PoisonedInsts;
+ SmallPtrSet<Instruction *, 16> PoisonedInsts;
for (auto *Inst : reverse(ToRemove)) {
for (Use &U : llvm::make_early_inc_range(Inst->uses())) {
if (auto *Poisoned = dyn_cast<Instruction>(U.getUser()))
@@ -2166,7 +2167,7 @@ public:
// If the loads don't alias the lifetime.end, it won't interfere with
// fusion.
- MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 1, nullptr);
+ MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 0, nullptr);
if (!EndLoc.Ptr)
continue;
if (AA->isNoAlias(Load0Loc, EndLoc) && AA->isNoAlias(Load1Loc, EndLoc))
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 79721dc..e043d07 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -915,7 +915,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
// move the bitcast as well, which we don't handle.
if (SkippedLifetimeStart) {
auto *LifetimeArg =
- dyn_cast<Instruction>(SkippedLifetimeStart->getOperand(1));
+ dyn_cast<Instruction>(SkippedLifetimeStart->getOperand(0));
if (LifetimeArg && LifetimeArg->getParent() == C->getParent() &&
C->comesBefore(LifetimeArg))
return false;
@@ -1010,7 +1010,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
// Lifetime of srcAlloca ends at lifetime.end.
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
- II->getArgOperand(1) == srcAlloca)
+ II->getArgOperand(0) == srcAlloca)
break;
}
@@ -1393,7 +1393,7 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V,
if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst()))
if (II->getIntrinsicID() == Intrinsic::lifetime_start)
if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V)))
- return II->getArgOperand(1) == Alloca;
+ return II->getArgOperand(0) == Alloca;
return false;
}
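
The argument-index changes in this file and in several files below (lifetime operands moving from index 1 to index 0) appear to stem from the lifetime intrinsics dropping their explicit size operand, leaving the pointer as the only argument. A minimal creation-side sketch, assuming the pointer-only IRBuilder form used later in this patch:

    #include "llvm/IR/Function.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Brackets a fresh i32 alloca with lifetime markers in F's entry block.
    void emitBracketedAlloca(Function &F) {
      BasicBlock &Entry = F.getEntryBlock();
      IRBuilder<> B(&Entry, Entry.begin());
      AllocaInst *AI = B.CreateAlloca(B.getInt32Ty());
      // Previously: B.CreateLifetimeStart(AI, B.getInt64(4));
      B.CreateLifetimeStart(AI); // pointer-only form, covers the whole alloca
      B.CreateLifetimeEnd(AI);
    }
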
@@ -1530,7 +1530,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
// to remove them.
SmallVector<Instruction *, 4> LifetimeMarkers;
- SmallSet<Instruction *, 4> AAMetadataInstrs;
+ SmallPtrSet<Instruction *, 4> AAMetadataInstrs;
bool SrcNotDom = false;
auto CaptureTrackingWithModRef =
@@ -1540,7 +1540,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
Worklist.push_back(AI);
unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
Worklist.reserve(MaxUsesToExplore);
- SmallSet<const Use *, 20> Visited;
+ SmallPtrSet<const Use *, 20> Visited;
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
for (const Use &U : I->uses()) {
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 1a52af1..9d4fb79 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -651,7 +651,7 @@ class NewGVN {
BitVector TouchedInstructions;
DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
- mutable DenseMap<const IntrinsicInst *, const Value *> PredicateSwapChoice;
+ mutable DenseMap<const BitCastInst *, const Value *> PredicateSwapChoice;
#ifndef NDEBUG
// Debugging for how many times each block and instruction got processed.
@@ -819,7 +819,7 @@ private:
BasicBlock *PHIBlock) const;
const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
ExprResult performSymbolicCmpEvaluation(Instruction *) const;
- ExprResult performSymbolicPredicateInfoEvaluation(IntrinsicInst *) const;
+ ExprResult performSymbolicPredicateInfoEvaluation(BitCastInst *) const;
// Congruence finding.
bool someEquivalentDominates(const Instruction *, const Instruction *) const;
@@ -841,7 +841,7 @@ private:
unsigned int getRank(const Value *) const;
bool shouldSwapOperands(const Value *, const Value *) const;
bool shouldSwapOperandsForPredicate(const Value *, const Value *,
- const IntrinsicInst *I) const;
+ const BitCastInst *I) const;
// Reachability handling.
void updateReachableEdge(BasicBlock *, BasicBlock *);
@@ -1013,9 +1013,9 @@ void NewGVN::deleteExpression(const Expression *E) const {
// If V is a predicateinfo copy, get the thing it is a copy of.
static Value *getCopyOf(const Value *V) {
- if (auto *II = dyn_cast<IntrinsicInst>(V))
- if (II->getIntrinsicID() == Intrinsic::ssa_copy)
- return II->getOperand(0);
+ if (auto *BC = dyn_cast<BitCastInst>(V))
+ if (BC->getType() == BC->getOperand(0)->getType())
+ return BC->getOperand(0);
return nullptr;
}
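
For orientation, predicate copies are now represented as no-op bitcasts rather than llvm.ssa.copy calls. A small stand-alone sketch of the shape test used throughout this patch (the helper name is invented):

    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/Casting.h"
    using namespace llvm;

    // True for the "ssa copy" shape: a bitcast whose source and destination
    // types are identical, i.e. a no-op copy of its operand.
    static bool looksLikePredicateCopy(const Value *V) {
      if (const auto *BC = dyn_cast<BitCastInst>(V))
        return BC->getType() == BC->getOperand(0)->getType();
      return false;
    }

In the passes themselves the shape test is usually combined with a PredicateInfo lookup (see the eliminateInstructions hunk below), since an ordinary no-op bitcast can occur for other reasons.
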
@@ -1535,7 +1535,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
- auto *LifetimePtr = II->getOperand(1);
+ auto *LifetimePtr = II->getOperand(0);
if (LoadPtr == lookupOperandLeader(LifetimePtr) ||
AA->isMustAlias(LoadPtr, LifetimePtr))
return createConstantExpression(UndefValue::get(LoadType));
@@ -1604,7 +1604,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
}
NewGVN::ExprResult
-NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const {
+NewGVN::performSymbolicPredicateInfoEvaluation(BitCastInst *I) const {
auto *PI = PredInfo->getPredicateInfoFor(I);
if (!PI)
return ExprResult::none();
@@ -1647,13 +1647,8 @@ NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const {
NewGVN::ExprResult NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
auto *CI = cast<CallInst>(I);
if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- // Intrinsics with the returned attribute are copies of arguments.
- if (auto *ReturnedValue = II->getReturnedArgOperand()) {
- if (II->getIntrinsicID() == Intrinsic::ssa_copy)
- if (auto Res = performSymbolicPredicateInfoEvaluation(II))
- return Res;
+ if (auto *ReturnedValue = II->getReturnedArgOperand())
return ExprResult::some(createVariableOrConstant(ReturnedValue));
- }
}
// FIXME: Currently the calls which may access the thread id may
@@ -2032,6 +2027,12 @@ NewGVN::performSymbolicEvaluation(Instruction *I,
E = performSymbolicLoadEvaluation(I);
break;
case Instruction::BitCast:
+ // Intrinsics with the returned attribute are copies of arguments.
+ if (I->getType() == I->getOperand(0)->getType())
+ if (auto Res =
+ performSymbolicPredicateInfoEvaluation(cast<BitCastInst>(I)))
+ return Res;
+ [[fallthrough]];
case Instruction::AddrSpaceCast:
case Instruction::Freeze:
return createExpression(I);
@@ -4075,8 +4076,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
if (DominatingLeader != Def) {
// Even if the instruction is removed, we still need to update
// flags/metadata due to downstreams users of the leader.
- if (!match(DefI, m_Intrinsic<Intrinsic::ssa_copy>()))
- patchReplacementInstruction(DefI, DominatingLeader);
+ patchReplacementInstruction(DefI, DominatingLeader);
SmallVector<DbgVariableRecord *> DVRUsers;
findDbgUsers(DefI, DVRUsers);
@@ -4116,10 +4116,14 @@ bool NewGVN::eliminateInstructions(Function &F) {
Value *DominatingLeader = EliminationStack.back();
- auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
- bool isSSACopy = II && II->getIntrinsicID() == Intrinsic::ssa_copy;
- if (isSSACopy)
- DominatingLeader = II->getOperand(0);
+ Instruction *SSACopy = nullptr;
+ if (auto *BC = dyn_cast<BitCastInst>(DominatingLeader)) {
+ if (BC->getType() == BC->getOperand(0)->getType() &&
+ PredInfo->getPredicateInfoFor(DominatingLeader)) {
+ SSACopy = BC;
+ DominatingLeader = BC->getOperand(0);
+ }
+ }
// Don't replace our existing users with ourselves.
if (U->get() == DominatingLeader)
@@ -4145,12 +4149,12 @@ bool NewGVN::eliminateInstructions(Function &F) {
ProbablyDead.erase(cast<Instruction>(DominatingLeader));
// For copy instructions, we use their operand as a leader,
// which means we remove a user of the copy and it may become dead.
- if (isSSACopy) {
- auto It = UseCounts.find(II);
+ if (SSACopy) {
+ auto It = UseCounts.find(SSACopy);
if (It != UseCounts.end()) {
unsigned &IIUseCount = It->second;
if (--IIUseCount == 0)
- ProbablyDead.insert(II);
+ ProbablyDead.insert(SSACopy);
}
}
++LeaderUseCount;
@@ -4251,7 +4255,7 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
}
bool NewGVN::shouldSwapOperandsForPredicate(const Value *A, const Value *B,
- const IntrinsicInst *I) const {
+ const BitCastInst *I) const {
if (shouldSwapOperands(A, B)) {
PredicateSwapChoice[I] = B;
return true;
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 343da5b2..ba58b8e 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -878,7 +878,7 @@ static Value *NegateValue(Value *V, Instruction *BI,
// only that it mostly looks like one.
static bool isLoadCombineCandidate(Instruction *Or) {
SmallVector<Instruction *, 8> Worklist;
- SmallSet<Instruction *, 8> Visited;
+ SmallPtrSet<Instruction *, 8> Visited;
auto Enqueue = [&](Value *V) {
auto *I = dyn_cast<Instruction>(V);
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index d9805d8..8b15445 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -2309,8 +2309,9 @@ chainToBasePointerCost(SmallVectorImpl<Instruction *> &Chain,
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
// Cost of the address calculation
- Type *ValTy = GEP->getSourceElementType();
- Cost += TTI.getAddressComputationCost(ValTy);
+ Cost += TTI.getAddressComputationCost(
+ GEP->getType(), nullptr, nullptr,
+ TargetTransformInfo::TCK_SizeAndLatency);
// And cost of the GEP itself
// TODO: Use TTI->getGEPCost here (it exists, but appears to be not
diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp
index 8be2f78..feee794 100644
--- a/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -31,6 +32,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -53,12 +55,15 @@ STATISTIC(NumInstReplaced,
// runSCCP() - Run the Sparse Conditional Constant Propagation algorithm,
// and return true if the function was modified.
static bool runSCCP(Function &F, const DataLayout &DL,
- const TargetLibraryInfo *TLI, DomTreeUpdater &DTU) {
+ const TargetLibraryInfo *TLI, DominatorTree &DT,
+ AssumptionCache &AC) {
LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
SCCPSolver Solver(
DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; },
F.getContext());
+ Solver.addPredicateInfo(F, DT, AC);
+
// While we don't do any actual inter-procedural analysis, still track
// return values so we can infer attributes.
if (canTrackReturnsInterprocedurally(&F))
@@ -101,6 +106,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
}
// Remove unreachable blocks and non-feasible edges.
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
for (BasicBlock *DeadBB : BlocksToErase)
NumInstRemoved += changeToUnreachable(&*DeadBB->getFirstNonPHIIt(),
/*PreserveLCSSA=*/false, &DTU);
@@ -113,6 +119,8 @@ static bool runSCCP(Function &F, const DataLayout &DL,
if (!DeadBB->hasAddressTaken())
DTU.deleteBB(DeadBB);
+ Solver.removeSSACopies(F);
+
Solver.inferReturnAttributes();
return MadeChanges;
@@ -121,9 +129,9 @@ static bool runSCCP(Function &F, const DataLayout &DL,
PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
const DataLayout &DL = F.getDataLayout();
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
- if (!runSCCP(F, DL, &TLI, DTU))
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ if (!runSCCP(F, DL, &TLI, DT, AC))
return PreservedAnalyses::all();
auto PA = PreservedAnalyses();
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 03d9f32..06a92bd 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -320,15 +320,6 @@ static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
DVR->getDebugLoc().getInlinedAt());
}
-DbgVariableRecord *UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord *Unused) {
- (void)Unused;
- return static_cast<DbgVariableRecord *>(cast<DbgRecord *>(P));
-}
-DbgAssignIntrinsic *UnwrapDbgInstPtr(DbgInstPtr P, DbgAssignIntrinsic *Unused) {
- (void)Unused;
- return static_cast<DbgAssignIntrinsic *>(cast<Instruction *>(P));
-}
-
/// Find linked dbg.assign and generate a new one with the correct
/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
/// value component is copied from the old dbg.assign to the new.
@@ -348,10 +339,9 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
uint64_t SliceSizeInBits, Instruction *OldInst,
Instruction *Inst, Value *Dest, Value *Value,
const DataLayout &DL) {
- auto MarkerRange = at::getAssignmentMarkers(OldInst);
auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
// Nothing to do if OldInst has no linked dbg.assign intrinsics.
- if (MarkerRange.empty() && DVRAssignMarkerRange.empty())
+ if (DVRAssignMarkerRange.empty())
return;
LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
@@ -435,11 +425,10 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
}
::Value *NewValue = Value ? Value : DbgAssign->getValue();
- auto *NewAssign = UnwrapDbgInstPtr(
+ DbgVariableRecord *NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
Dest, DIExpression::get(Expr->getContext(), {}),
- DbgAssign->getDebugLoc()),
- DbgAssign);
+ DbgAssign->getDebugLoc())));
// If we've updated the value but the original dbg.assign has an arglist
// then kill it now - we can't use the requested new value.
@@ -1260,10 +1249,7 @@ private:
return PI.setAborted(&II);
if (II.isLifetimeStartOrEnd()) {
- ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
- uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
- Length->getLimitedValue());
- insertUse(II, Offset, Size, true);
+ insertUse(II, Offset, AllocSize, true);
return;
}
@@ -3235,8 +3221,7 @@ private:
// In theory we should call migrateDebugInfo here. However, we do not
// emit dbg.assign intrinsics for mem intrinsics storing through non-
// constant geps, or storing a variable number of bytes.
- assert(at::getAssignmentMarkers(&II).empty() &&
- at::getDVRAssignmentMarkers(&II).empty() &&
+ assert(at::getDVRAssignmentMarkers(&II).empty() &&
"AT: Unexpected link to non-const GEP");
deleteIfTriviallyDead(OldPtr);
return false;
@@ -3385,13 +3370,11 @@ private:
Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
if (IsDest) {
// Update the address component of linked dbg.assigns.
- auto UpdateAssignAddress = [&](auto *DbgAssign) {
+ for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) {
if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
DbgAssign->getAddress() == II.getDest())
DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
- };
- for_each(at::getAssignmentMarkers(&II), UpdateAssignAddress);
- for_each(at::getDVRAssignmentMarkers(&II), UpdateAssignAddress);
+ }
II.setDest(AdjustedPtr);
II.setDestAlignment(SliceAlign);
} else {
@@ -3614,30 +3597,14 @@ private:
return true;
}
- assert(II.getArgOperand(1) == OldPtr);
- // Lifetime intrinsics are only promotable if they cover the whole alloca.
- // Therefore, we drop lifetime intrinsics which don't cover the whole
- // alloca.
- // (In theory, intrinsics which partially cover an alloca could be
- // promoted, but PromoteMemToReg doesn't handle that case.)
- // FIXME: Check whether the alloca is promotable before dropping the
- // lifetime intrinsics?
- if (NewBeginOffset != NewAllocaBeginOffset ||
- NewEndOffset != NewAllocaEndOffset)
- return true;
-
- ConstantInt *Size =
- ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
- NewEndOffset - NewBeginOffset);
- // Lifetime intrinsics always expect an i8* so directly get such a pointer
- // for the new alloca slice.
+ assert(II.getArgOperand(0) == OldPtr);
Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
Value *New;
if (II.getIntrinsicID() == Intrinsic::lifetime_start)
- New = IRB.CreateLifetimeStart(Ptr, Size);
+ New = IRB.CreateLifetimeStart(Ptr);
else
- New = IRB.CreateLifetimeEnd(Ptr, Size);
+ New = IRB.CreateLifetimeEnd(Ptr);
(void)New;
LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
@@ -4005,8 +3972,7 @@ private:
Store->getPointerOperand(), Store->getValueOperand(),
DL);
} else {
- assert(at::getAssignmentMarkers(Store).empty() &&
- at::getDVRAssignmentMarkers(Store).empty() &&
+ assert(at::getDVRAssignmentMarkers(Store).empty() &&
"AT: unexpected debug.assign linked to store through "
"unbounded GEP");
}
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index c7e4a3e..032a3a7 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -37,6 +37,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeMergeICmpsLegacyPassPass(Registry);
initializeNaryReassociateLegacyPassPass(Registry);
initializePartiallyInlineLibCallsLegacyPassPass(Registry);
+ initializeDSELegacyPassPass(Registry);
initializeReassociateLegacyPassPass(Registry);
initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
initializeSROALegacyPassPass(Registry);
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 6ffe841..fc96589 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -294,6 +294,10 @@ private:
bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
bool NonNegative);
+ /// Analyze XOR instruction to extract disjoint constant bits that behave
+ /// like addition operations for improved address mode folding.
+ APInt extractDisjointBitsFromXor(BinaryOperator *XorInst);
+
/// The path from the constant offset to the old GEP index. e.g., if the GEP
/// index is "a * b + (c + 5)". After running function find, UserChain[0] will
/// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
@@ -596,6 +600,9 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
// Trace into subexpressions for more hoisting opportunities.
if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+ // Handle XOR with disjoint bits that can be treated as addition.
+ else if (BO->getOpcode() == Instruction::Xor)
+ ConstantOffset = extractDisjointBitsFromXor(BO);
} else if (isa<TruncInst>(V)) {
ConstantOffset =
find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
@@ -708,11 +715,20 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
Value *NextInChain = removeConstOffset(ChainIndex - 1);
Value *TheOther = BO->getOperand(1 - OpNo);
- // If NextInChain is 0 and not the LHS of a sub, we can simplify the
- // sub-expression to be just TheOther.
if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
- if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
- return TheOther;
+ if (CI->isZero()) {
+ // Custom XOR handling for disjoint bits - preserves original XOR
+ // with non-disjoint constant bits.
+ // TODO: The design should be updated to support partial constant
+ // extraction.
+ if (BO->getOpcode() == Instruction::Xor)
+ return BO;
+
+ // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+ // sub-expression to be just TheOther.
+ if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
+ }
}
BinaryOperator::BinaryOps NewOp = BO->getOpcode();
@@ -743,6 +759,67 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
return NewBO;
}
+/// Analyze XOR instruction to extract disjoint constant bits for address
+/// folding
+///
+/// This function identifies bits in an XOR constant operand that are disjoint
+/// from the base operand's known set bits. For these disjoint bits, XOR behaves
+/// identically to addition, allowing us to extract them as constant offsets
+/// that can be folded into addressing modes.
+///
+/// Transformation: `Base ^ Const` becomes `(Base ^ NonDisjointBits) +
+/// DisjointBits` where DisjointBits = Const & KnownZeros(Base)
+///
+/// Example with ptr having known-zero low bit:
+/// Original: `xor %ptr, 3` ; 3 = 0b11
+/// Analysis: DisjointBits = 3 & KnownZeros(%ptr) = 0b11 & 0b01 = 0b01
+/// Result: `(xor %ptr, 2) + 1` where 1 can be folded into address mode
+///
+/// \param XorInst The XOR binary operator to analyze
+/// \return APInt containing the disjoint bits that can be extracted as offset,
+/// or zero if no disjoint bits exist
+APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
+ BinaryOperator *XorInst) {
+ assert(XorInst && XorInst->getOpcode() == Instruction::Xor &&
+ "Expected XOR instruction");
+
+ const unsigned BitWidth = XorInst->getType()->getScalarSizeInBits();
+ Value *BaseOperand;
+ ConstantInt *XorConstant;
+
+ // Match pattern: xor BaseOperand, Constant.
+ if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
+ return APInt::getZero(BitWidth);
+
+ // Compute known bits for the base operand.
+ const SimplifyQuery SQ(DL);
+ const KnownBits BaseKnownBits = computeKnownBits(BaseOperand, SQ);
+ const APInt &ConstantValue = XorConstant->getValue();
+
+ // Identify disjoint bits: constant bits that are known zero in base.
+ const APInt DisjointBits = ConstantValue & BaseKnownBits.Zero;
+
+ // Early exit if no disjoint bits found.
+ if (DisjointBits.isZero())
+ return APInt::getZero(BitWidth);
+
+ // Compute the remaining non-disjoint bits that stay in the XOR.
+ const APInt NonDisjointBits = ConstantValue & ~DisjointBits;
+
+ // FIXME: Enhance XOR constant extraction to handle nested binary operations.
+ // Currently we only extract disjoint bits from the immediate XOR constant,
+ // but we could recursively process cases like:
+ // xor (add %base, C1), C2 -> add %base, (C1 ^ disjoint_bits(C2))
+ // This requires careful analysis to ensure the transformation preserves
+ // semantics, particularly around sign extension and overflow behavior.
+
+ // Add the non-disjoint constant to the user chain for later transformation.
+ // This will replace the original constant in the XOR with the new
+ // constant.
+ UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits));
+ return DisjointBits;
+}
+
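
A free-standing numeric sketch of the split described in the comment above; the bit patterns are invented for illustration:

    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/KnownBits.h"
    using namespace llvm;

    void disjointXorExample() {
      KnownBits Base(/*BitWidth=*/8);
      Base.Zero = APInt(8, 0x0F);             // low nibble known to be zero
      APInt Const(8, 0x13);                   // 0b0001'0011
      APInt Disjoint = Const & Base.Zero;     // 0b0000'0011 -> extractable offset
      APInt NonDisjoint = Const & ~Disjoint;  // 0b0001'0000 -> stays in the xor
      // For any base value with a zero low nibble:
      //   (base ^ 0x13) == (base ^ 0x10) + 0x03
      (void)NonDisjoint;
    }
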
/// A helper function to check if reassociating through an entry in the user
/// chain would invalidate the GEP's nuw flag.
static bool allowsPreservingNUW(const User *U) {
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index f6959ca2..9b40fc0 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -2144,23 +2144,9 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName,
bool CurrentLoopValid, bool PartiallyInvariant,
bool InjectedCondition, ArrayRef<Loop *> NewLoops) {
- auto RecordLoopAsUnswitched = [&](Loop *TargetLoop, StringRef Tag,
- StringRef DisableTag) {
- auto &Ctx = TargetLoop->getHeader()->getContext();
- MDNode *DisableMD = MDNode::get(Ctx, MDString::get(Ctx, DisableTag));
- MDNode *NewLoopID = makePostTransformationMetadata(
- Ctx, TargetLoop->getLoopID(), {Tag}, {DisableMD});
- TargetLoop->setLoopID(NewLoopID);
- };
-
- // If we performed a non-trivial unswitch, we have added new cloned loops.
- // Mark such newly-created loops as visited.
- if (!NewLoops.empty()) {
- for (Loop *NL : NewLoops)
- RecordLoopAsUnswitched(NL, "llvm.loop.unswitch.nontrivial",
- "llvm.loop.unswitch.nontrivial.disable");
+ // If we did a non-trivial unswitch, we have added new (cloned) loops.
+ if (!NewLoops.empty())
U.addSiblingLoops(NewLoops);
- }
// If the current loop remains valid, we should revisit it to catch any
// other unswitch opportunities. Otherwise, we need to mark it as deleted.
@@ -2168,12 +2154,24 @@ void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName,
if (PartiallyInvariant) {
// Mark the new loop as partially unswitched, to avoid unswitching on
// the same condition again.
- RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.partial",
- "llvm.loop.unswitch.partial.disable");
+ auto &Context = L.getHeader()->getContext();
+ MDNode *DisableUnswitchMD = MDNode::get(
+ Context,
+ MDString::get(Context, "llvm.loop.unswitch.partial.disable"));
+ MDNode *NewLoopID = makePostTransformationMetadata(
+ Context, L.getLoopID(), {"llvm.loop.unswitch.partial"},
+ {DisableUnswitchMD});
+ L.setLoopID(NewLoopID);
} else if (InjectedCondition) {
// Do the same for injection of invariant conditions.
- RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.injection",
- "llvm.loop.unswitch.injection.disable");
+ auto &Context = L.getHeader()->getContext();
+ MDNode *DisableUnswitchMD = MDNode::get(
+ Context,
+ MDString::get(Context, "llvm.loop.unswitch.injection.disable"));
+ MDNode *NewLoopID = makePostTransformationMetadata(
+ Context, L.getLoopID(), {"llvm.loop.unswitch.injection"},
+ {DisableUnswitchMD});
+ L.setLoopID(NewLoopID);
} else
U.revisitCurrentLoop();
} else
@@ -2811,9 +2809,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
}
/// Cost multiplier is a way to limit potentially exponential behavior
-/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch
-/// candidates available. Also consider the number of "sibling" loops with
-/// the idea of accounting for previous unswitches that already happened on this
+/// of loop-unswitch. Cost is multiplied in proportion to 2^number of unswitch
+/// candidates available. The number of "sibling" loops is also considered, with
+/// the idea of accounting for previous unswitches that already happened on this
/// cluster of loops. There was an attempt to keep this formula simple,
/// just enough to limit the worst case behavior. Even if it is not that simple
/// now it is still not an attempt to provide a detailed heuristic size
@@ -3509,9 +3507,8 @@ static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates;
IVConditionInfo PartialIVInfo;
Instruction *PartialIVCondBranch = nullptr;
- if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.nontrivial.disable"))
- collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo,
- PartialIVCondBranch, L, LI, AA, MSSAU);
+ collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo,
+ PartialIVCondBranch, L, LI, AA, MSSAU);
if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.injection.disable"))
collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo,
PartialIVCondBranch, L, DT, LI, AA,
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 44e63a0..b17dcb78 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -328,7 +328,7 @@ class StructurizeCFG {
void addPhiValues(BasicBlock *From, BasicBlock *To);
void findUndefBlocks(BasicBlock *PHIBlock,
- const SmallSet<BasicBlock *, 8> &Incomings,
+ const SmallPtrSet<BasicBlock *, 8> &Incomings,
SmallVector<BasicBlock *> &UndefBlks) const;
void mergeIfCompatible(EquivalenceClasses<PHINode *> &PhiClasses, PHINode *A,
@@ -762,7 +762,7 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
/// from some blocks as undefined. The function will find out all such blocks
/// and return in \p UndefBlks.
void StructurizeCFG::findUndefBlocks(
- BasicBlock *PHIBlock, const SmallSet<BasicBlock *, 8> &Incomings,
+ BasicBlock *PHIBlock, const SmallPtrSet<BasicBlock *, 8> &Incomings,
SmallVector<BasicBlock *> &UndefBlks) const {
// We may get a post-structured CFG like below:
//
@@ -788,7 +788,7 @@ void StructurizeCFG::findUndefBlocks(
// path N->F2->F3->B. For example, the threads take the branch F1->N may
// always take the branch F2->P2. So, when we are reconstructing a PHI
// originally in B, we can safely say the incoming value from N is undefined.
- SmallSet<BasicBlock *, 8> VisitedBlock;
+ SmallPtrSet<BasicBlock *, 8> VisitedBlock;
SmallVector<BasicBlock *, 8> Stack;
if (PHIBlock == ParentRegion->getExit()) {
for (auto P : predecessors(PHIBlock)) {
@@ -884,7 +884,7 @@ void StructurizeCFG::setPhiValues() {
PhiMap &BlkPhis = OldPhiIt->second;
SmallVector<BasicBlock *> &UndefBlks = UndefBlksMap[To];
- SmallSet<BasicBlock *, 8> Incomings;
+ SmallPtrSet<BasicBlock *, 8> Incomings;
// Get the undefined blocks shared by all the phi nodes.
if (!BlkPhis.empty()) {
diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index ddd203f3..42b1fdf 100644
--- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -111,15 +111,14 @@ BasicBlock *
llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum,
const CriticalEdgeSplittingOptions &Options,
const Twine &BBName) {
- assert(!isa<IndirectBrInst>(TI) &&
- "Cannot split critical edge from IndirectBrInst");
-
BasicBlock *TIBB = TI->getParent();
BasicBlock *DestBB = TI->getSuccessor(SuccNum);
- // Splitting the critical edge to a pad block is non-trivial. Don't do
- // it in this generic function.
- if (DestBB->isEHPad()) return nullptr;
+ // Splitting the critical edge to a pad block is non-trivial, and we cannot
+ // split a block whose terminator is an IndirectBr. Don't do either in this
+ // generic function.
+ if (DestBB->isEHPad() || isa<IndirectBrInst>(TI))
+ return nullptr;
if (Options.IgnoreUnreachableDests &&
isa<UnreachableInst>(DestBB->getFirstNonPHIOrDbgOrLifetime()))
diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
index 40010ae..8044f61 100644
--- a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
+++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
@@ -193,7 +193,7 @@ bool CanonicalizeFreezeInLoopsImpl::run() {
if (Candidates.empty())
return false;
- SmallSet<PHINode *, 8> ProcessedPHIs;
+ SmallPtrSet<PHINode *, 8> ProcessedPHIs;
for (const auto &Info : Candidates) {
PHINode *PHI = Info.PHI;
if (!ProcessedPHIs.insert(Info.PHI).second)
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 7a9dd37..bbd1ed6 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1099,7 +1099,7 @@ static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks,
// Get the memory operand of the lifetime marker. If the underlying
// object is a sunk alloca, or is otherwise defined in the extraction
// region, the lifetime marker must not be erased.
- Value *Mem = II->getOperand(1)->stripInBoundsOffsets();
+ Value *Mem = II->getOperand(0);
if (SunkAllocas.count(Mem) || definedInRegion(Blocks, Mem))
continue;
@@ -1115,8 +1115,6 @@ static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks,
static void insertLifetimeMarkersSurroundingCall(
Module *M, ArrayRef<Value *> LifetimesStart, ArrayRef<Value *> LifetimesEnd,
CallInst *TheCall) {
- LLVMContext &Ctx = M->getContext();
- auto NegativeOne = ConstantInt::getSigned(Type::getInt64Ty(Ctx), -1);
Instruction *Term = TheCall->getParent()->getTerminator();
// Emit lifetime markers for the pointers given in \p Objects. Insert the
@@ -1130,7 +1128,7 @@ static void insertLifetimeMarkersSurroundingCall(
Function *Func =
Intrinsic::getOrInsertDeclaration(M, MarkerFunc, Mem->getType());
- auto Marker = CallInst::Create(Func, {NegativeOne, Mem});
+ auto Marker = CallInst::Create(Func, Mem);
if (InsertBefore)
Marker->insertBefore(TheCall->getIterator());
else
diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
index 4b0065d..8954de6 100644
--- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
@@ -276,7 +276,7 @@ std::pair<BasicBlock *, bool> ControlFlowHub::finalize(
DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
const StringRef Prefix, std::optional<unsigned> MaxControlFlowBooleans) {
#ifndef NDEBUG
- SmallSet<BasicBlock *, 8> Incoming;
+ SmallPtrSet<BasicBlock *, 8> Incoming;
#endif
SetVector<BasicBlock *> Outgoing;
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index 291e2a5..7063cde 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -706,6 +706,15 @@ bool llvm::checkDebugInfoMetadata(Module &M,
DILocsBefore, DILocsAfter, InstToDelete, NameOfWrappedPass,
FileNameFromCU, ShouldWriteIntoJSON, Bugs);
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
+ // If we are tracking DebugLoc coverage, replace each empty DebugLoc with an
+ // annotated location now so that it does not show up in future passes even if
+ // it is propagated to other instructions.
+ for (auto &L : DILocsAfter)
+ if (!L.second)
+ const_cast<Instruction *>(L.first)->setDebugLoc(DebugLoc::getUnknown());
+#endif
+
bool ResultForVars = checkVars(DIVarsBefore, DIVarsAfter, NameOfWrappedPass,
FileNameFromCU, ShouldWriteIntoJSON, Bugs);
diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
index 540039b..0642d51 100644
--- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
+++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
@@ -30,7 +30,7 @@ PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M,
FunctionType *FuncTy =
FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true);
- const char *FuncName = RTLCI.getLibcallImplName(Impl);
+ StringRef FuncName = RTLCI.getLibcallImplName(Impl);
M.getOrInsertFunction(FuncName, FuncTy);
}
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 59a47a9..f49fbf8 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -77,7 +77,6 @@
#include <cstdint>
#include <deque>
#include <iterator>
-#include <limits>
#include <optional>
#include <string>
#include <utility>
@@ -3004,31 +3003,11 @@ void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI,
if (hasLifetimeMarkers(AI))
continue;
- // Try to determine the size of the allocation.
- ConstantInt *AllocaSize = nullptr;
- if (ConstantInt *AIArraySize =
- dyn_cast<ConstantInt>(AI->getArraySize())) {
- auto &DL = Caller->getDataLayout();
- Type *AllocaType = AI->getAllocatedType();
- TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType);
- uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
-
- // Don't add markers for zero-sized allocas.
- if (AllocaArraySize == 0)
- continue;
-
- // Check that array size doesn't saturate uint64_t and doesn't
- // overflow when it's multiplied by type size.
- if (!AllocaTypeSize.isScalable() &&
- AllocaArraySize != std::numeric_limits<uint64_t>::max() &&
- std::numeric_limits<uint64_t>::max() / AllocaArraySize >=
- AllocaTypeSize.getFixedValue()) {
- AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
- AllocaArraySize * AllocaTypeSize);
- }
- }
+ std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout());
+ if (Size && Size->isZero())
+ continue;
- builder.CreateLifetimeStart(AI, AllocaSize);
+ builder.CreateLifetimeStart(AI);
for (ReturnInst *RI : Returns) {
// Don't insert llvm.lifetime.end calls between a musttail or deoptimize
// call and a return. The return kills all local allocas.
@@ -3038,7 +3017,7 @@ void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI,
if (InlinedDeoptimizeCalls &&
RI->getParent()->getTerminatingDeoptimizeCall())
continue;
- IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize);
+ IRBuilder<>(RI).CreateLifetimeEnd(AI);
}
}
}
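
Since the markers no longer carry a size, the only remaining size question in the hunk above is whether the alloca is statically zero-sized. A minimal sketch of that check (the helper name is invented):

    #include "llvm/IR/Instructions.h"
    #include <optional>
    using namespace llvm;

    // Only allocas whose size is statically known to be zero skip the markers;
    // unknown or scalable sizes still get lifetime annotations.
    static bool skipsLifetimeMarkers(const AllocaInst *AI) {
      std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout());
      return Size && Size->isZero();
    }
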
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 2619e73..ac34490 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -275,7 +275,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
Builder.CreateBr(TheOnlyDest);
BasicBlock *BB = SI->getParent();
- SmallSet<BasicBlock *, 8> RemovedSuccessors;
+ SmallPtrSet<BasicBlock *, 8> RemovedSuccessors;
// Remove entries from PHI nodes which we no longer branch to...
BasicBlock *SuccToKeep = TheOnlyDest;
@@ -343,7 +343,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
if (auto *BA =
dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {
BasicBlock *TheOnlyDest = BA->getBasicBlock();
- SmallSet<BasicBlock *, 8> RemovedSuccessors;
+ SmallPtrSet<BasicBlock *, 8> RemovedSuccessors;
// Insert the new branch.
Builder.CreateBr(TheOnlyDest);
@@ -481,7 +481,7 @@ bool llvm::wouldInstructionBeTriviallyDead(const Instruction *I,
return true;
if (II->isLifetimeStartOrEnd()) {
- auto *Arg = II->getArgOperand(1);
+ auto *Arg = II->getArgOperand(0);
if (isa<PoisonValue>(Arg))
return true;
@@ -2518,7 +2518,7 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA,
if (MSSAU)
MSSAU->changeToUnreachable(I);
- SmallSet<BasicBlock *, 8> UniqueSuccessors;
+ SmallPtrSet<BasicBlock *, 8> UniqueSuccessors;
// Loop over all of the successors, removing BB's entry from any PHI
// nodes.
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index e7623aa..2d830f3 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -914,6 +914,8 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
switch (RK) {
default:
llvm_unreachable("Unexpected recurrence kind");
+ case RecurKind::AddChainWithSubs:
+ case RecurKind::Sub:
case RecurKind::Add:
return Intrinsic::vector_reduce_add;
case RecurKind::Mul:
@@ -1301,6 +1303,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
Builder.getFastMathFlags());
};
switch (RdxKind) {
+ case RecurKind::AddChainWithSubs:
+ case RecurKind::Sub:
case RecurKind::Add:
case RecurKind::Mul:
case RecurKind::And:
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 472c03f..1f59b17 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -155,7 +155,7 @@ void StackInfoBuilder::visit(OptimizationRemarkEmitter &ORE,
return;
}
if (auto *II = dyn_cast<LifetimeIntrinsic>(&Inst)) {
- AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(1));
+ AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(0));
if (!AI ||
getAllocaInterestingness(*AI) != AllocaInterestingness::kInteresting)
return;
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index b22ecbc..978d5a2 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -20,7 +20,6 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -291,6 +290,11 @@ void PredicateInfoBuilder::convertUsesToDFSOrdered(
Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
for (auto &U : Op->uses()) {
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ // Lifetime intrinsics must work directly on the alloca; do not replace
+ // their operand with a predicated copy.
+ if (I->isLifetimeStartOrEnd())
+ continue;
+
ValueDFS VD;
// Put the phi node uses in the incoming block.
BasicBlock *IBlock;
@@ -370,6 +374,8 @@ void PredicateInfoBuilder::processAssume(
Values.push_back(Cond);
if (auto *Cmp = dyn_cast<CmpInst>(Cond))
collectCmpOps(Cmp, Values);
+ else if (match(Cond, m_NUWTrunc(m_Value(Op0))))
+ Values.push_back(Op0);
for (Value *V : Values) {
if (shouldRename(V)) {
@@ -416,6 +422,8 @@ void PredicateInfoBuilder::processBranch(
Values.push_back(Cond);
if (auto *Cmp = dyn_cast<CmpInst>(Cond))
collectCmpOps(Cmp, Values);
+ else if (match(Cond, m_NUWTrunc(m_Value(Op0))))
+ Values.push_back(Op0);
for (Value *V : Values) {
if (shouldRename(V)) {
@@ -506,23 +514,10 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
ValInfo->RenamedOp = (RenameStack.end() - Start) == RenameStack.begin()
? OrigOp
: (RenameStack.end() - Start - 1)->Def;
- auto CreateSSACopy = [this](IRBuilderBase &B, Value *Op,
- const Twine &Name = "") {
- auto It = PI.DeclarationCache.try_emplace(Op->getType());
- if (It.second) {
- // The number of named values is used to detect if a new declaration
- // was added. If so, that declaration is tracked so that it can be
- // removed when the analysis is done. The corner case were a new
- // declaration results in a name clash and the old name being renamed
- // is not considered as that represents an invalid module.
- auto NumDecls = F.getParent()->getNumNamedValues();
- Function *IF = Intrinsic::getOrInsertDeclaration(
- F.getParent(), Intrinsic::ssa_copy, Op->getType());
- if (NumDecls != F.getParent()->getNumNamedValues())
- PI.CreatedDeclarations.insert(IF);
- It.first->second = IF;
- }
- return B.CreateCall(It.first->second, Op, Name);
+ auto CreateSSACopy = [](Instruction *InsertPt, Value *Op,
+ const Twine &Name = "") {
+ // Use a no-op bitcast to represent an ssa_copy.
+ return new BitCastInst(Op, Op->getType(), Name, InsertPt->getIterator());
};
// For edge predicates, we can just place the operand in the block before
// the terminator. For assume, we have to place it right after the assume
@@ -530,9 +525,8 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
// right before the terminator or after the assume, so that we insert in
// proper order in the case of multiple predicateinfo in the same block.
if (isa<PredicateWithEdge>(ValInfo)) {
- IRBuilder<> B(getBranchTerminator(ValInfo));
- CallInst *PIC =
- CreateSSACopy(B, Op, Op->getName() + "." + Twine(Counter++));
+ BitCastInst *PIC = CreateSSACopy(getBranchTerminator(ValInfo), Op,
+ Op->getName() + "." + Twine(Counter++));
PI.PredicateMap.insert({PIC, ValInfo});
Result.Def = PIC;
} else {
@@ -541,8 +535,7 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
"Should not have gotten here without it being an assume");
// Insert the predicate directly after the assume. While it also holds
// directly before it, assume(i1 true) is not a useful fact.
- IRBuilder<> B(PAssume->AssumeInst->getNextNode());
- CallInst *PIC = CreateSSACopy(B, Op);
+ BitCastInst *PIC = CreateSSACopy(PAssume->AssumeInst->getNextNode(), Op);
PI.PredicateMap.insert({PIC, ValInfo});
Result.Def = PIC;
}
@@ -710,23 +703,6 @@ PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
Builder.buildPredicateInfo();
}
-// Remove all declarations we created . The PredicateInfo consumers are
-// responsible for remove the ssa_copy calls created.
-PredicateInfo::~PredicateInfo() {
- // Collect function pointers in set first, as SmallSet uses a SmallVector
- // internally and we have to remove the asserting value handles first.
- SmallPtrSet<Function *, 20> FunctionPtrs;
- for (const auto &F : CreatedDeclarations)
- FunctionPtrs.insert(&*F);
- CreatedDeclarations.clear();
-
- for (Function *F : FunctionPtrs) {
- assert(F->users().empty() &&
- "PredicateInfo consumer did not remove all SSA copies.");
- F->eraseFromParent();
- }
-}
-
std::optional<PredicateConstraint> PredicateBase::getConstraint() const {
switch (Type) {
case PT_Assume:
@@ -741,6 +717,11 @@ std::optional<PredicateConstraint> PredicateBase::getConstraint() const {
: ConstantInt::getFalse(Condition->getType())}};
}
+ if (match(Condition, m_NUWTrunc(m_Specific(RenamedOp)))) {
+ return {{TrueEdge ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ,
+ ConstantInt::getNullValue(RenamedOp->getType())}};
+ }
+
CmpInst *Cmp = dyn_cast<CmpInst>(Condition);
if (!Cmp) {
// TODO: Make this an assertion once RenamedOp is fully accurate.
@@ -779,15 +760,16 @@ std::optional<PredicateConstraint> PredicateBase::getConstraint() const {
void PredicateInfo::verifyPredicateInfo() const {}
-// Replace ssa_copy calls created by PredicateInfo with their operand.
+// Replace bitcasts created by PredicateInfo with their operand.
static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) {
for (Instruction &Inst : llvm::make_early_inc_range(instructions(F))) {
const auto *PI = PredInfo.getPredicateInfoFor(&Inst);
- auto *II = dyn_cast<IntrinsicInst>(&Inst);
- if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy)
+ if (!PI)
continue;
- Inst.replaceAllUsesWith(II->getOperand(0));
+ assert(isa<BitCastInst>(Inst) &&
+ Inst.getType() == Inst.getOperand(0)->getType());
+ Inst.replaceAllUsesWith(Inst.getOperand(0));
Inst.eraseFromParent();
}
}
diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
index 0ffea3f..41647f7 100644
--- a/llvm/lib/Transforms/Utils/ProfileVerify.cpp
+++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
@@ -8,10 +8,8 @@
#include "llvm/Transforms/Utils/ProfileVerify.h"
#include "llvm/ADT/DynamicAPInt.h"
-#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Analysis.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index d96f1d6..10c162b 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -136,7 +136,7 @@ public:
/// \p ToDelete that stores to this alloca.
void updateForDeletedStore(
StoreInst *ToDelete, DIBuilder &DIB,
- SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) const {
+ SmallPtrSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) const {
// There's nothing to do if the alloca doesn't have any variables using
// assignment tracking.
if (DVRAssigns.empty())
@@ -382,7 +382,7 @@ struct PromoteMem2Reg {
SmallVector<AssignmentTrackingInfo, 8> AllocaATInfo;
/// A set of dbg.assigns to delete because they've been demoted to
/// dbg.values. Call cleanUpDbgAssigns to delete them.
- SmallSet<DbgVariableRecord *, 8> DVRAssignsToDelete;
+ SmallPtrSet<DbgVariableRecord *, 8> DVRAssignsToDelete;
/// The set of basic blocks the renamer has already visited.
BitVector Visited;
@@ -533,11 +533,10 @@ static void removeIntrinsicUsers(AllocaInst *AI) {
/// false there were some loads which were not dominated by the single store
/// and thus must be phi-ed with undef. We fall back to the standard alloca
/// promotion algorithm in that case.
-static bool
-rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI,
- const DataLayout &DL, DominatorTree &DT,
- AssumptionCache *AC,
- SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) {
+static bool rewriteSingleStoreAlloca(
+ AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, const DataLayout &DL,
+ DominatorTree &DT, AssumptionCache *AC,
+ SmallPtrSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) {
StoreInst *OnlyStore = Info.OnlyStore;
Value *ReplVal = OnlyStore->getOperand(0);
// Loads may either load the stored value or uninitialized memory (undef).
@@ -647,11 +646,10 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI,
/// use(t);
/// *A = 42;
/// }
-static bool
-promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
- LargeBlockInfo &LBI, const DataLayout &DL,
- DominatorTree &DT, AssumptionCache *AC,
- SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) {
+static bool promoteSingleBlockAlloca(
+ AllocaInst *AI, const AllocaInfo &Info, LargeBlockInfo &LBI,
+ const DataLayout &DL, DominatorTree &DT, AssumptionCache *AC,
+ SmallPtrSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) {
// The trickiest case to handle is when we have large blocks. Because of this,
// this code is optimized assuming that large blocks happen. This does not
// significantly pessimize the small block case. This uses LargeBlockInfo to
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index b78c702..8448517 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -777,10 +777,10 @@ public:
for (BasicBlock &BB : F) {
for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
- if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) {
- if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
+ if (auto *BC = dyn_cast<BitCastInst>(&Inst)) {
+ if (BC->getType() == BC->getOperand(0)->getType()) {
if (It->second->getPredicateInfoFor(&Inst)) {
- Value *Op = II->getOperand(0);
+ Value *Op = BC->getOperand(0);
Inst.replaceAllUsesWith(Op);
Inst.eraseFromParent();
}
@@ -1413,6 +1413,15 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) {
if (ValueState[&I].isOverdefined())
return;
+ if (auto *BC = dyn_cast<BitCastInst>(&I)) {
+ if (BC->getType() == BC->getOperand(0)->getType()) {
+ if (const PredicateBase *PI = getPredicateInfoFor(&I)) {
+ handlePredicate(&I, I.getOperand(0), PI);
+ return;
+ }
+ }
+ }
+
ValueLatticeElement OpSt = getValueState(I.getOperand(0));
if (OpSt.isUnknownOrUndef())
return;
@@ -1433,8 +1442,12 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) {
OpSt.asConstantRange(I.getSrcTy(), /*UndefAllowed=*/false);
Type *DestTy = I.getDestTy();
- ConstantRange Res =
- OpRange.castOp(I.getOpcode(), DestTy->getScalarSizeInBits());
+ ConstantRange Res = ConstantRange::getEmpty(DestTy->getScalarSizeInBits());
+ if (auto *Trunc = dyn_cast<TruncInst>(&I))
+ Res = OpRange.truncate(DestTy->getScalarSizeInBits(),
+ Trunc->getNoWrapKind());
+ else
+ Res = OpRange.castOp(I.getOpcode(), DestTy->getScalarSizeInBits());
mergeInValue(LV, &I, ValueLatticeElement::getRange(Res));
} else
markOverdefined(&I);
@@ -2001,17 +2014,6 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
Function *F = CB.getCalledFunction();
if (auto *II = dyn_cast<IntrinsicInst>(&CB)) {
- if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
- if (ValueState[&CB].isOverdefined())
- return;
-
- Value *CopyOf = CB.getOperand(0);
- const PredicateBase *PI = getPredicateInfoFor(&CB);
- assert(PI && "Missing predicate info for ssa.copy");
- handlePredicate(&CB, CopyOf, PI);
- return;
- }
-
if (II->getIntrinsicID() == Intrinsic::vscale) {
unsigned BitWidth = CB.getType()->getScalarSizeInBits();
const ConstantRange Result = getVScaleRange(II->getFunction(), BitWidth);
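
The visitCastInst hunk above routes truncates that carry no-wrap flags through the no-wrap-aware ConstantRange::truncate, which can yield a strictly tighter range than the generic castOp path. A minimal standalone sketch (deliberately avoiding LLVM's ConstantRange API) that brute-forces the difference for a 16-bit operand range [250, 260] truncated to 8 bits:

    // Standalone sketch: compare the result sets of a plain truncate versus a
    // truncate known to be nuw, for operand values 250..260 truncated to 8 bits.
    #include <cstdint>
    #include <iostream>
    #include <set>

    int main() {
      std::set<uint8_t> Plain, Nuw;
      for (uint32_t V = 250; V <= 260; ++V) {
        Plain.insert(static_cast<uint8_t>(V)); // plain trunc: wrapped results occur
        if (V <= 0xFF)                         // nuw trunc: only lossless values occur
          Nuw.insert(static_cast<uint8_t>(V));
      }
      // 11 distinct results without the flag, 6 with it: the nuw-aware truncate
      // keeps the lattice value tighter.
      std::cout << Plain.size() << " vs " << Nuw.size() << '\n';
    }
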
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 1eb8996..e218db3 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1346,7 +1346,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
CanonicalIV->insertBefore(Header->begin());
rememberInstruction(CanonicalIV);
- SmallSet<BasicBlock *, 4> PredSeen;
+ SmallPtrSet<BasicBlock *, 4> PredSeen;
Constant *One = ConstantInt::get(Ty, 1);
for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
BasicBlock *HP = *HPI;
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index deabacc..055e8ca 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -291,6 +291,7 @@ class SimplifyCFGOpt {
bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder);
bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder);
bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder);
+ bool foldCondBranchOnValueKnownInPredecessor(BranchInst *BI);
bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
IRBuilder<> &Builder);
@@ -564,6 +565,9 @@ struct ConstantComparesGatherer {
/// Number of comparisons matched in the and/or chain
unsigned UsedICmps = 0;
+ /// True if the comparisons gathered into Vals are equality checks (the
+ /// chain is a logical or); false for inequality checks (a logical and).
+ bool IsEq = false;
+
/// Construct and compute the result for the comparison instruction Cond
ConstantComparesGatherer(Instruction *Cond, const DataLayout &DL) : DL(DL) {
gather(Cond);
@@ -735,23 +739,23 @@ private:
/// vector.
/// One "Extra" case is allowed to differ from the other.
void gather(Value *V) {
- bool isEQ = match(V, m_LogicalOr(m_Value(), m_Value()));
-
+ Value *Op0, *Op1;
+ if (match(V, m_LogicalOr(m_Value(Op0), m_Value(Op1))))
+ IsEq = true;
+ else if (match(V, m_LogicalAnd(m_Value(Op0), m_Value(Op1))))
+ IsEq = false;
+ else
+ return;
// Keep a stack (SmallVector for efficiency) for depth-first traversal
- SmallVector<Value *, 8> DFT;
- SmallPtrSet<Value *, 8> Visited;
-
- // Initialize
- Visited.insert(V);
- DFT.push_back(V);
+ SmallVector<Value *, 8> DFT{Op0, Op1};
+ SmallPtrSet<Value *, 8> Visited{V, Op0, Op1};
while (!DFT.empty()) {
V = DFT.pop_back_val();
if (Instruction *I = dyn_cast<Instruction>(V)) {
// If it is a || (or && depending on isEQ), process the operands.
- Value *Op0, *Op1;
- if (isEQ ? match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1)))
+ if (IsEq ? match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1)))
: match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) {
if (Visited.insert(Op1).second)
DFT.push_back(Op1);
@@ -762,7 +766,7 @@ private:
}
// Try to match the current instruction
- if (matchInstruction(I, isEQ))
+ if (matchInstruction(I, IsEq))
// Match succeeded, continue the loop
continue;
}
@@ -810,11 +814,15 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) {
if (!SI->getParent()->hasNPredecessorsOrMore(128 / SI->getNumSuccessors()))
CV = SI->getCondition();
} else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
- if (BI->isConditional() && BI->getCondition()->hasOneUse())
+ if (BI->isConditional() && BI->getCondition()->hasOneUse()) {
if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
if (ICI->isEquality() && getConstantInt(ICI->getOperand(1), DL))
CV = ICI->getOperand(0);
+ } else if (auto *Trunc = dyn_cast<TruncInst>(BI->getCondition())) {
+ if (Trunc->hasNoUnsignedWrap())
+ CV = Trunc->getOperand(0);
}
+ }
// Unwrap any lossless ptrtoint cast.
if (CV) {
@@ -840,11 +848,20 @@ BasicBlock *SimplifyCFGOpt::getValueEqualityComparisonCases(
}
BranchInst *BI = cast<BranchInst>(TI);
- ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
- BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE);
- Cases.push_back(ValueEqualityComparisonCase(
- getConstantInt(ICI->getOperand(1), DL), Succ));
- return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ);
+ Value *Cond = BI->getCondition();
+ ICmpInst::Predicate Pred;
+ ConstantInt *C;
+ if (auto *ICI = dyn_cast<ICmpInst>(Cond)) {
+ Pred = ICI->getPredicate();
+ C = getConstantInt(ICI->getOperand(1), DL);
+ } else {
+ Pred = ICmpInst::ICMP_NE;
+ auto *Trunc = cast<TruncInst>(Cond);
+ C = ConstantInt::get(cast<IntegerType>(Trunc->getOperand(0)->getType()), 0);
+ }
+ BasicBlock *Succ = BI->getSuccessor(Pred == ICmpInst::ICMP_NE);
+ Cases.push_back(ValueEqualityComparisonCase(C, Succ));
+ return BI->getSuccessor(Pred == ICmpInst::ICMP_EQ);
}
/// Given a vector of bb/value pairs, remove any entries
@@ -1106,7 +1123,10 @@ static void getBranchWeights(Instruction *TI,
// default weight to be the first entry.
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
assert(Weights.size() == 2);
- ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+ auto *ICI = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!ICI)
+ return;
+
if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
std::swap(Weights.front(), Weights.back());
}
@@ -3321,12 +3341,10 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
// %merge = select %cond, %two, %one
// store %merge, %x.dest, !DIAssignID !2
// dbg.assign %merge, "x", ..., !2
- auto replaceVariable = [OrigV, S](auto *DbgAssign) {
+ for (DbgVariableRecord *DbgAssign :
+ at::getDVRAssignmentMarkers(SpeculatedStore))
if (llvm::is_contained(DbgAssign->location_ops(), OrigV))
DbgAssign->replaceVariableLocationOp(OrigV, S);
- };
- for_each(at::getAssignmentMarkers(SpeculatedStore), replaceVariable);
- for_each(at::getDVRAssignmentMarkers(SpeculatedStore), replaceVariable);
}
// Metadata can be dependent on the condition we are hoisting above.
@@ -3655,15 +3673,19 @@ foldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
return false;
}
-static bool foldCondBranchOnValueKnownInPredecessor(BranchInst *BI,
- DomTreeUpdater *DTU,
- const DataLayout &DL,
- AssumptionCache *AC) {
+bool SimplifyCFGOpt::foldCondBranchOnValueKnownInPredecessor(BranchInst *BI) {
+ // Note: If BB is a loop header then there is a risk that threading introduces
+ // a non-canonical loop by moving a back edge. So we avoid this optimization
+ // for loop headers if NeedCanonicalLoop is set.
+ if (Options.NeedCanonicalLoop && is_contained(LoopHeaders, BI->getParent()))
+ return false;
+
std::optional<bool> Result;
bool EverChanged = false;
do {
// Note that None means "we changed things, but recurse further."
- Result = foldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, AC);
+ Result =
+ foldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, Options.AC);
EverChanged |= Result == std::nullopt || *Result;
} while (Result == std::nullopt);
return EverChanged;
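
The driver loop above keeps the existing tri-state protocol: std::nullopt from the impl means "changed something, iterate again", while a definite bool terminates the loop. A small standalone sketch of that protocol, with foldOnceImpl standing in for foldCondBranchOnValueKnownInPredecessorImpl (names are illustrative):

    #include <iostream>
    #include <optional>

    static int Budget = 3;

    static std::optional<bool> foldOnceImpl() {
      if (Budget-- > 0)
        return std::nullopt; // made a change, caller should iterate again
      return false;          // nothing left to do on this pass
    }

    int main() {
      std::optional<bool> Result;
      bool EverChanged = false;
      do {
        Result = foldOnceImpl();
        EverChanged |= Result == std::nullopt || *Result;
      } while (Result == std::nullopt);
      std::cout << "ever changed: " << EverChanged << '\n'; // 1
    }
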
@@ -5084,6 +5106,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
Value *CompVal = ConstantCompare.CompValue;
unsigned UsedICmps = ConstantCompare.UsedICmps;
Value *ExtraCase = ConstantCompare.Extra;
+ bool TrueWhenEqual = ConstantCompare.IsEq;
// If we didn't have a multiply compared value, fail.
if (!CompVal)
@@ -5093,8 +5116,6 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
if (UsedICmps <= 1)
return false;
- bool TrueWhenEqual = match(Cond, m_LogicalOr(m_Value(), m_Value()));
-
// There might be duplicate constants in the list, which the switch
// instruction can't handle, remove them now.
array_pod_sort(Values.begin(), Values.end(), constantIntSortPredicate);
@@ -8085,7 +8106,7 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// If this is a branch on something for which we know the constant value in
// predecessors (e.g. a phi node in the current block), thread control
// through this block.
- if (foldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, Options.AC))
+ if (foldCondBranchOnValueKnownInPredecessor(BI))
return requestResimplify();
// Scan predecessor blocks for conditional branches.
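
The isValueEqualityComparison and getValueEqualityComparisonCases hunks above additionally accept a conditional branch on `trunc nuw %x to i1`, treating it as the comparison `%x != 0` against a zero constant. A standalone sketch that brute-forces the equivalence over all 8-bit values satisfying the nuw precondition (the truncation drops no set bits):

    #include <cassert>
    #include <cstdint>
    #include <iostream>

    int main() {
      for (unsigned X = 0; X < 256; ++X) {
        bool TruncBit = X & 1;          // trunc i8 X to i1
        bool Lossless = (X >> 1) == 0;  // nuw: no bits are dropped
        if (Lossless)
          assert(TruncBit == (X != 0)); // same successor is taken either way
      }
      std::cout << "trunc-nuw-to-i1 matches X != 0 on all lossless inputs\n";
    }
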
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 737321d..2d6a748 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/DataLayout.h"
@@ -319,10 +320,10 @@ static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> A
annotateDereferenceableBytes(CI, ArgNos, LenC->getZExtValue());
} else if (isKnownNonZero(Size, DL)) {
annotateNonNullNoUndefBasedOnAccess(CI, ArgNos);
- const APInt *X, *Y;
+ uint64_t X, Y;
uint64_t DerefMin = 1;
- if (match(Size, m_Select(m_Value(), m_APInt(X), m_APInt(Y)))) {
- DerefMin = std::min(X->getZExtValue(), Y->getZExtValue());
+ if (match(Size, m_Select(m_Value(), m_ConstantInt(X), m_ConstantInt(Y)))) {
+ DerefMin = std::min(X, Y);
annotateDereferenceableBytes(CI, ArgNos, DerefMin);
}
}
@@ -977,8 +978,14 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B,
// it's not very useful because calling strlen for a pointer of other types is
// very uncommon.
if (GEPOperator *GEP = dyn_cast<GEPOperator>(Src)) {
- // TODO: Handle subobjects.
- if (!isGEPBasedOnPointerToString(GEP, CharSize))
+ unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+ SmallMapVector<Value *, APInt, 4> VarOffsets;
+ APInt ConstOffset(BW, 0);
+ assert(CharSize % 8 == 0 && "Expected CharSize to be a multiple of 8");
+ // Check that the GEP reduces to a single variable offset scaled by the
+ // character width, with no constant displacement.
+ if (!GEP->collectOffset(DL, BW, VarOffsets, ConstOffset) ||
+ VarOffsets.size() != 1 || ConstOffset != 0 ||
+ VarOffsets.begin()->second != CharSize / 8)
return nullptr;
ConstantDataArraySlice Slice;
@@ -1000,10 +1007,8 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B,
return nullptr;
}
- Value *Offset = GEP->getOperand(2);
+ Value *Offset = VarOffsets.begin()->first;
KnownBits Known = computeKnownBits(Offset, DL, nullptr, CI, nullptr);
- uint64_t ArrSize =
- cast<ArrayType>(GEP->getSourceElementType())->getNumElements();
// If Offset is not provably in the range [0, NullTermIdx], we can still
// optimize if we can prove that the program has undefined behavior when
@@ -1011,7 +1016,7 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B,
// is a pointer to an object whose memory extent is NullTermIdx+1.
if ((Known.isNonNegative() && Known.getMaxValue().ule(NullTermIdx)) ||
(isa<GlobalVariable>(GEP->getOperand(0)) &&
- NullTermIdx == ArrSize - 1)) {
+ NullTermIdx == Slice.Length - 1)) {
Offset = B.CreateSExtOrTrunc(Offset, CI->getType());
return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx),
Offset);
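
The optimizeStringLength hunk above replaces the fixed GEP-shape check with GEPOperator::collectOffset, accepting any GEP that decomposes into exactly one variable index scaled by the character width and no constant displacement. A standalone sketch of that acceptance test over an already-decomposed (variable offsets, constant offset) pair; isSingleCharIndexGEP and the sample operands are illustrative, not LLVM API:

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>

    static bool isSingleCharIndexGEP(const std::map<std::string, int64_t> &VarOffsets,
                                     int64_t ConstOffset, unsigned CharSizeBits) {
      // Exactly one variable index, no constant displacement, and the index must
      // be scaled by the character width in bytes.
      return VarOffsets.size() == 1 && ConstOffset == 0 &&
             VarOffsets.begin()->second == CharSizeBits / 8;
    }

    int main() {
      // gep i8, ptr %s, i64 %i              -> accepted for 8-bit characters
      std::cout << isSingleCharIndexGEP({{"%i", 1}}, 0, 8) << '\n'; // 1
      // gep [8 x i8], ptr %s, i64 0, i64 %i -> reduces to the same form
      std::cout << isSingleCharIndexGEP({{"%i", 1}}, 0, 8) << '\n'; // 1
      // gep i32, ptr %s, i64 %i             -> wrong scale for 8-bit characters
      std::cout << isSingleCharIndexGEP({{"%i", 4}}, 0, 8) << '\n'; // 0
      // gep i8, ptr %s, i64 5               -> constant offset only, rejected here
      std::cout << isSingleCharIndexGEP({}, 5, 8) << '\n';          // 0
    }
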
diff --git a/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp b/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp
index 6b18ece..c3ac39e 100644
--- a/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp
+++ b/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp
@@ -12,7 +12,6 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
@@ -21,7 +20,6 @@
#include "llvm/Transforms/Utils/Cloning.h"
#include <map>
-#include <string>
#include <utility>
using namespace llvm;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index c47fd942..789047a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -793,280 +793,296 @@ static bool canWidenCallReturnType(Type *Ty) {
}
bool LoopVectorizationLegality::canVectorizeInstrs() {
- BasicBlock *Header = TheLoop->getHeader();
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ bool Result = true;
// For each block in the loop.
for (BasicBlock *BB : TheLoop->blocks()) {
// Scan the instructions in the block and look for hazards.
for (Instruction &I : *BB) {
- if (auto *Phi = dyn_cast<PHINode>(&I)) {
- Type *PhiTy = Phi->getType();
- // Check that this PHI type is allowed.
- if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
- !PhiTy->isPointerTy()) {
- reportVectorizationFailure("Found a non-int non-pointer PHI",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- return false;
- }
+ Result &= canVectorizeInstr(I);
+ if (!DoExtraAnalysis && !Result)
+ return false;
+ }
+ }
- // If this PHINode is not in the header block, then we know that we
- // can convert it to select during if-conversion. No need to check if
- // the PHIs in this block are induction or reduction variables.
- if (BB != Header) {
- // Non-header phi nodes that have outside uses can be vectorized. Add
- // them to the list of allowed exits.
- // Unsafe cyclic dependencies with header phis are identified during
- // legalization for reduction, induction and fixed order
- // recurrences.
- AllowedExit.insert(&I);
- continue;
- }
+ if (!PrimaryInduction) {
+ if (Inductions.empty()) {
+ reportVectorizationFailure(
+ "Did not find one integer induction var",
+ "loop induction variable could not be identified",
+ "NoInductionVariable", ORE, TheLoop);
+ return false;
+ }
+ if (!WidestIndTy) {
+ reportVectorizationFailure(
+ "Did not find one integer induction var",
+ "integer loop induction variable could not be identified",
+ "NoIntegerInductionVariable", ORE, TheLoop);
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ }
- // We only allow if-converted PHIs with exactly two incoming values.
- if (Phi->getNumIncomingValues() != 2) {
- reportVectorizationFailure("Found an invalid PHI",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop, Phi);
- return false;
- }
+ // Now we know the widest induction type, check if our found induction
+ // is the same size. If it's not, unset it here and InnerLoopVectorizer
+ // will create another.
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
+ PrimaryInduction = nullptr;
- RecurrenceDescriptor RedDes;
- if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
- DT, PSE.getSE())) {
- Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
- AllowedExit.insert(RedDes.getLoopExitInstr());
- Reductions[Phi] = RedDes;
- continue;
- }
+ return Result;
+}
- // We prevent matching non-constant strided pointer IVS to preserve
- // historical vectorizer behavior after a generalization of the
- // IVDescriptor code. The intent is to remove this check, but we
- // have to fix issues around code quality for such loops first.
- auto IsDisallowedStridedPointerInduction =
- [](const InductionDescriptor &ID) {
- if (AllowStridedPointerIVs)
- return false;
- return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
- ID.getConstIntStepValue() == nullptr;
- };
-
- // TODO: Instead of recording the AllowedExit, it would be good to
- // record the complementary set: NotAllowedExit. These include (but may
- // not be limited to):
- // 1. Reduction phis as they represent the one-before-last value, which
- // is not available when vectorized
- // 2. Induction phis and increment when SCEV predicates cannot be used
- // outside the loop - see addInductionPhi
- // 3. Non-Phis with outside uses when SCEV predicates cannot be used
- // outside the loop - see call to hasOutsideLoopUser in the non-phi
- // handling below
- // 4. FixedOrderRecurrence phis that can possibly be handled by
- // extraction.
- // By recording these, we can then reason about ways to vectorize each
- // of these NotAllowedExit.
- InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
- !IsDisallowedStridedPointerInduction(ID)) {
- addInductionPhi(Phi, ID, AllowedExit);
- Requirements->addExactFPMathInst(ID.getExactFPMathInst());
- continue;
- }
+bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) {
+ BasicBlock *BB = I.getParent();
+ BasicBlock *Header = TheLoop->getHeader();
- if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
- AllowedExit.insert(Phi);
- FixedOrderRecurrences.insert(Phi);
- continue;
- }
+ if (auto *Phi = dyn_cast<PHINode>(&I)) {
+ Type *PhiTy = Phi->getType();
+ // Check that this PHI type is allowed.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy()) {
+ reportVectorizationFailure(
+ "Found a non-int non-pointer PHI",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ return false;
+ }
- // As a last resort, coerce the PHI to a AddRec expression
- // and re-try classifying it a an induction PHI.
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
- !IsDisallowedStridedPointerInduction(ID)) {
- addInductionPhi(Phi, ID, AllowedExit);
- continue;
- }
+ // If this PHINode is not in the header block, then we know that we
+ // can convert it to select during if-conversion. No need to check if
+ // the PHIs in this block are induction or reduction variables.
+ if (BB != Header) {
+ // Non-header phi nodes that have outside uses can be vectorized. Add
+ // them to the list of allowed exits.
+ // Unsafe cyclic dependencies with header phis are identified during
+ // legalization for reduction, induction and fixed order
+ // recurrences.
+ AllowedExit.insert(&I);
+ return true;
+ }
- reportVectorizationFailure("Found an unidentified PHI",
- "value that could not be identified as "
- "reduction is used outside the loop",
- "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
- return false;
- } // end of PHI handling
-
- // We handle calls that:
- // * Have a mapping to an IR intrinsic.
- // * Have a vector version available.
- auto *CI = dyn_cast<CallInst>(&I);
-
- if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
- !(CI->getCalledFunction() && TLI &&
- (!VFDatabase::getMappings(*CI).empty() ||
- isTLIScalarize(*TLI, *CI)))) {
- // If the call is a recognized math libary call, it is likely that
- // we can vectorize it given loosened floating-point constraints.
- LibFunc Func;
- bool IsMathLibCall =
- TLI && CI->getCalledFunction() &&
- CI->getType()->isFloatingPointTy() &&
- TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
- TLI->hasOptimizedCodeGen(Func);
-
- if (IsMathLibCall) {
- // TODO: Ideally, we should not use clang-specific language here,
- // but it's hard to provide meaningful yet generic advice.
- // Also, should this be guarded by allowExtraAnalysis() and/or be part
- // of the returned info from isFunctionVectorizable()?
- reportVectorizationFailure(
- "Found a non-intrinsic callsite",
- "library call cannot be vectorized. "
- "Try compiling with -fno-math-errno, -ffast-math, "
- "or similar flags",
- "CantVectorizeLibcall", ORE, TheLoop, CI);
- } else {
- reportVectorizationFailure("Found a non-intrinsic callsite",
- "call instruction cannot be vectorized",
- "CantVectorizeLibcall", ORE, TheLoop, CI);
- }
- return false;
- }
+ // We only allow if-converted PHIs with exactly two incoming values.
+ if (Phi->getNumIncomingValues() != 2) {
+ reportVectorizationFailure(
+ "Found an invalid PHI",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop, Phi);
+ return false;
+ }
- // Some intrinsics have scalar arguments and should be same in order for
- // them to be vectorized (i.e. loop invariant).
- if (CI) {
- auto *SE = PSE.getSE();
- Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
- for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
- if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) {
- if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)),
- TheLoop)) {
- reportVectorizationFailure("Found unvectorizable intrinsic",
- "intrinsic instruction cannot be vectorized",
- "CantVectorizeIntrinsic", ORE, TheLoop, CI);
- return false;
- }
- }
- }
+ RecurrenceDescriptor RedDes;
+ if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, DT,
+ PSE.getSE())) {
+ Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
+ AllowedExit.insert(RedDes.getLoopExitInstr());
+ Reductions[Phi] = RedDes;
+ return true;
+ }
- // If we found a vectorized variant of a function, note that so LV can
- // make better decisions about maximum VF.
- if (CI && !VFDatabase::getMappings(*CI).empty())
- VecCallVariantsFound = true;
-
- auto CanWidenInstructionTy = [](Instruction const &Inst) {
- Type *InstTy = Inst.getType();
- if (!isa<StructType>(InstTy))
- return canVectorizeTy(InstTy);
-
- // For now, we only recognize struct values returned from calls where
- // all users are extractvalue as vectorizable. All element types of the
- // struct must be types that can be widened.
- return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
- all_of(Inst.users(), IsaPred<ExtractValueInst>);
- };
+ // We prevent matching non-constant strided pointer IVs to preserve
+ // historical vectorizer behavior after a generalization of the
+ // IVDescriptor code. The intent is to remove this check, but we
+ // have to fix issues around code quality for such loops first.
+ auto IsDisallowedStridedPointerInduction =
+ [](const InductionDescriptor &ID) {
+ if (AllowStridedPointerIVs)
+ return false;
+ return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
+ ID.getConstIntStepValue() == nullptr;
+ };
+
+ // TODO: Instead of recording the AllowedExit, it would be good to
+ // record the complementary set: NotAllowedExit. These include (but may
+ // not be limited to):
+ // 1. Reduction phis as they represent the one-before-last value, which
+ // is not available when vectorized
+ // 2. Induction phis and increment when SCEV predicates cannot be used
+ // outside the loop - see addInductionPhi
+ // 3. Non-Phis with outside uses when SCEV predicates cannot be used
+ // outside the loop - see call to hasOutsideLoopUser in the non-phi
+ // handling below
+ // 4. FixedOrderRecurrence phis that can possibly be handled by
+ // extraction.
+ // By recording these, we can then reason about ways to vectorize each
+ // of these NotAllowedExit.
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
+ !IsDisallowedStridedPointerInduction(ID)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ Requirements->addExactFPMathInst(ID.getExactFPMathInst());
+ return true;
+ }
- // Check that the instruction return type is vectorizable.
- // We can't vectorize casts from vector type to scalar type.
- // Also, we can't vectorize extractelement instructions.
- if (!CanWidenInstructionTy(I) ||
- (isa<CastInst>(I) &&
- !VectorType::isValidElementType(I.getOperand(0)->getType())) ||
- isa<ExtractElementInst>(I)) {
- reportVectorizationFailure("Found unvectorizable type",
- "instruction return type cannot be vectorized",
- "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
- return false;
- }
+ if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
+ AllowedExit.insert(Phi);
+ FixedOrderRecurrences.insert(Phi);
+ return true;
+ }
+
+ // As a last resort, coerce the PHI to an AddRec expression
+ // and re-try classifying it as an induction PHI.
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
+ !IsDisallowedStridedPointerInduction(ID)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ return true;
+ }
- // Check that the stored type is vectorizable.
- if (auto *ST = dyn_cast<StoreInst>(&I)) {
- Type *T = ST->getValueOperand()->getType();
- if (!VectorType::isValidElementType(T)) {
- reportVectorizationFailure("Store instruction cannot be vectorized",
- "CantVectorizeStore", ORE, TheLoop, ST);
+ reportVectorizationFailure("Found an unidentified PHI",
+ "value that could not be identified as "
+ "reduction is used outside the loop",
+ "NonReductionValueUsedOutsideLoop", ORE, TheLoop,
+ Phi);
+ return false;
+ } // end of PHI handling
+
+ // We handle calls that:
+ // * Have a mapping to an IR intrinsic.
+ // * Have a vector version available.
+ auto *CI = dyn_cast<CallInst>(&I);
+
+ if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
+ !(CI->getCalledFunction() && TLI &&
+ (!VFDatabase::getMappings(*CI).empty() || isTLIScalarize(*TLI, *CI)))) {
+ // If the call is a recognized math library call, it is likely that
+ // we can vectorize it given loosened floating-point constraints.
+ LibFunc Func;
+ bool IsMathLibCall =
+ TLI && CI->getCalledFunction() && CI->getType()->isFloatingPointTy() &&
+ TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
+ TLI->hasOptimizedCodeGen(Func);
+
+ if (IsMathLibCall) {
+ // TODO: Ideally, we should not use clang-specific language here,
+ // but it's hard to provide meaningful yet generic advice.
+ // Also, should this be guarded by allowExtraAnalysis() and/or be part
+ // of the returned info from isFunctionVectorizable()?
+ reportVectorizationFailure(
+ "Found a non-intrinsic callsite",
+ "library call cannot be vectorized. "
+ "Try compiling with -fno-math-errno, -ffast-math, "
+ "or similar flags",
+ "CantVectorizeLibcall", ORE, TheLoop, CI);
+ } else {
+ reportVectorizationFailure("Found a non-intrinsic callsite",
+ "call instruction cannot be vectorized",
+ "CantVectorizeLibcall", ORE, TheLoop, CI);
+ }
+ return false;
+ }
+
+ // Some intrinsics have scalar arguments which must be the same (i.e.
+ // loop invariant) in order for them to be vectorized.
+ if (CI) {
+ auto *SE = PSE.getSE();
+ Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
+ for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
+ if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) {
+ if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)), TheLoop)) {
+ reportVectorizationFailure(
+ "Found unvectorizable intrinsic",
+ "intrinsic instruction cannot be vectorized",
+ "CantVectorizeIntrinsic", ORE, TheLoop, CI);
return false;
}
+ }
+ }
- // For nontemporal stores, check that a nontemporal vector version is
- // supported on the target.
- if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
- // Arbitrarily try a vector of 2 elements.
- auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
- assert(VecTy && "did not find vectorized version of stored type");
- if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
- reportVectorizationFailure(
- "nontemporal store instruction cannot be vectorized",
- "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
- return false;
- }
- }
+ // If we found a vectorized variant of a function, note that so LV can
+ // make better decisions about maximum VF.
+ if (CI && !VFDatabase::getMappings(*CI).empty())
+ VecCallVariantsFound = true;
+
+ auto CanWidenInstructionTy = [](Instruction const &Inst) {
+ Type *InstTy = Inst.getType();
+ if (!isa<StructType>(InstTy))
+ return canVectorizeTy(InstTy);
+
+ // For now, we only recognize struct values returned from calls where
+ // all users are extractvalue as vectorizable. All element types of the
+ // struct must be types that can be widened.
+ return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
+ all_of(Inst.users(), IsaPred<ExtractValueInst>);
+ };
- } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
- if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
- // For nontemporal loads, check that a nontemporal vector version is
- // supported on the target (arbitrarily try a vector of 2 elements).
- auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
- assert(VecTy && "did not find vectorized version of load type");
- if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
- reportVectorizationFailure(
- "nontemporal load instruction cannot be vectorized",
- "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
- return false;
- }
- }
+ // Check that the instruction return type is vectorizable.
+ // We can't vectorize casts from vector type to scalar type.
+ // Also, we can't vectorize extractelement instructions.
+ if (!CanWidenInstructionTy(I) ||
+ (isa<CastInst>(I) &&
+ !VectorType::isValidElementType(I.getOperand(0)->getType())) ||
+ isa<ExtractElementInst>(I)) {
+ reportVectorizationFailure("Found unvectorizable type",
+ "instruction return type cannot be vectorized",
+ "CantVectorizeInstructionReturnType", ORE,
+ TheLoop, &I);
+ return false;
+ }
+
+ // Check that the stored type is vectorizable.
+ if (auto *ST = dyn_cast<StoreInst>(&I)) {
+ Type *T = ST->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(T)) {
+ reportVectorizationFailure("Store instruction cannot be vectorized",
+ "CantVectorizeStore", ORE, TheLoop, ST);
+ return false;
+ }
- // FP instructions can allow unsafe algebra, thus vectorizable by
- // non-IEEE-754 compliant SIMD units.
- // This applies to floating-point math operations and calls, not memory
- // operations, shuffles, or casts, as they don't change precision or
- // semantics.
- } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
- !I.isFast()) {
- LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
- Hints->setPotentiallyUnsafe();
+ // For nontemporal stores, check that a nontemporal vector version is
+ // supported on the target.
+ if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
+ // Arbitrarily try a vector of 2 elements.
+ auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
+ assert(VecTy && "did not find vectorized version of stored type");
+ if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
+ reportVectorizationFailure(
+ "nontemporal store instruction cannot be vectorized",
+ "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
+ return false;
}
+ }
- // Reduction instructions are allowed to have exit users.
- // All other instructions must not have external users.
- if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
- // We can safely vectorize loops where instructions within the loop are
- // used outside the loop only if the SCEV predicates within the loop is
- // same as outside the loop. Allowing the exit means reusing the SCEV
- // outside the loop.
- if (PSE.getPredicate().isAlwaysTrue()) {
- AllowedExit.insert(&I);
- continue;
- }
- reportVectorizationFailure("Value cannot be used outside the loop",
- "ValueUsedOutsideLoop", ORE, TheLoop, &I);
+ } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
+ if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
+ // For nontemporal loads, check that a nontemporal vector version is
+ // supported on the target (arbitrarily try a vector of 2 elements).
+ auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
+ assert(VecTy && "did not find vectorized version of load type");
+ if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
+ reportVectorizationFailure(
+ "nontemporal load instruction cannot be vectorized",
+ "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
return false;
}
- } // next instr.
+ }
+
+ // FP instructions can allow unsafe algebra, thus vectorizable by
+ // non-IEEE-754 compliant SIMD units.
+ // This applies to floating-point math operations and calls, not memory
+ // operations, shuffles, or casts, as they don't change precision or
+ // semantics.
+ } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
+ !I.isFast()) {
+ LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
+ Hints->setPotentiallyUnsafe();
}
- if (!PrimaryInduction) {
- if (Inductions.empty()) {
- reportVectorizationFailure("Did not find one integer induction var",
- "loop induction variable could not be identified",
- "NoInductionVariable", ORE, TheLoop);
- return false;
- }
- if (!WidestIndTy) {
- reportVectorizationFailure("Did not find one integer induction var",
- "integer loop induction variable could not be identified",
- "NoIntegerInductionVariable", ORE, TheLoop);
- return false;
+ // Reduction instructions are allowed to have exit users.
+ // All other instructions must not have external users.
+ if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
+ // We can safely vectorize loops where instructions within the loop are
+ // used outside the loop only if the SCEV predicates within the loop are
+ // the same as outside the loop. Allowing the exit means reusing the SCEV
+ // outside the loop.
+ if (PSE.getPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(&I);
+ return true;
}
- LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ reportVectorizationFailure("Value cannot be used outside the loop",
+ "ValueUsedOutsideLoop", ORE, TheLoop, &I);
+ return false;
}
- // Now we know the widest induction type, check if our found induction
- // is the same size. If it's not, unset it here and InnerLoopVectorizer
- // will create another.
- if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
- PrimaryInduction = nullptr;
-
return true;
}
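
The canVectorizeInstrs rewrite above accumulates the per-instruction verdict and only returns early when extra analysis is disabled, so remark consumers get a diagnostic for every offending instruction in a single pass. A standalone sketch of that control-flow pattern (checkOne and checkAll are illustrative stand-ins for canVectorizeInstr and the block loop):

    #include <iostream>
    #include <vector>

    static bool checkOne(int V) { return V % 2 == 0; } // stand-in for canVectorizeInstr

    static bool checkAll(const std::vector<int> &Items, bool DoExtraAnalysis) {
      bool Result = true;
      for (int V : Items) {
        Result &= checkOne(V);
        // Without extra analysis, stop at the first failure; with it, keep
        // going so every failing item gets its own remark.
        if (!DoExtraAnalysis && !Result)
          return false;
      }
      return Result;
    }

    int main() {
      std::vector<int> Items = {2, 3, 5, 8};
      std::cout << checkAll(Items, /*DoExtraAnalysis=*/false) << '\n'; // 0, stops at 3
      std::cout << checkAll(Items, /*DoExtraAnalysis=*/true) << '\n';  // 0, visits all
    }
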
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 912c893..838476d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -256,13 +256,15 @@ public:
new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
GEPNoWrapFlags::none(), DL, Name));
}
- VPInstruction *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset,
- DebugLoc DL = DebugLoc::getUnknown(),
- const Twine &Name = "") {
- return tryInsertInstruction(
- new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
- GEPNoWrapFlags::inBounds(), DL, Name));
+
+ VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset,
+ GEPNoWrapFlags GEPFlags,
+ DebugLoc DL = DebugLoc::getUnknown(),
+ const Twine &Name = "") {
+ return tryInsertInstruction(new VPInstruction(
+ VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, DL, Name));
}
+
VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset,
DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "") {
@@ -276,6 +278,20 @@ public:
return tryInsertInstruction(new VPPhi(IncomingValues, DL, Name));
}
+ VPValue *createElementCount(Type *Ty, ElementCount EC) {
+ VPlan &Plan = *getInsertBlock()->getPlan();
+ VPValue *RuntimeEC =
+ Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue()));
+ if (EC.isScalable()) {
+ VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty);
+ RuntimeEC = EC.getKnownMinValue() == 1
+ ? VScale
+ : createOverflowingOp(Instruction::Mul,
+ {VScale, RuntimeEC}, {true, false});
+ }
+ return RuntimeEC;
+ }
+
/// Convert the input value \p Current to the corresponding value of an
/// induction with \p Start and \p Step values, using \p Start + \p Current *
/// \p Step.
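
createElementCount above materializes a runtime element count: a plain constant for fixed counts, vscale alone when the scalable known minimum is 1, and `mul nuw vscale, KnownMin` otherwise. A standalone sketch of the value it produces, assuming a concrete runtime vscale purely for illustration:

    #include <cstdint>
    #include <iostream>

    static uint64_t runtimeElementCount(uint64_t KnownMin, bool Scalable,
                                        uint64_t VScale) {
      if (!Scalable)
        return KnownMin;        // fixed EC: just the constant
      if (KnownMin == 1)
        return VScale;          // <vscale x 1 x ...>: vscale itself
      return VScale * KnownMin; // otherwise: mul nuw vscale, KnownMin
    }

    int main() {
      std::cout << runtimeElementCount(4, /*Scalable=*/false, 0) << '\n'; // 4
      std::cout << runtimeElementCount(4, /*Scalable=*/true, 2) << '\n';  // 8
      std::cout << runtimeElementCount(1, /*Scalable=*/true, 2) << '\n';  // 2
    }
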
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index be00fd6..70f8840 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -499,19 +499,18 @@ class InnerLoopVectorizer {
public:
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
LoopInfo *LI, DominatorTree *DT,
- const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
+ ElementCount VecWidth,
ElementCount MinProfitableTripCount,
unsigned UnrollFactor, LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
GeneratedRTChecks &RTChecks, VPlan &Plan)
- : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
- AC(AC), ORE(ORE), VF(VecWidth),
- MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
- Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI),
- RTChecks(RTChecks), Plan(Plan),
- VectorPHVPB(Plan.getVectorLoopRegion()->getSinglePredecessor()) {}
+ : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
+ VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount),
+ UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM),
+ BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
+ VectorPHVPBB(cast<VPBasicBlock>(
+ Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
virtual ~InnerLoopVectorizer() = default;
@@ -548,9 +547,6 @@ public:
protected:
friend class LoopVectorizationPlanner;
- /// Returns (and creates if needed) the trip count of the widened loop.
- Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
-
// Create a check to see if the vector loop should be executed
Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
@@ -586,18 +582,12 @@ protected:
/// Dominator Tree.
DominatorTree *DT;
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
/// Target Transform Info.
const TargetTransformInfo *TTI;
/// Assumption Cache.
AssumptionCache *AC;
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
ElementCount VF;
@@ -619,9 +609,6 @@ protected:
/// The scalar-loop preheader.
BasicBlock *LoopScalarPreHeader = nullptr;
- /// Middle Block between the vector and the scalar.
- BasicBlock *LoopMiddleBlock = nullptr;
-
/// Trip count of the original loop.
Value *TripCount = nullptr;
@@ -648,7 +635,7 @@ protected:
/// The vector preheader block of \p Plan, used as target for check blocks
/// introduced during skeleton creation.
- VPBlockBase *VectorPHVPB;
+ VPBasicBlock *VectorPHVPBB;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -686,14 +673,14 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
InnerLoopAndEpilogueVectorizer(
Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
- DominatorTree *DT, const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
- LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
- : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, CM,
- BFI, PSI, Checks, Plan),
+ DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
+ EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+ GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
+ ElementCount MinProfitableTripCount, unsigned UnrollFactor)
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
+ MinProfitableTripCount, UnrollFactor, CM, BFI, PSI,
+ Checks, Plan),
EPI(EPI) {}
// Override this function to handle the more complex control flow around the
@@ -721,15 +708,17 @@ public:
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
- EpilogueVectorizerMainLoop(
- Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
- DominatorTree *DT, const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
- LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
- : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, CM, BFI, PSI, Check, Plan) {}
+ EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetTransformInfo *TTI,
+ AssumptionCache *AC,
+ EpilogueLoopVectorizationInfo &EPI,
+ LoopVectorizationCostModel *CM,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+ GeneratedRTChecks &Check, VPlan &Plan)
+ : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
+ BFI, PSI, Check, Plan, EPI.MainLoopVF,
+ EPI.MainLoopVF, EPI.MainLoopUF) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
@@ -750,13 +739,13 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
EpilogueVectorizerEpilogueLoop(
Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
- DominatorTree *DT, const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
- LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
- : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, CM, BFI, PSI, Checks, Plan) {
+ DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
+ EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+ GeneratedRTChecks &Checks, VPlan &Plan)
+ : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
+ BFI, PSI, Checks, Plan, EPI.EpilogueVF,
+ EPI.EpilogueVF, EPI.EpilogueUF) {
TripCount = EPI.TripCount;
}
/// Implements the interface for creating a vectorized skeleton using the
@@ -835,7 +824,14 @@ namespace llvm {
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
int64_t Step) {
assert(Ty->isIntegerTy() && "Expected an integer step");
- return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
+ ElementCount VFxStep = VF.multiplyCoefficientBy(Step);
+ assert(isPowerOf2_64(VF.getKnownMinValue()) && "must pass power-of-2 VF");
+ if (VF.isScalable() && isPowerOf2_64(Step)) {
+ return B.CreateShl(
+ B.CreateVScale(Ty),
+ ConstantInt::get(Ty, Log2_64(VFxStep.getKnownMinValue())), "", true);
+ }
+ return B.CreateElementCount(Ty, VFxStep);
}
/// Return the runtime value for VF.
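
For scalable VFs with a power-of-two step, the createStepForVF change above emits `shl nuw (vscale, log2(VFmin * Step))` rather than a multiply. A standalone sketch checking that the shift form agrees with the multiply for a few representative values:

    #include <cassert>
    #include <cstdint>
    #include <iostream>

    int main() {
      for (uint64_t VScale = 1; VScale <= 16; ++VScale)
        for (uint64_t VFMin : {1u, 2u, 4u, 8u})
          for (uint64_t Step : {1u, 2u, 4u}) {
            uint64_t Product = VFMin * Step; // a power of two by construction
            unsigned Log2 = 0;
            while ((1ULL << Log2) < Product)
              ++Log2;                        // log2 of the product
            assert((VScale << Log2) == VScale * Product); // shl form == mul form
          }
      std::cout << "shl-based step matches the multiply for all tested values\n";
    }
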
@@ -2272,65 +2268,15 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
return TTI.enableMaskedInterleavedAccessVectorization();
}
-Value *
-InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
- if (VectorTripCount)
- return VectorTripCount;
-
- Value *TC = getTripCount();
- IRBuilder<> Builder(InsertBlock->getTerminator());
-
- Type *Ty = TC->getType();
- // This is where we can make the step a runtime constant.
- Value *Step = createStepForVF(Builder, Ty, VF, UF);
-
- // If the tail is to be folded by masking, round the number of iterations N
- // up to a multiple of Step instead of rounding down. This is done by first
- // adding Step-1 and then rounding down. Note that it's ok if this addition
- // overflows: the vector induction variable will eventually wrap to zero given
- // that it starts at zero and its Step is a power of two; the loop will then
- // exit, with the last early-exit vector comparison also producing all-true.
- // For scalable vectors the VF is not guaranteed to be a power of 2, but this
- // is accounted for in emitIterationCountCheck that adds an overflow check.
- if (Cost->foldTailByMasking()) {
- assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
- "VF*UF must be a power of 2 when folding tail by masking");
- TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
- "n.rnd.up");
- }
-
- // Now we need to generate the expression for the part of the loop that the
- // vectorized body will execute. This is equal to N - (N % Step) if scalar
- // iterations are not required for correctness, or N - Step, otherwise. Step
- // is equal to the vectorization factor (number of SIMD elements) times the
- // unroll factor (number of SIMD instructions).
- Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
-
- // There are cases where we *must* run at least one iteration in the remainder
- // loop. See the cost model for when this can happen. If the step evenly
- // divides the trip count, we set the remainder to be equal to the step. If
- // the step does not evenly divide the trip count, no adjustment is necessary
- // since there will already be scalar iterations. Note that the minimum
- // iterations check ensures that N >= Step.
- if (Cost->requiresScalarEpilogue(VF.isVector())) {
- auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
- R = Builder.CreateSelect(IsZero, Step, R);
- }
-
- VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
-
- return VectorTripCount;
-}
-
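
The helper deleted above computed the vector trip count; per the executePlan hunk further down, the same arithmetic is now materialized by VPlanTransforms::materializeVectorTripCount. A standalone sketch of that arithmetic, with illustrative parameter names:

    #include <cstdint>
    #include <iostream>

    static uint64_t vectorTripCount(uint64_t TC, uint64_t Step, bool FoldTail,
                                    bool RequiresScalarEpilogue) {
      if (FoldTail)
        TC += Step - 1;       // round the iteration count up to a multiple of Step
      uint64_t R = TC % Step; // iterations left over for the scalar remainder loop
      if (RequiresScalarEpilogue && R == 0)
        R = Step;             // leave at least Step iterations for the epilogue
      return TC - R;          // iterations executed by the vector loop
    }

    int main() {
      std::cout << vectorTripCount(100, 8, false, false) << '\n'; // 96
      std::cout << vectorTripCount(100, 8, true, false) << '\n';  // 104 (tail folded)
      std::cout << vectorTripCount(96, 8, false, true) << '\n';   // 88 (epilogue kept)
    }
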
void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
// Note: The block with the minimum trip-count check is already connected
// during earlier VPlan construction.
VPBlockBase *ScalarPH = Plan.getScalarPreheader();
- VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
+ VPBlockBase *PreVectorPH = VectorPHVPBB->getSinglePredecessor();
assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
- VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
+ VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPBB, CheckVPIRBB);
PreVectorPH = CheckVPIRBB;
VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
PreVectorPH->swapSuccessors();
@@ -2359,7 +2305,10 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
- IRBuilder<> Builder(TCCheckBlock->getTerminator());
+ IRBuilder<InstSimplifyFolder> Builder(
+ TCCheckBlock->getContext(),
+ InstSimplifyFolder(TCCheckBlock->getDataLayout()));
+ Builder.SetInsertPoint(TCCheckBlock->getTerminator());
// If tail is to be folded, vector loop takes care of all iterations.
Value *Count = getTripCount();
@@ -2371,7 +2320,7 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
return createStepForVF(Builder, CountTy, VF, UF);
Value *MinProfTC =
- createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
+ Builder.CreateElementCount(CountTy, MinProfitableTripCount);
if (!VF.isScalable())
return MinProfTC;
return Builder.CreateBinaryIntrinsic(
@@ -2437,16 +2386,20 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
-static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
+static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
+ BasicBlock *IRBB) {
VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
- for (auto &R : make_early_inc_range(*VPBB)) {
- assert((IRVPBB->empty() || IRVPBB->back().isPhi() || !R.isPhi()) &&
- "Tried to move phi recipe after a non-phi recipe");
+ auto IP = IRVPBB->begin();
+ for (auto &R : make_early_inc_range(VPBB->phis()))
+ R.moveBefore(*IRVPBB, IP);
+
+ for (auto &R :
+ make_early_inc_range(make_range(VPBB->getFirstNonPhi(), VPBB->end())))
R.moveBefore(*IRVPBB, IRVPBB->end());
- }
VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
// VPBB is now dead and will be cleaned up when the plan gets destroyed.
+ return IRVPBB;
}
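
The updated replaceVPBBWithIRVPBB moves the source block's phi recipes ahead of the target's existing recipes and appends everything else, instead of asserting that no phi ends up after a non-phi. A simplified standalone model of that move order using std::list as a stand-in for the recipe list (illustrative only; the real recipes live in intrusive lists):

    #include <iostream>
    #include <list>
    #include <string>

    static bool isPhi(const std::string &R) { return R.rfind("phi", 0) == 0; }

    int main() {
      std::list<std::string> Target = {"phi t0", "op t1"}; // existing recipes
      std::list<std::string> Source = {"phi s0", "op s1", "op s2"};

      auto IP = Target.begin(); // phis are moved before the target's first recipe
      for (auto It = Source.begin(); It != Source.end();) {
        auto Cur = It++;
        if (isPhi(*Cur))
          Target.splice(IP, Source, Cur);           // keep phis at the front
        else
          Target.splice(Target.end(), Source, Cur); // non-phis go to the end
      }

      for (const std::string &R : Target)
        std::cout << R << '\n'; // phi s0, phi t0, op t1, op s1, op s2
    }
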
void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
@@ -2549,7 +2502,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// to the scalar loop.
emitIterationCountCheck(LoopScalarPreHeader);
- replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
+ replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
return LoopVectorPreHeader;
}
@@ -2680,19 +2633,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Fix widened non-induction PHIs by setting up the PHI operands.
fixNonInductionPHIs(State);
- // After vectorization, the exit blocks of the original loop will have
- // additional predecessors. Invalidate SCEVs for the exit phis in case SE
- // looked through single-entry phis.
- SmallVector<BasicBlock *> ExitBlocks;
- OrigLoop->getExitBlocks(ExitBlocks);
- for (BasicBlock *Exit : ExitBlocks)
- for (PHINode &PN : Exit->phis())
- PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
-
- // Forget the original basic block.
- PSE.getSE()->forgetLoop(OrigLoop);
- PSE.getSE()->forgetBlockAndLoopDispositions();
-
// Don't apply optimizations below when no (vector) loop remains, as they all
// require one at the moment.
VPBasicBlock *HeaderVPBB =
@@ -2734,11 +2674,8 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
// Make sure the builder has a valid insert point.
Builder.SetInsertPoint(NewPhi);
- for (unsigned Idx = 0; Idx < VPPhi->getNumIncoming(); ++Idx) {
- VPValue *Inc = VPPhi->getIncomingValue(Idx);
- const VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
+ for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
- }
}
}
}
@@ -3158,6 +3095,12 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
if (Group->isReverse())
return false;
+ // TODO: Support interleaved access that requires a gap mask for scalable VFs.
+ bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
+ StoreAccessWithGapsRequiresMasking;
+ if (VF.isScalable() && NeedsMaskForGaps)
+ return false;
+
auto *Ty = getLoadStoreType(I);
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
@@ -4069,8 +4012,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
if (VF.isScalar())
continue;
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
- CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4178,7 +4120,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
const TargetTransformInfo &TTI) {
assert(VF.isVector() && "Checking a scalar VF?");
- VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
+ VPTypeAnalysis TypeInfo(Plan);
DenseSet<VPRecipeBase *> EphemeralRecipes;
collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
// Set of already visited types.
@@ -4326,8 +4268,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
// Add on other costs that are modelled in VPlan, but not in the legacy
// cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
- CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
assert(VectorRegion && "Expected to have a vector region!");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -5272,8 +5213,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
// Get the cost of the scalar memory instruction and address computation.
- InstructionCost Cost =
- VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+ InstructionCost Cost = VF.getFixedValue() * TTI.getAddressComputationCost(
+ PtrTy, SE, PtrSCEV, CostKind);
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
@@ -5344,11 +5285,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
assert(Legal->isUniformMemOp(*I, VF));
Type *ValTy = getLoadStoreType(I);
+ Type *PtrTy = getLoadStorePointerOperand(I)->getType();
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
if (isa<LoadInst>(I)) {
- return TTI.getAddressComputationCost(ValTy) +
+ return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy,
@@ -5361,13 +5303,13 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
// VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
// the actual generated code, which involves extracting the last element of
// a scalable vector where the lane to extract is unknown at compile time.
- return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
- CostKind) +
- (IsLoopInvariantStoreValue
- ? 0
- : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
- CostKind, VF.getKnownMinValue() - 1));
+ InstructionCost Cost =
+ TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
+ TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
+ if (!IsLoopInvariantStoreValue)
+ Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
+ VectorTy, CostKind, 0);
+ return Cost;
}
InstructionCost
@@ -5377,8 +5319,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
const Value *Ptr = getLoadStorePointerOperand(I);
+ Type *PtrTy = toVectorTy(Ptr->getType(), VF);
- return TTI.getAddressComputationCost(VectorTy) +
+ return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
Legal->isMaskRequired(I), Alignment,
CostKind, I);
@@ -5613,11 +5556,12 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
// moment.
if (VF.isScalar()) {
Type *ValTy = getLoadStoreType(I);
+ Type *PtrTy = getLoadStorePointerOperand(I)->getType();
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
- return TTI.getAddressComputationCost(ValTy) +
+ return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
OpInfo, I);
}
@@ -6976,8 +6920,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
- CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
@@ -7178,8 +7121,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
- CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
@@ -7317,10 +7259,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
++LoopsEarlyExitVectorized;
// TODO: Move to VPlan transform stage once the transition to the VPlan-based
// cost model is complete for better cost estimates.
- VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
- OrigLoop->getHeader()->getContext());
- VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
+ VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
+ VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
+ VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
bool HasBranchWeights =
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
if (HasBranchWeights) {
@@ -7339,21 +7281,25 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
- VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
+ VPlanTransforms::simplifyRecipes(BestVPlan);
VPlanTransforms::removeBranchOnConst(BestVPlan);
VPlanTransforms::narrowInterleaveGroups(
BestVPlan, BestVF,
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
VPlanTransforms::removeDeadRecipes(BestVPlan);
- VPlanTransforms::convertToConcreteRecipes(BestVPlan,
- *Legal->getWidestInductionType());
+ VPlanTransforms::convertToConcreteRecipes(BestVPlan);
// Regions are dissolved after optimizing for VF and UF, which completely
// removes unneeded loop regions first.
VPlanTransforms::dissolveLoopRegions(BestVPlan);
// Canonicalize EVL loops after regions are dissolved.
VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH);
+ VPlanTransforms::materializeVectorTripCount(
+ BestVPlan, VectorPH, CM.foldTailByMasking(),
+ CM.requiresScalarEpilogue(BestVF.isVector()));
+ VPlanTransforms::materializeVFAndVFxUF(BestVPlan, VectorPH, BestVF);
+ VPlanTransforms::simplifyRecipes(BestVPlan);
// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
@@ -7393,12 +7339,28 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BasicBlock *EntryBB =
cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock();
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
- if (VectorizingEpilogue)
- VPlanTransforms::removeDeadRecipes(BestVPlan);
+ replaceVPBBWithIRVPBB(BestVPlan.getScalarPreheader(),
+ State.CFG.PrevBB->getSingleSuccessor());
+ VPlanTransforms::removeDeadRecipes(BestVPlan);
assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) &&
"final VPlan is invalid");
+ // After vectorization, the exit blocks of the original loop will have
+ // additional predecessors. Invalidate SCEVs for the exit phis in case SE
+ // looked through single-entry phis.
+ ScalarEvolution &SE = *PSE.getSE();
+ for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
+ if (Exit->getNumPredecessors() == 0)
+ continue;
+ for (VPRecipeBase &PhiR : Exit->phis())
+ SE.forgetLcssaPhiWithNewPredecessor(
+ OrigLoop, cast<PHINode>(&cast<VPIRPhi>(PhiR).getInstruction()));
+ }
+ // Forget the original loop and block dispositions.
+ SE.forgetLoop(OrigLoop);
+ SE.forgetBlockAndLoopDispositions();
+
ILV.printDebugTracesAtStart();
//===------------------------------------------------===//
@@ -7409,11 +7371,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
//
//===------------------------------------------------===//
- // 2. Copy and widen instructions from the old loop into the new loop.
- BestVPlan.prepareToExecute(
- ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
- replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
-
// Move check blocks to their final position.
// TODO: Move as part of VPIRBB execute and update impacted tests.
if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second)
@@ -7530,7 +7487,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
EPI.MainLoopIterationCountCheck =
emitIterationCountCheck(LoopScalarPreHeader, false);
- replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
return LoopVectorPreHeader;
}
@@ -7557,8 +7513,9 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
assert(Bypass && "Expected valid bypass basic block.");
Value *Count = getTripCount();
MinProfitableTripCount = ElementCount::getFixed(0);
- Value *CheckMinIters = createIterationCountCheck(
- ForEpilogue ? EPI.EpilogueVF : VF, ForEpilogue ? EPI.EpilogueUF : UF);
+ Value *CheckMinIters =
+ createIterationCountCheck(ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
+ ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
if (!ForEpilogue)
@@ -7568,12 +7525,13 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
static_cast<DominatorTree *>(nullptr), LI,
nullptr, "vector.ph");
-
if (ForEpilogue) {
// Save the trip count so we don't have to regenerate it in the
// vec.epilog.iter.check. This is safe to do because the trip count
// generated here dominates the vector epilog iter check.
EPI.TripCount = Count;
+ } else {
+ VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
}
BranchInst &BI =
@@ -7607,6 +7565,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
BasicBlock *VecEpilogueIterationCountCheck =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
nullptr, "vec.epilog.iter.check", true);
+ VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+
emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
VecEpilogueIterationCountCheck);
AdditionalBypassBlock = VecEpilogueIterationCountCheck;
@@ -7661,7 +7621,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
Phi->removeIncomingValue(MemCheckBlock);
}
- replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
return LoopVectorPreHeader;
}
@@ -7690,11 +7649,11 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
- // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't
- // think the MainLoopStep is correct.
- unsigned MainLoopStep = UF * VF.getKnownMinValue();
+ auto VScale = Cost->getVScaleForTuning();
+ unsigned MainLoopStep =
+ estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
unsigned EpilogueLoopStep =
- EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
+ estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
// We assume the remaining `Count` is equally distributed in
// [0, MainLoopStep)
// So the probability for `Count < EpilogueLoopStep` should be
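The comment above is cut short by the hunk boundary; under the stated uniform-remainder assumption, the probability that the remaining Count is below EpilogueLoopStep works out to EpilogueLoopStep / MainLoopStep. A small worked example with illustrative fixed VFs; the stand-in for estimateElementCount is assumed here to scale the known-minimum lane count by the vscale estimate for scalable VFs:

#include <iostream>
#include <optional>

// Assumed stand-in for estimateElementCount(): known-minimum lanes, scaled by
// the vscale estimate when the VF is scalable (illustrative only).
unsigned estimateStep(unsigned KnownMinLanes, unsigned UF, bool Scalable,
                      std::optional<unsigned> VScale) {
  unsigned Lanes = KnownMinLanes;
  if (Scalable && VScale)
    Lanes *= *VScale;
  return Lanes * UF;
}

int main() {
  // Main loop: fixed VF = 8, UF = 2 -> step 16.
  unsigned MainLoopStep = estimateStep(8, 2, /*Scalable=*/false, std::nullopt);
  // Epilogue loop: fixed VF = 4, UF = 1 -> step 4.
  unsigned EpilogueLoopStep =
      estimateStep(4, 1, /*Scalable=*/false, std::nullopt);
  // With the remainder assumed uniform in [0, MainLoopStep), the chance that
  // the epilogue vector loop is skipped is EpilogueLoopStep / MainLoopStep.
  std::cout << "P(skip epilogue) = " << EpilogueLoopStep << '/' << MainLoopStep
            << '\n'; // 4/16, i.e. 25%
}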
@@ -8159,7 +8118,7 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
// extends are intended to be lowered along with the reduction itself.
// Build up a set of partial reduction ops for efficient use checking.
- SmallSet<User *, 4> PartialReductionOps;
+ SmallPtrSet<User *, 4> PartialReductionOps;
for (const auto &[PartialRdx, _] : PartialReductionChains)
PartialReductionOps.insert(PartialRdx.ExtendUser);
@@ -8435,8 +8394,13 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
LVer.prepareNoAliasMetadata();
}
+  // Create the initial base VPlan0, to serve as a common starting point for
+  // all candidates built later for specific VF ranges.
+ auto VPlan0 = VPlanTransforms::buildVPlan0(
+ OrigLoop, *LI, Legal->getWidestInductionType(),
+ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
+
auto MaxVFTimes2 = MaxVF * 2;
- auto VPlan0 = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
VFRange SubRange = {VF, MaxVFTimes2};
if (auto Plan = tryToBuildVPlanWithVPRecipes(
@@ -8500,7 +8464,7 @@ static VPInstruction *addResumePhiRecipeForInduction(
/// \p IVEndValues.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
DenseMap<VPValue *, VPValue *> &IVEndValues) {
- VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
+ VPTypeAnalysis TypeInfo(Plan);
auto *ScalarPH = Plan.getScalarPreheader();
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
@@ -8675,23 +8639,17 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// visit each basic block after having visited its predecessor basic blocks.
// ---------------------------------------------------------------------------
- // Create initial VPlan skeleton, having a basic block for the pre-header
- // which contains SCEV expansions that need to happen before the CFG is
- // modified; a basic block for the vector pre-header, followed by a region for
- // the vector loop, followed by the middle basic block. The skeleton vector
- // loop region contains a header and latch basic blocks.
-
bool RequiresScalarEpilogueCheck =
LoopVectorizationPlanner::getDecisionAndClampRange(
[this](ElementCount VF) {
return !CM.requiresScalarEpilogue(VF.isVector());
},
Range);
- VPlanTransforms::prepareForVectorization(
- *Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
- CM.foldTailByMasking(), OrigLoop,
- getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()),
- Legal->hasUncountableEarlyExit(), Range);
+ VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit(),
+ Range);
+ VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
+ CM.foldTailByMasking());
+
VPlanTransforms::createLoopRegions(*Plan);
VPlanTransforms::createExtractsForLiveOuts(*Plan);
@@ -8889,8 +8847,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
- CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}
@@ -8977,11 +8934,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
assert(!OrigLoop->isInnermost());
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
- auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
- VPlanTransforms::prepareForVectorization(
- *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop,
- getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), false,
- Range);
+ auto Plan = VPlanTransforms::buildVPlan0(
+ OrigLoop, *LI, Legal->getWidestInductionType(),
+ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
+ VPlanTransforms::handleEarlyExits(*Plan,
+ /*HasUncountableExit*/ false, Range);
+ VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
+ /*TailFolded*/ false);
+
VPlanTransforms::createLoopRegions(*Plan);
for (ElementCount VF : Range)
@@ -9114,6 +9074,16 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
CurrentLinkI->getFastMathFlags());
LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
VecOp = FMulRecipe;
+ } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs &&
+ CurrentLinkI->getOpcode() == Instruction::Sub) {
+ Type *PhiTy = PhiR->getUnderlyingValue()->getType();
+ auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0));
+ VPWidenRecipe *Sub = new VPWidenRecipe(
+ Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {},
+ VPIRMetadata(), CurrentLinkI->getDebugLoc());
+ Sub->setUnderlyingValue(CurrentLinkI);
+ LinkVPBB->insert(Sub, CurrentLink->getIterator());
+ VecOp = Sub;
} else {
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
if (isa<VPWidenRecipe>(CurrentLink)) {
@@ -9407,13 +9377,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName(Name);
- // If index is the vector trip count, the concrete value will only be set in
- // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
- // TODO: Remove the special case for the vector trip count once it is computed
- // in VPlan and can be used during VPlan simplification.
- assert((DerivedIV != Index ||
- getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
- "IV didn't need transforming?");
State.set(this, DerivedIV, VPLane(0));
}
@@ -9515,8 +9478,8 @@ static bool processLoopInVPlanNativePath(
{
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
- VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan);
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM,
+ BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
@@ -9798,6 +9761,9 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
else if (&*MainScalarPH->begin() != ResumePhi)
ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->begin());
}
+  // Add a user to make sure the resume phi won't get removed.
+ VPBuilder(MainScalarPH)
+ .createNaryOp(VPInstruction::ResumeForEpilogue, ResumePhi);
}
/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
@@ -10171,8 +10137,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is profitable to vectorize with runtime checks.
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
- VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
- CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
+ CM.CostKind);
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
LVP.getPlanFor(VF.Width), SEL,
@@ -10223,8 +10189,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}
} else if (IC > 1 && UserIC == 1) {
// Tell the user interleaving is beneficial, but it explicitly disabled.
- LLVM_DEBUG(
- dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
+ "disabled.\n");
IntDiagMsg = {"InterleavingBeneficialButDisabled",
"the cost-model indicates that interleaving is beneficial "
"but is explicitly disabled or interleave count is set to 1"};
@@ -10295,7 +10261,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// interleave it.
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
InnerLoopVectorizer Unroller(
- L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
+ L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1),
ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
// TODO: Move to general VPlan pipeline once epilogue loops are also
@@ -10330,20 +10296,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
BestEpiPlan);
- EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, &CM, BFI, PSI, Checks,
- *BestMainPlan);
+ EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
+ BFI, PSI, Checks, *BestMainPlan);
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
*BestMainPlan, MainILV, DT, false);
++LoopsVectorized;
// Second pass vectorizes the epilogue and adjusts the control flow
// edges from the first pass.
- EPI.MainLoopVF = EPI.EpilogueVF;
- EPI.MainLoopUF = EPI.EpilogueUF;
- EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
- ORE, EPI, &CM, BFI, PSI,
- Checks, BestEpiPlan);
+ EpilogueVectorizerEpilogueLoop EpilogILV(
+ L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, PSI, Checks, BestEpiPlan);
EpilogILV.setTripCount(MainILV.getTripCount());
preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
@@ -10368,7 +10330,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (!Checks.hasChecks())
DisableRuntimeUnroll = true;
} else {
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width,
VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
Checks, BestPlan);
// TODO: Move to general VPlan pipeline once epilogue loops are also
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 39011e7..37dc414 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -525,17 +525,17 @@ static bool isSplat(ArrayRef<Value *> VL) {
/// instructions, we need to use the converted opcode along with the original
/// uses.
/// \param I The instruction to check for commutativity
-/// \param InstWithUses The instruction whose uses are analyzed for special
+/// \param ValWithUses The value whose uses are analyzed for special
/// patterns
-static bool isCommutative(Instruction *I, Instruction *InstWithUses) {
+static bool isCommutative(Instruction *I, Value *ValWithUses) {
if (auto *Cmp = dyn_cast<CmpInst>(I))
return Cmp->isCommutative();
if (auto *BO = dyn_cast<BinaryOperator>(I))
return BO->isCommutative() ||
(BO->getOpcode() == Instruction::Sub &&
- !InstWithUses->hasNUsesOrMore(UsesLimit) &&
+ !ValWithUses->hasNUsesOrMore(UsesLimit) &&
all_of(
- InstWithUses->uses(),
+ ValWithUses->uses(),
[](const Use &U) {
// Commutative, if icmp eq/ne sub, 0
CmpPredicate Pred;
@@ -552,8 +552,8 @@ static bool isCommutative(Instruction *I, Instruction *InstWithUses) {
Flag->isOne());
})) ||
(BO->getOpcode() == Instruction::FSub &&
- !InstWithUses->hasNUsesOrMore(UsesLimit) &&
- all_of(InstWithUses->uses(), [](const Use &U) {
+ !ValWithUses->hasNUsesOrMore(UsesLimit) &&
+ all_of(ValWithUses->uses(), [](const Use &U) {
return match(U.getUser(),
m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
}));
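The Sub/FSub special cases above rest on a simple identity: when every use of the difference is an eq/ne compare against zero (or an fabs in the FSub case), swapping the operands cannot change any user's result, because a - b == 0 exactly when b - a == 0. A quick numeric check of the integer identity (plain C++, illustrative only):

#include <cstdint>
#include <iostream>

// Why a sub whose only uses are eq/ne compares against zero may be treated as
// commutative: a - b == 0 exactly when b - a == 0; the FSub/fabs case is the
// symmetric-magnitude analogue.
int main() {
  bool Holds = true;
  for (std::int32_t A : {-3, 0, 7})
    for (std::int32_t B : {-3, 0, 7})
      Holds &= ((A - B) == 0) == ((B - A) == 0);
  std::cout << (Holds ? "identity holds for sampled values" : "mismatch")
            << '\n';
}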
@@ -570,6 +570,19 @@ static bool isCommutative(Instruction *I, Instruction *InstWithUses) {
/// \returns true if the instruction is commutative, false otherwise
static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
+/// \returns the number of operands of \p I, considering commutativity.
+/// Returns 2 for commutative intrinsics.
+/// \param I The instruction to check for commutativity
+static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I) {
+ if (isa<IntrinsicInst>(I) && isCommutative(I)) {
+ // IntrinsicInst::isCommutative returns true if swapping the first "two"
+ // arguments to the intrinsic produces the same result.
+ constexpr unsigned IntrinsicNumOperands = 2;
+ return IntrinsicNumOperands;
+ }
+ return I->getNumOperands();
+}
+
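A minimal standalone sketch of the rule the new helper encodes, with purely illustrative types (the real helper inspects IntrinsicInst and its commutativity): commutative intrinsics only promise that swapping their first two arguments preserves the result, so only those two operands are exposed for reordering, while other instructions expose all of their operands.

#include <cstdio>

// Illustrative stand-in types; the real code queries the LLVM instruction.
struct FakeInst {
  bool IsIntrinsic;
  bool IsCommutative;
  unsigned NumOperands;
};

unsigned numPotentiallyCommutativeOps(const FakeInst &I) {
  if (I.IsIntrinsic && I.IsCommutative)
    return 2; // only the first two intrinsic arguments may be swapped
  return I.NumOperands;
}

int main() {
  FakeInst CommutativeIntrinsic{true, true, 2}; // e.g. a two-arg min/max call
  FakeInst PlainBinOp{false, true, 2};          // e.g. add
  FakeInst ThreeOpInst{false, false, 3};        // a three-operand user
  std::printf("%u %u %u\n",
              numPotentiallyCommutativeOps(CommutativeIntrinsic),
              numPotentiallyCommutativeOps(PlainBinOp),
              numPotentiallyCommutativeOps(ThreeOpInst)); // 2 2 3
}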
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
unsigned Offset) {
@@ -862,6 +875,16 @@ static std::optional<unsigned> getExtractIndex(const Instruction *E) {
}
namespace llvm {
+/// Checks if the provided value does not require scheduling. It does not
+/// require scheduling if this is not an instruction or it is an instruction
+/// that does not read/write memory and all operands are either not instructions
+/// or phi nodes or instructions from different blocks.
+static bool areAllOperandsNonInsts(Value *V);
+/// Checks if the provided value does not require scheduling. It does not
+/// require scheduling if this is not an instruction or it is an instruction
+/// that does not read/write memory and all users are phi nodes or instructions
+/// from different blocks.
+static bool isUsedOutsideBlock(Value *V);
/// Checks if the specified value does not require scheduling. It does not
/// require scheduling if all operands and all users do not need to be scheduled
/// in the current basic block.
@@ -1307,6 +1330,7 @@ public:
: MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
static InstructionsState invalid() { return {nullptr, nullptr}; }
+ /// Checks if the value is a copyable element.
bool isCopyableElement(Value *V) const {
assert(valid() && "InstructionsState is invalid.");
if (!HasCopyables)
@@ -1338,6 +1362,8 @@ public:
doesNotNeedToBeScheduled(V);
// MainOp for copyables always schedulable to correctly identify
// non-schedulable copyables.
+ if (getMainOp() == V)
+ return false;
if (isCopyableElement(V)) {
auto IsNonSchedulableCopyableElement = [this](Value *V) {
auto *I = dyn_cast<Instruction>(V);
@@ -1355,6 +1381,7 @@ public:
doesNotNeedToBeScheduled(V);
}
+ /// Checks if the state represents copyable instructions.
bool areInstructionsWithCopyableElements() const {
assert(valid() && "InstructionsState is invalid.");
return HasCopyables;
@@ -1886,6 +1913,7 @@ class BoUpSLP {
class TreeEntry;
class ScheduleEntity;
class ScheduleData;
+ class ScheduleCopyableData;
class ScheduleBundle;
class ShuffleCostEstimator;
class ShuffleInstructionBuilder;
@@ -2246,6 +2274,7 @@ public:
operator bool() const { return UserTE != nullptr; }
};
+ friend struct DenseMapInfo<EdgeInfo>;
/// A helper class used for scoring candidates for two consecutive lanes.
class LookAheadHeuristics {
@@ -2384,6 +2413,11 @@ public:
if (C1 && C2)
return LookAheadHeuristics::ScoreConstants;
+ // Consider constants and buildvector compatible.
+ if ((C1 && isa<InsertElementInst>(V2)) ||
+ (C2 && isa<InsertElementInst>(V1)))
+ return LookAheadHeuristics::ScoreConstants;
+
// Extracts from consecutive indexes of the same vector better score as
// the extracts could be optimized away.
Value *EV1;
@@ -3010,10 +3044,9 @@ public:
assert(S.valid() && "InstructionsState is invalid.");
// IntrinsicInst::isCommutative returns true if swapping the first "two"
// arguments to the intrinsic produces the same result.
- constexpr unsigned IntrinsicNumOperands = 2;
Instruction *MainOp = S.getMainOp();
unsigned NumOperands = MainOp->getNumOperands();
- ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
+ ArgSize = ::getNumberOfPotentiallyCommutativeOps(MainOp);
OpsVec.resize(ArgSize);
unsigned NumLanes = VL.size();
for (OperandDataVec &Ops : OpsVec)
@@ -3038,7 +3071,7 @@ public:
bool IsInverseOperation = false;
if (S.isCopyableElement(VL[Lane])) {
// The value is a copyable element.
- IsInverseOperation = !isCommutative(MainOp);
+ IsInverseOperation = !isCommutative(MainOp, VL[Lane]);
} else {
assert(I && "Expected instruction");
auto [SelectedOp, Ops] = convertTo(I, S);
@@ -4332,7 +4365,10 @@ private:
} else {
// Build a map for gathered scalars to the nodes where they are used.
bool AllConstsOrCasts = true;
- for (Value *V : VL)
+ for (Value *V : VL) {
+ if (S && S.areInstructionsWithCopyableElements() &&
+ S.isCopyableElement(V))
+ Last->addCopyableElement(V);
if (!isConstant(V)) {
auto *I = dyn_cast<CastInst>(V);
AllConstsOrCasts &= I && I->getType()->isIntegerTy();
@@ -4340,6 +4376,7 @@ private:
!UserTreeIdx.UserTE->isGather())
ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
}
+ }
if (AllConstsOrCasts)
CastMaxMinBWSizes =
std::make_pair(std::numeric_limits<unsigned>::max(), 1);
@@ -4518,8 +4555,6 @@ private:
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
Instruction *Inst2) {
assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
- if (!isSimple(Inst2))
- return true;
// First check if the result is already in the cache.
AliasCacheKey Key = std::make_pair(Inst1, Inst2);
auto Res = AliasCache.try_emplace(Key);
@@ -4528,7 +4563,6 @@ private:
bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
// Store the result in the cache.
Res.first->getSecond() = Aliased;
- AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
return Aliased;
}
@@ -4587,16 +4621,18 @@ private:
/// List of hashes of vector of loads, which are known to be non vectorizable.
DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
- /// Represents a scheduling entity, either ScheduleData or ScheduleBundle.
- /// ScheduleData used to gather dependecies for a single instructions, while
- /// ScheduleBundle represents a batch of instructions, going to be groupped
- /// together.
+  /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
+  /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
+  /// single instruction, while ScheduleBundle represents a batch of
+  /// instructions that will be grouped together. ScheduleCopyableData models
+  /// an extra user for "copyable" instructions.
class ScheduleEntity {
friend class ScheduleBundle;
friend class ScheduleData;
+ friend class ScheduleCopyableData;
protected:
- enum class Kind { ScheduleData, ScheduleBundle };
+ enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
Kind getKind() const { return K; }
ScheduleEntity(Kind K) : K(K) {}
@@ -4615,17 +4651,79 @@ private:
void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
int getSchedulingPriority() const { return SchedulingPriority; }
bool isReady() const {
- if (auto *SD = dyn_cast<ScheduleData>(this))
+ if (const auto *SD = dyn_cast<ScheduleData>(this))
return SD->isReady();
+ if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
+ return CD->isReady();
return cast<ScheduleBundle>(this)->isReady();
}
+ /// Returns true if the dependency information has been calculated.
+    /// Note that dependency validity can vary between instructions within
+ /// a single bundle.
+ bool hasValidDependencies() const {
+ if (const auto *SD = dyn_cast<ScheduleData>(this))
+ return SD->hasValidDependencies();
+ if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
+ return CD->hasValidDependencies();
+ return cast<ScheduleBundle>(this)->hasValidDependencies();
+ }
+ /// Gets the number of unscheduled dependencies.
+ int getUnscheduledDeps() const {
+ if (const auto *SD = dyn_cast<ScheduleData>(this))
+ return SD->getUnscheduledDeps();
+ if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
+ return CD->getUnscheduledDeps();
+ return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
+ }
+ /// Increments the number of unscheduled dependencies.
+ int incrementUnscheduledDeps(int Incr) {
+ if (auto *SD = dyn_cast<ScheduleData>(this))
+ return SD->incrementUnscheduledDeps(Incr);
+ return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
+ }
+ /// Gets the number of dependencies.
+ int getDependencies() const {
+ if (const auto *SD = dyn_cast<ScheduleData>(this))
+ return SD->getDependencies();
+ return cast<ScheduleCopyableData>(this)->getDependencies();
+ }
+ /// Gets the instruction.
+ Instruction *getInst() const {
+ if (const auto *SD = dyn_cast<ScheduleData>(this))
+ return SD->getInst();
+ return cast<ScheduleCopyableData>(this)->getInst();
+ }
+
/// Gets/sets if the bundle is scheduled.
bool isScheduled() const { return IsScheduled; }
void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
static bool classof(const ScheduleEntity *) { return true; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump(raw_ostream &OS) const {
+ if (const auto *SD = dyn_cast<ScheduleData>(this))
+ return SD->dump(OS);
+ if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
+ return CD->dump(OS);
+ return cast<ScheduleBundle>(this)->dump(OS);
+ }
+
+ LLVM_DUMP_METHOD void dump() const {
+ dump(dbgs());
+ dbgs() << '\n';
+ }
+#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
};
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ friend inline raw_ostream &operator<<(raw_ostream &OS,
+ const BoUpSLP::ScheduleEntity &SE) {
+ SE.dump(OS);
+ return OS;
+ }
+#endif
+
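The isReady()/hasValidDependencies()/getUnscheduledDeps() bodies above dispatch on getKind() via LLVM's hand-rolled RTTI (classof plus dyn_cast). A self-contained sketch of that pattern, with illustrative names and a local stand-in for dyn_cast:

#include <iostream>

// Minimal standalone sketch of the getKind()/classof() dispatch style used by
// the ScheduleEntity hierarchy (names here are illustrative).
struct Entity {
  enum class Kind { Data, Copyable, Bundle };
  explicit Entity(Kind K) : K(K) {}
  Kind getKind() const { return K; }
private:
  Kind K;
};

struct Data : Entity {
  Data() : Entity(Kind::Data) {}
  static bool classof(const Entity *E) { return E->getKind() == Kind::Data; }
  int unscheduledDeps() const { return 3; }
};

struct Copyable : Entity {
  Copyable() : Entity(Kind::Copyable) {}
  static bool classof(const Entity *E) {
    return E->getKind() == Kind::Copyable;
  }
  int unscheduledDeps() const { return 1; }
};

// Hand-rolled stand-in for llvm::dyn_cast.
template <typename To, typename From> To *dynCast(From *F) {
  return To::classof(F) ? static_cast<To *>(F) : nullptr;
}

int unscheduledDeps(Entity *E) {
  if (auto *D = dynCast<Data>(E))
    return D->unscheduledDeps();
  if (auto *C = dynCast<Copyable>(E))
    return C->unscheduledDeps();
  return 0; // a bundle would sum over its members instead
}

int main() {
  Data D;
  Copyable C;
  std::cout << unscheduledDeps(&D) << ' ' << unscheduledDeps(&C) << '\n';
}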
/// Contains all scheduling relevant data for an instruction.
/// A ScheduleData either represents a single instruction or a member of an
/// instruction bundle (= a group of instructions which is combined into a
@@ -4688,10 +4786,18 @@ private:
/// Clears all dependency information.
void clearDependencies() {
- Dependencies = InvalidDeps;
- resetUnscheduledDeps();
+ clearDirectDependencies();
MemoryDependencies.clear();
ControlDependencies.clear();
+ }
+
+  /// Clears direct dependencies only, leaving control and memory
+  /// dependencies intact.
+  /// Required for copyable elements to correctly handle control/memory deps
+  /// and avoid extra recalculation of such deps.
+ void clearDirectDependencies() {
+ Dependencies = InvalidDeps;
+ resetUnscheduledDeps();
IsScheduled = false;
}
@@ -4781,7 +4887,7 @@ private:
class ScheduleBundle final : public ScheduleEntity {
/// The schedule data for the instructions in the bundle.
- SmallVector<ScheduleData *> Bundle;
+ SmallVector<ScheduleEntity *> Bundle;
/// True if this bundle is valid.
bool IsValid = true;
/// The TreeEntry that this instruction corresponds to.
@@ -4797,7 +4903,7 @@ private:
/// Verify basic self consistency properties
void verify() const {
- for (const ScheduleData *SD : Bundle) {
+ for (const ScheduleEntity *SD : Bundle) {
if (SD->hasValidDependencies()) {
assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
"invariant");
@@ -4817,7 +4923,7 @@ private:
int unscheduledDepsInBundle() const {
assert(*this && "bundle must not be empty");
int Sum = 0;
- for (const ScheduleData *BundleMember : Bundle) {
+ for (const ScheduleEntity *BundleMember : Bundle) {
if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
return ScheduleData::InvalidDeps;
Sum += BundleMember->getUnscheduledDeps();
@@ -4829,7 +4935,7 @@ private:
/// Note that dependency validity can vary between instructions within
/// a single bundle.
bool hasValidDependencies() const {
- return all_of(Bundle, [](const ScheduleData *SD) {
+ return all_of(Bundle, [](const ScheduleEntity *SD) {
return SD->hasValidDependencies();
});
}
@@ -4843,10 +4949,10 @@ private:
/// Returns the bundle of scheduling data, associated with the current
/// instruction.
- ArrayRef<ScheduleData *> getBundle() { return Bundle; }
- ArrayRef<const ScheduleData *> getBundle() const { return Bundle; }
+ ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
+ ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
/// Adds an instruction to the bundle.
- void add(ScheduleData *SD) { Bundle.push_back(SD); }
+ void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
/// Gets/sets the associated tree entry.
void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
@@ -4863,8 +4969,11 @@ private:
return;
}
OS << '[';
- interleaveComma(Bundle, OS,
- [&](const ScheduleData *SD) { OS << *SD->getInst(); });
+ interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
+ if (isa<ScheduleCopyableData>(SD))
+ OS << "<Copyable>";
+ OS << *SD->getInst();
+ });
OS << ']';
}
@@ -4883,6 +4992,129 @@ private:
}
#endif
+ /// Contains all scheduling relevant data for the copyable instruction.
+  /// It models the virtual instruction that is supposed to replace the
+  /// original one. E.g., if instruction %0 = load is part of the bundle [%0,
+  /// %1], where %1 = add, then the ScheduleCopyableData models the virtual
+  /// instruction %virt = add %0, 0.
+ class ScheduleCopyableData final : public ScheduleEntity {
+ /// The source schedule data for the instruction.
+ Instruction *Inst = nullptr;
+ /// The edge information for the instruction.
+ const EdgeInfo EI;
+ /// This ScheduleData is in the current scheduling region if this matches
+ /// the current SchedulingRegionID of BlockScheduling.
+ int SchedulingRegionID = 0;
+ /// Bundle, this data is part of.
+ ScheduleBundle &Bundle;
+
+ public:
+ ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
+ const EdgeInfo &EI, ScheduleBundle &Bundle)
+ : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
+ SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
+ static bool classof(const ScheduleEntity *Entity) {
+ return Entity->getKind() == Kind::ScheduleCopyableData;
+ }
+
+ /// Verify basic self consistency properties
+ void verify() {
+ if (hasValidDependencies()) {
+ assert(UnscheduledDeps <= Dependencies && "invariant");
+ } else {
+ assert(UnscheduledDeps == Dependencies && "invariant");
+ }
+
+ if (IsScheduled) {
+ assert(hasValidDependencies() && UnscheduledDeps == 0 &&
+ "unexpected scheduled state");
+ }
+ }
+
+ /// Returns true if the dependency information has been calculated.
+    /// Note that dependency validity can vary between instructions within
+ /// a single bundle.
+ bool hasValidDependencies() const {
+ return Dependencies != ScheduleData::InvalidDeps;
+ }
+
+ /// Returns true if it is ready for scheduling, i.e. it has no more
+ /// unscheduled depending instructions/bundles.
+ bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
+
+    /// Modifies the number of unscheduled dependencies for this instruction
+    /// and returns the number of remaining unscheduled dependencies.
+ int incrementUnscheduledDeps(int Incr) {
+ assert(hasValidDependencies() &&
+ "increment of unscheduled deps would be meaningless");
+ UnscheduledDeps += Incr;
+ assert(UnscheduledDeps >= 0 && "invariant");
+ return UnscheduledDeps;
+ }
+
+ /// Sets the number of unscheduled dependencies to the number of
+ /// dependencies.
+ void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
+
+ /// Gets the number of unscheduled dependencies.
+ int getUnscheduledDeps() const { return UnscheduledDeps; }
+ /// Gets the number of dependencies.
+ int getDependencies() const { return Dependencies; }
+ /// Initializes the number of dependencies.
+ void initDependencies() { Dependencies = 0; }
+ /// Increments the number of dependencies.
+ void incDependencies() { Dependencies++; }
+
+ /// Gets scheduling region ID.
+ int getSchedulingRegionID() const { return SchedulingRegionID; }
+
+ /// Gets the instruction.
+ Instruction *getInst() const { return Inst; }
+
+ /// Clears all dependency information.
+ void clearDependencies() {
+ Dependencies = ScheduleData::InvalidDeps;
+ UnscheduledDeps = ScheduleData::InvalidDeps;
+ IsScheduled = false;
+ }
+
+ /// Gets the edge information.
+ const EdgeInfo &getEdgeInfo() const { return EI; }
+
+ /// Gets the bundle.
+ ScheduleBundle &getBundle() { return Bundle; }
+ const ScheduleBundle &getBundle() const { return Bundle; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
+
+ LLVM_DUMP_METHOD void dump() const {
+ dump(dbgs());
+ dbgs() << '\n';
+ }
+#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+ private:
+    /// The number of dependencies; ScheduleData::InvalidDeps until they are
+    /// computed. These nodes always have only a single dependency.
+ int Dependencies = ScheduleData::InvalidDeps;
+
+ /// The number of dependencies minus the number of dependencies of scheduled
+ /// instructions. As soon as this is zero, the instruction/bundle gets ready
+ /// for scheduling.
+ /// Note that this is negative as long as Dependencies is not calculated.
+ int UnscheduledDeps = ScheduleData::InvalidDeps;
+ };
+
+#ifndef NDEBUG
+ friend inline raw_ostream &
+ operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
+ SD.dump(OS);
+ return OS;
+ }
+#endif
+
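A small standalone sketch of the dependency counters backing ScheduleCopyableData, assuming only what the class above shows: an InvalidDeps sentinel until dependencies are computed, and an unscheduled counter that drains to zero. The real class additionally tracks IsScheduled and its owning bundle.

#include <cassert>
#include <iostream>

// Minimal sketch of the Dependencies/UnscheduledDeps lifecycle (illustrative).
struct CopyableDeps {
  static constexpr int InvalidDeps = -1;
  int Dependencies = InvalidDeps;
  int UnscheduledDeps = InvalidDeps;

  bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
  void initDependencies() { Dependencies = 0; }
  void incDependencies() { ++Dependencies; }
  void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
  // The real isReady() also requires the node not to be scheduled yet.
  bool isReady() const { return UnscheduledDeps == 0; }

  // Returns the remaining unscheduled dependencies after decrementing.
  int incrementUnscheduledDeps(int Incr) {
    assert(hasValidDependencies() && "deps not computed yet");
    UnscheduledDeps += Incr;
    return UnscheduledDeps;
  }
};

int main() {
  CopyableDeps CD;
  CD.initDependencies();
  CD.incDependencies();   // the single user in the bundle
  CD.resetUnscheduledDeps();
  std::cout << CD.isReady() << '\n';                       // 0: one dep left
  std::cout << (CD.incrementUnscheduledDeps(-1) == 0) << '\n'; // 1: now ready
}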
friend struct GraphTraits<BoUpSLP *>;
friend struct DOTGraphTraits<BoUpSLP *>;
@@ -4909,6 +5141,10 @@ private:
void clear() {
ScheduledBundles.clear();
ScheduledBundlesList.clear();
+ ScheduleCopyableDataMap.clear();
+ ScheduleCopyableDataMapByInst.clear();
+ ScheduleCopyableDataMapByInstUser.clear();
+ ScheduleCopyableDataMapByUsers.clear();
ReadyInsts.clear();
ScheduleStart = nullptr;
ScheduleEnd = nullptr;
@@ -4935,7 +5171,7 @@ private:
// Avoid lookup if can't possibly be in map.
return nullptr;
ScheduleData *SD = ScheduleDataMap.lookup(I);
- if (SD && isInSchedulingRegion(SD))
+ if (SD && isInSchedulingRegion(*SD))
return SD;
return nullptr;
}
@@ -4944,6 +5180,201 @@ private:
return getScheduleData(dyn_cast<Instruction>(V));
}
+ /// Returns the ScheduleCopyableData for the given edge (user tree entry and
+ /// operand number) and value.
+ ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
+ const Value *V) const {
+ if (ScheduleCopyableDataMap.empty())
+ return nullptr;
+ auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
+ if (It == ScheduleCopyableDataMap.end())
+ return nullptr;
+ ScheduleCopyableData *SD = It->getSecond().get();
+ if (!isInSchedulingRegion(*SD))
+ return nullptr;
+ return SD;
+ }
+
+ /// Returns the ScheduleCopyableData for the given user \p User, operand
+ /// number and operand \p V.
+ SmallVector<ScheduleCopyableData *>
+ getScheduleCopyableData(const Value *User, unsigned OperandIdx,
+ const Value *V) {
+ if (ScheduleCopyableDataMapByInstUser.empty())
+ return {};
+ const auto It = ScheduleCopyableDataMapByInstUser.find(
+ std::make_pair(std::make_pair(User, OperandIdx), V));
+ if (It == ScheduleCopyableDataMapByInstUser.end())
+ return {};
+ SmallVector<ScheduleCopyableData *> Res;
+ for (ScheduleCopyableData *SD : It->getSecond()) {
+ if (isInSchedulingRegion(*SD))
+ Res.push_back(SD);
+ }
+ return Res;
+ }
+
+ /// Returns true if all operands of the given instruction \p User are
+ /// replaced by copyable data.
+ /// \param User The user instruction.
+ /// \param Op The operand, which might be replaced by the copyable data.
+ /// \param SLP The SLP tree.
+ /// \param NumOps The number of operands used. If the instruction uses the
+ /// same operand several times, check for the first use, then the second,
+ /// etc.
+ bool areAllOperandsReplacedByCopyableData(Instruction *User,
+ Instruction *Op, BoUpSLP &SLP,
+ unsigned NumOps) const {
+ assert(NumOps > 0 && "No operands");
+ if (ScheduleCopyableDataMap.empty())
+ return false;
+ SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
+ SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
+ for (const Use &U : User->operands()) {
+ if (U.get() != Op)
+ continue;
+ ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
+ if (Entries.empty())
+ return false;
+ // Check all tree entries, if they have operands replaced by copyable
+ // data.
+ for (TreeEntry *TE : Entries) {
+ // Check if the user is commutative.
+          // Commutative users are handled later, as their operands can be
+          // reordered.
+          // The same applies to non-commutative cmps, because we can
+          // potentially invert their predicate and thus reorder the operands.
+ bool IsCommutativeUser =
+ ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
+ EdgeInfo EI(TE, U.getOperandNo());
+ if (!IsCommutativeUser && !isa<CmpInst>(User)) {
+ unsigned &OpCnt =
+ OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
+ if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
+ return false;
+ // Found copyable operand - continue.
+ ++OpCnt;
+ continue;
+ }
+ ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
+ .first->getSecond();
+ }
+ }
+ // Check the commutative/cmp entries.
+ if (!PotentiallyReorderedEntriesCount.empty()) {
+ for (auto &P : PotentiallyReorderedEntriesCount) {
+ auto *It = find(P.first->Scalars, User);
+ assert(It != P.first->Scalars.end() &&
+ "User is not in the tree entry");
+ int Lane = std::distance(P.first->Scalars.begin(), It);
+ assert(Lane >= 0 && "Lane is not found");
+ if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
+ Lane = P.first->ReorderIndices[Lane];
+ assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
+ "Couldn't find extract lane");
+ SmallVector<unsigned> OpIndices;
+ for (unsigned OpIdx :
+ seq<unsigned>(::getNumberOfPotentiallyCommutativeOps(
+ P.first->getMainOp()))) {
+ if (P.first->getOperand(OpIdx)[Lane] == Op &&
+ getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
+ --P.getSecond();
+ }
+ }
+ return all_of(PotentiallyReorderedEntriesCount,
+ [&](const std::pair<const TreeEntry *, unsigned> &P) {
+ return P.second == NumOps - 1;
+ });
+ }
+ return true;
+ }
+
+ SmallVector<ScheduleCopyableData *>
+ getScheduleCopyableData(const Instruction *I) const {
+ if (ScheduleCopyableDataMapByInst.empty())
+ return {};
+ const auto It = ScheduleCopyableDataMapByInst.find(I);
+ if (It == ScheduleCopyableDataMapByInst.end())
+ return {};
+ SmallVector<ScheduleCopyableData *> Res;
+ for (ScheduleCopyableData *SD : It->getSecond()) {
+ if (isInSchedulingRegion(*SD))
+ Res.push_back(SD);
+ }
+ return Res;
+ }
+
+ SmallVector<ScheduleCopyableData *>
+ getScheduleCopyableDataUsers(const Instruction *User) const {
+ if (ScheduleCopyableDataMapByUsers.empty())
+ return {};
+ const auto It = ScheduleCopyableDataMapByUsers.find(User);
+ if (It == ScheduleCopyableDataMapByUsers.end())
+ return {};
+ SmallVector<ScheduleCopyableData *> Res;
+ for (ScheduleCopyableData *SD : It->getSecond()) {
+ if (isInSchedulingRegion(*SD))
+ Res.push_back(SD);
+ }
+ return Res;
+ }
+
+ ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
+ Instruction *I,
+ int SchedulingRegionID,
+ ScheduleBundle &Bundle) {
+ assert(!getScheduleCopyableData(EI, I) && "already in the map");
+ ScheduleCopyableData *CD =
+ ScheduleCopyableDataMap
+ .try_emplace(std::make_pair(EI, I),
+ std::make_unique<ScheduleCopyableData>(
+ SchedulingRegionID, I, EI, Bundle))
+ .first->getSecond()
+ .get();
+ ScheduleCopyableDataMapByInst[I].push_back(CD);
+ if (EI.UserTE) {
+ ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
+ const auto *It = find(Op, I);
+ assert(It != Op.end() && "Lane not set");
+ SmallPtrSet<Instruction *, 4> Visited;
+ do {
+ int Lane = std::distance(Op.begin(), It);
+ assert(Lane >= 0 && "Lane not set");
+ if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
+ !EI.UserTE->ReorderIndices.empty())
+ Lane = EI.UserTE->ReorderIndices[Lane];
+ assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
+ "Couldn't find extract lane");
+ auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
+ if (!Visited.insert(In).second) {
+ It = find(make_range(std::next(It), Op.end()), I);
+ continue;
+ }
+ ScheduleCopyableDataMapByInstUser
+ .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
+ .first->getSecond()
+ .push_back(CD);
+ ScheduleCopyableDataMapByUsers.try_emplace(I)
+ .first->getSecond()
+ .insert(CD);
+          // Remove extra deps for users that become non-immediate users of
+          // the instruction. This may happen if a chain of the same copyable
+          // elements appears in the tree.
+ if (In == I) {
+ EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
+ if (ScheduleCopyableData *UserCD =
+ getScheduleCopyableData(UserEI, In))
+ ScheduleCopyableDataMapByUsers[I].remove(UserCD);
+ }
+ It = find(make_range(std::next(It), Op.end()), I);
+ } while (It != Op.end());
+ } else {
+ ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
+ CD);
+ }
+ return *CD;
+ }
+
ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
auto *I = dyn_cast<Instruction>(V);
if (!I)
@@ -4954,34 +5385,44 @@ private:
return It->getSecond();
}
- bool isInSchedulingRegion(ScheduleData *SD) const {
- return SD->getSchedulingRegionID() == SchedulingRegionID;
- }
-
- bool isInSchedulingRegion(const ScheduleBundle &Bundle) const {
- return all_of(Bundle.getBundle(), [&](const ScheduleData *BundleMember) {
- return BundleMember->getSchedulingRegionID() == SchedulingRegionID;
- });
+ /// Returns true if the entity is in the scheduling region.
+ bool isInSchedulingRegion(const ScheduleEntity &SD) const {
+ if (const auto *Data = dyn_cast<ScheduleData>(&SD))
+ return Data->getSchedulingRegionID() == SchedulingRegionID;
+ if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
+ return CD->getSchedulingRegionID() == SchedulingRegionID;
+ return all_of(cast<ScheduleBundle>(SD).getBundle(),
+ [&](const ScheduleEntity *BundleMember) {
+ return isInSchedulingRegion(*BundleMember);
+ });
}
/// Marks an instruction as scheduled and puts all dependent ready
/// instructions into the ready-list.
template <typename ReadyListType>
- void schedule(ScheduleEntity *Data, ReadyListType &ReadyList) {
- auto ProcessBundleMember = [&](ScheduleData *BundleMember,
- ScheduleBundle *Bundle) {
+ void schedule(const BoUpSLP &R, const InstructionsState &S,
+ const EdgeInfo &EI, ScheduleEntity *Data,
+ ReadyListType &ReadyList) {
+ auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
+ ArrayRef<ScheduleBundle *> Bundles) {
// Handle the def-use chain dependencies.
// Decrement the unscheduled counter and insert to ready list if ready.
- auto DecrUnsched = [&](ScheduleData *Data, bool IsControl = false) {
+ auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
if ((IsControl || Data->hasValidDependencies()) &&
Data->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after
// decrementing, so we can put the dependent instruction
// into the ready list.
- if (ArrayRef<ScheduleBundle *> Bundles =
- getScheduleBundles(Data->getInst());
- !Bundles.empty()) {
+ SmallVector<ScheduleBundle *, 1> CopyableBundle;
+ ArrayRef<ScheduleBundle *> Bundles;
+ if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
+ CopyableBundle.push_back(&CD->getBundle());
+ Bundles = CopyableBundle;
+ } else {
+ Bundles = getScheduleBundles(Data->getInst());
+ }
+ if (!Bundles.empty()) {
for (ScheduleBundle *Bundle : Bundles) {
if (Bundle->unscheduledDepsInBundle() == 0) {
assert(!Bundle->isScheduled() &&
@@ -4995,12 +5436,23 @@ private:
}
assert(!Data->isScheduled() &&
"already scheduled bundle gets ready");
+ assert(!isa<ScheduleCopyableData>(Data) &&
+ "Expected non-copyable data");
ReadyList.insert(Data);
LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
}
};
- auto DecrUnschedForInst = [&](Instruction *I) {
+ auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
+ Instruction *I) {
+ if (!ScheduleCopyableDataMap.empty()) {
+ SmallVector<ScheduleCopyableData *> CopyableData =
+ getScheduleCopyableData(User, OpIdx, I);
+ for (ScheduleCopyableData *CD : CopyableData)
+ DecrUnsched(CD, /*IsControl=*/false);
+ if (!CopyableData.empty())
+ return;
+ }
if (ScheduleData *OpSD = getScheduleData(I))
DecrUnsched(OpSD, /*IsControl=*/false);
};
@@ -5008,45 +5460,101 @@ private:
// If BundleMember is a vector bundle, its operands may have been
// reordered during buildTree(). We therefore need to get its operands
// through the TreeEntry.
- if (Bundle) {
- // Need to search for the lane since the tree entry can be reordered.
+ if (!Bundles.empty()) {
auto *In = BundleMember->getInst();
- int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
- find(Bundle->getTreeEntry()->Scalars, In));
- assert(Lane >= 0 && "Lane not set");
-
- // Since vectorization tree is being built recursively this assertion
- // ensures that the tree entry has all operands set before reaching
- // this code. Couple of exceptions known at the moment are extracts
- // where their second (immediate) operand is not added. Since
- // immediates do not affect scheduler behavior this is considered
- // okay.
- assert(In &&
- (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
- In->getNumOperands() ==
- Bundle->getTreeEntry()->getNumOperands()) &&
- "Missed TreeEntry operands?");
-
- for (unsigned OpIdx :
- seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
- if (auto *I = dyn_cast<Instruction>(
- Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
- LLVM_DEBUG(dbgs()
- << "SLP: check for readiness (def): " << *I << "\n");
- DecrUnschedForInst(I);
+ // Count uses of each instruction operand.
+ SmallDenseMap<const Instruction *, unsigned> OperandsUses;
+ unsigned TotalOpCount = 0;
+ if (isa<ScheduleCopyableData>(BundleMember)) {
+ // Copyable data is used only once (uses itself).
+ TotalOpCount = OperandsUses[In] = 1;
+ } else {
+ for (const Use &U : In->operands()) {
+ if (auto *I = dyn_cast<Instruction>(U.get())) {
+ auto Res = OperandsUses.try_emplace(I, 0);
+ ++Res.first->getSecond();
+ ++TotalOpCount;
+ }
+ }
+ }
+ // Decrement the unscheduled counter and insert to ready list if
+ // ready.
+ auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
+ unsigned OpIdx) {
+ if (!ScheduleCopyableDataMap.empty()) {
+ const EdgeInfo EI = {UserTE, OpIdx};
+ if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
+ DecrUnsched(CD, /*IsControl=*/false);
+ return;
+ }
+ }
+ auto It = OperandsUses.find(I);
+ assert(It != OperandsUses.end() && "Operand not found");
+ if (It->second > 0) {
+ --It->getSecond();
+ assert(TotalOpCount > 0 && "No more operands to decrement");
+ --TotalOpCount;
+ if (ScheduleData *OpSD = getScheduleData(I))
+ DecrUnsched(OpSD, /*IsControl=*/false);
}
+ };
+
+ for (ScheduleBundle *Bundle : Bundles) {
+ if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
+ break;
+ // Need to search for the lane since the tree entry can be
+ // reordered.
+ int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
+ find(Bundle->getTreeEntry()->Scalars, In));
+ assert(Lane >= 0 && "Lane not set");
+ if (isa<StoreInst>(In) &&
+ !Bundle->getTreeEntry()->ReorderIndices.empty())
+ Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
+ assert(Lane < static_cast<int>(
+ Bundle->getTreeEntry()->Scalars.size()) &&
+ "Couldn't find extract lane");
+
+ // Since vectorization tree is being built recursively this
+ // assertion ensures that the tree entry has all operands set before
+ // reaching this code. Couple of exceptions known at the moment are
+ // extracts where their second (immediate) operand is not added.
+ // Since immediates do not affect scheduler behavior this is
+ // considered okay.
+ assert(In &&
+ (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
+ In->getNumOperands() ==
+ Bundle->getTreeEntry()->getNumOperands() ||
+ Bundle->getTreeEntry()->isCopyableElement(In)) &&
+ "Missed TreeEntry operands?");
+
+ for (unsigned OpIdx :
+ seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
+ if (auto *I = dyn_cast<Instruction>(
+ Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
+ LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I
+ << "\n");
+ DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
+ }
+ }
} else {
// If BundleMember is a stand-alone instruction, no operand reordering
// has taken place, so we directly access its operands.
- for (Use &U : BundleMember->getInst()->operands())
+ for (Use &U : BundleMember->getInst()->operands()) {
if (auto *I = dyn_cast<Instruction>(U.get())) {
LLVM_DEBUG(dbgs()
<< "SLP: check for readiness (def): " << *I << "\n");
- DecrUnschedForInst(I);
+ DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
}
+ }
}
// Handle the memory dependencies.
- for (ScheduleData *MemoryDep : BundleMember->getMemoryDependencies()) {
+ auto *SD = dyn_cast<ScheduleData>(BundleMember);
+ if (!SD)
+ return;
+ SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
+ for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
+ if (!VisitedMemory.insert(MemoryDep).second)
+ continue;
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
@@ -5054,7 +5562,10 @@ private:
DecrUnsched(MemoryDep);
}
// Handle the control dependencies.
- for (ScheduleData *Dep : BundleMember->getControlDependencies()) {
+ SmallPtrSet<const ScheduleData *, 4> VisitedControl;
+ for (ScheduleData *Dep : SD->getControlDependencies()) {
+ if (!VisitedControl.insert(Dep).second)
+ continue;
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
LLVM_DEBUG(dbgs()
@@ -5065,23 +5576,29 @@ private:
if (auto *SD = dyn_cast<ScheduleData>(Data)) {
SD->setScheduled(/*Scheduled=*/true);
LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
- ProcessBundleMember(SD, nullptr);
+ ProcessBundleMember(SD, {});
} else {
ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
Bundle.setScheduled(/*Scheduled=*/true);
LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
- auto AreAllBundlesScheduled = [&](const ScheduleData *SD) {
- ArrayRef<ScheduleBundle *> SDBundles =
- getScheduleBundles(SD->getInst());
- return !SDBundles.empty() &&
- all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
- return SDBundle->isScheduled();
- });
- };
- for (ScheduleData *SD : Bundle.getBundle()) {
- if (AreAllBundlesScheduled(SD)) {
+ auto AreAllBundlesScheduled =
+ [&](const ScheduleEntity *SD,
+ ArrayRef<ScheduleBundle *> SDBundles) {
+ if (isa<ScheduleCopyableData>(SD))
+ return true;
+ return !SDBundles.empty() &&
+ all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
+ return SDBundle->isScheduled();
+ });
+ };
+ for (ScheduleEntity *SD : Bundle.getBundle()) {
+ ArrayRef<ScheduleBundle *> SDBundles;
+ if (!isa<ScheduleCopyableData>(SD))
+ SDBundles = getScheduleBundles(SD->getInst());
+ if (AreAllBundlesScheduled(SD, SDBundles)) {
SD->setScheduled(/*Scheduled=*/true);
- ProcessBundleMember(SD, &Bundle);
+ ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
+ : SDBundles);
}
}
}
@@ -5109,7 +5626,7 @@ private:
auto *SD = getScheduleData(I);
if (!SD)
continue;
- assert(isInSchedulingRegion(SD) &&
+ assert(isInSchedulingRegion(*SD) &&
"primary schedule data not in window?");
SD->verify();
}
@@ -5150,8 +5667,11 @@ private:
/// Build a bundle from the ScheduleData nodes corresponding to the
/// scalar instruction for each lane.
+ /// \param VL The list of scalar instructions.
+ /// \param S The state of the instructions.
+ /// \param EI The edge in the SLP graph or the user node/operand number.
ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
- const InstructionsState &S);
+ const InstructionsState &S, const EdgeInfo &EI);
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
@@ -5160,7 +5680,7 @@ private:
/// std::nullopt if \p VL is allowed to be scheduled.
std::optional<ScheduleBundle *>
tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S);
+ const InstructionsState &S, const EdgeInfo &EI);
/// Allocates schedule data chunk.
ScheduleData *allocateScheduleDataChunks();
@@ -5178,7 +5698,8 @@ private:
/// Updates the dependency information of a bundle and of all instructions/
/// bundles which depend on the original bundle.
void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
- BoUpSLP *SLP);
+ BoUpSLP *SLP,
+ ArrayRef<ScheduleData *> ControlDeps = {});
/// Sets all instruction in the scheduling region to un-scheduled.
void resetSchedule();
@@ -5200,6 +5721,48 @@ private:
/// ScheduleData structures are recycled.
SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
+ /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
+    /// number) and the operand instruction, represented as a copyable element.
+ SmallDenseMap<std::pair<EdgeInfo, const Value *>,
+ std::unique_ptr<ScheduleCopyableData>>
+ ScheduleCopyableDataMap;
+
+    /// Represents the mapping between an instruction and all related
+    /// ScheduleCopyableData (for all uses in the tree, represented as copyable
+    /// elements). The SLP tree may contain several representations of the same
+    /// instruction.
+ SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
+ ScheduleCopyableDataMapByInst;
+
+    /// Represents the mapping between a user value plus operand number, the
+    /// operand value, and all related ScheduleCopyableData. The relation is
+    /// 1:n, because the same user may reference the same operand in different
+    /// tree entries and the operand may be modeled by different copyable data
+    /// elements.
+ SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
+ SmallVector<ScheduleCopyableData *>>
+ ScheduleCopyableDataMapByInstUser;
+
+    /// Represents the mapping between an instruction and all related
+ /// ScheduleCopyableData. It represents the mapping between the actual
+ /// instruction and the last copyable data element in the chain. E.g., if
+ /// the graph models the following instructions:
+ /// %0 = non-add instruction ...
+ /// ...
+ /// %4 = add %3, 1
+ /// %5 = add %4, 1
+ /// %6 = insertelement poison, %0, 0
+ /// %7 = insertelement %6, %5, 1
+ /// And the graph is modeled as:
+ /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
+ /// -> [1, 0] -> [%1, 0]
+ ///
+ /// this map will map %0 only to the copyable element <1>, which is the last
+    /// user (the direct user of the actual instruction). <0> uses <1>, so the
+    /// mapping to <0> is kept on <1>, not on %0.
+ SmallDenseMap<const Instruction *,
+ SmallSetVector<ScheduleCopyableData *, 4>>
+ ScheduleCopyableDataMapByUsers;
+
/// Attaches ScheduleBundle to Instruction.
SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
ScheduledBundles;
@@ -5246,7 +5809,7 @@ private:
/// Performs the "real" scheduling. Done before vectorization is actually
/// performed in a basic block.
- void scheduleBlock(BlockScheduling *BS);
+ void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
/// List of users to ignore during scheduling and that don't need extracting.
const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
@@ -5319,6 +5882,30 @@ private:
} // end namespace slpvectorizer
+template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
+ using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
+ using SecondInfo = DenseMapInfo<unsigned>;
+ static BoUpSLP::EdgeInfo getEmptyKey() {
+ return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
+ SecondInfo::getEmptyKey());
+ }
+
+ static BoUpSLP::EdgeInfo getTombstoneKey() {
+ return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
+ SecondInfo::getTombstoneKey());
+ }
+
+ static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
+ return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
+ SecondInfo::getHashValue(Val.EdgeIdx));
+ }
+
+ static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
+ const BoUpSLP::EdgeInfo &RHS) {
+ return LHS == RHS;
+ }
+};
+
template <> struct GraphTraits<BoUpSLP *> {
using TreeEntry = BoUpSLP::TreeEntry;
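The specialization above lets EdgeInfo participate in DenseMap keys such as the std::pair<EdgeInfo, const Value *> key of ScheduleCopyableDataMap. A minimal, self-contained analogue of the same pattern follows; it is an illustrative sketch only (not part of the patch), with Edge acting as a hypothetical stand-in for EdgeInfo.

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include <utility>

struct Edge {
  void *UserTE;  // stand-in for TreeEntry *
  unsigned Idx;  // operand number
  bool operator==(const Edge &Other) const {
    return UserTE == Other.UserTE && Idx == Other.Idx;
  }
};

namespace llvm {
template <> struct DenseMapInfo<Edge> {
  static Edge getEmptyKey() {
    return {DenseMapInfo<void *>::getEmptyKey(),
            DenseMapInfo<unsigned>::getEmptyKey()};
  }
  static Edge getTombstoneKey() {
    return {DenseMapInfo<void *>::getTombstoneKey(),
            DenseMapInfo<unsigned>::getTombstoneKey()};
  }
  static unsigned getHashValue(const Edge &E) {
    return detail::combineHashValue(
        DenseMapInfo<void *>::getHashValue(E.UserTE),
        DenseMapInfo<unsigned>::getHashValue(E.Idx));
  }
  static bool isEqual(const Edge &LHS, const Edge &RHS) { return LHS == RHS; }
};
} // namespace llvm

// DenseMapInfo<std::pair<T, U>> is already provided in terms of the element
// traits, so pair keys containing Edge now work out of the box.
static llvm::SmallDenseMap<std::pair<Edge, const int *>, int> ExampleMap;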
@@ -7195,12 +7782,45 @@ bool BoUpSLP::isProfitableToReorder() const {
// Check if the tree has only single store and single (unordered) load node,
// other nodes are phis or geps/binops, combined with phis, and/or single
// gather load node
- bool HasPhis = false;
if (VectorizableTree.front()->hasState() &&
VectorizableTree.front()->getOpcode() == Instruction::PHI &&
VectorizableTree.front()->Scalars.size() == TinyVF &&
VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
return false;
+ // Single node which requires reordering - skip.
+ if (VectorizableTree.front()->hasState() &&
+ VectorizableTree.front()->getOpcode() == Instruction::Store &&
+ VectorizableTree.front()->ReorderIndices.empty()) {
+ const unsigned ReorderedSplitsCnt =
+ count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->State == TreeEntry::SplitVectorize &&
+ !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
+ TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
+ ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
+ });
+ if (ReorderedSplitsCnt <= 1 &&
+ static_cast<unsigned>(count_if(
+ VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return ((!TE->isGather() &&
+ (TE->ReorderIndices.empty() ||
+ (TE->UserTreeIndex.UserTE &&
+ TE->UserTreeIndex.UserTE->State ==
+ TreeEntry::Vectorize &&
+ !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
+ .empty()))) ||
+ (TE->isGather() && TE->ReorderIndices.empty() &&
+ (!TE->hasState() || TE->isAltShuffle() ||
+ TE->getOpcode() == Instruction::Load ||
+ TE->getOpcode() == Instruction::ZExt ||
+ TE->getOpcode() == Instruction::SExt))) &&
+ (VectorizableTree.front()->getVectorFactor() > TinyVF ||
+ !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
+ return !isConstant(V) && isVectorized(V);
+ }));
+ })) >= VectorizableTree.size() - ReorderedSplitsCnt)
+ return false;
+ }
+ bool HasPhis = false;
bool HasLoad = true;
unsigned GatherLoads = 0;
for (const std::unique_ptr<TreeEntry> &TE :
@@ -9772,7 +10392,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
}))) {
if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
S.getMainOp()->isSafeToRemove() &&
- all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
+ (S.areInstructionsWithCopyableElements() ||
+ all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
// Find the number of elements, which forms full vectors.
unsigned PWSz = getFullVectorNumberOfElements(
TTI, UniqueValues.front()->getType(), UniqueValues.size());
@@ -9789,8 +10410,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
PaddedUniqueValues.append(
PWSz - UniqueValues.size(),
PoisonValue::get(UniqueValues.front()->getType()));
- // Check that extended with poisons operations are still valid for
- // vectorization (div/rem are not allowed).
+ // Check that operations extended with poisons/copyables are still valid
+ // for vectorization (div/rem are not allowed).
if (!S.areInstructionsWithCopyableElements() &&
!getSameOpcode(PaddedUniqueValues, TLI).valid()) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
@@ -9952,35 +10573,41 @@ class InstructionsCompatibilityAnalysis {
unsigned MainOpcode = 0;
Instruction *MainOp = nullptr;
+ /// Checks if the opcode is supported as the main opcode for copyable
+ /// elements.
+ static bool isSupportedOpcode(const unsigned Opcode) {
+ return Opcode == Instruction::Add || Opcode == Instruction::LShr;
+ }
+
/// Identifies the best candidate value, which represents main opcode
/// operation.
/// Currently the best candidate is the Add instruction with the parent
/// block with the highest DFS incoming number (block, that dominates other).
- void findAndSetMainInstruction(ArrayRef<Value *> VL) {
+ void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
BasicBlock *Parent = nullptr;
// Checks if the instruction has supported opcode.
- auto IsSupportedOpcode = [](Instruction *I) {
- return I && I->getOpcode() == Instruction::Add;
+ auto IsSupportedInstruction = [&](Instruction *I) {
+ return I && isSupportedOpcode(I->getOpcode()) &&
+ (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
};
+ // Exclude operand instructions immediately to improve compile time; they
+ // cannot be scheduled anyway.
SmallDenseSet<Value *, 8> Operands;
+ SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
for (Value *V : VL) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
if (!DT.isReachableFromEntry(I->getParent()))
continue;
- if (!MainOp) {
- MainOp = I;
+ if (Candidates.empty()) {
+ Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
Parent = I->getParent();
Operands.insert(I->op_begin(), I->op_end());
continue;
}
if (Parent == I->getParent()) {
- if (!IsSupportedOpcode(MainOp))
- MainOp = I;
- if (MainOp->getOpcode() == I->getOpcode() &&
- doesNotNeedToBeScheduled(MainOp) && !doesNotNeedToBeScheduled(I))
- MainOp = I;
+ Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
Operands.insert(I->op_begin(), I->op_end());
continue;
}
@@ -9992,24 +10619,35 @@ class InstructionsCompatibilityAnalysis {
(NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
"Different nodes should have different DFS numbers");
if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
- MainOp = I;
+ Candidates.clear();
+ Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
Parent = I->getParent();
Operands.clear();
Operands.insert(I->op_begin(), I->op_end());
}
}
- if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
- MainOp = nullptr;
- return;
+ unsigned BestOpcodeNum = 0;
+ MainOp = nullptr;
+ for (const auto &P : Candidates) {
+ if (P.second.size() < BestOpcodeNum)
+ continue;
+ for (Instruction *I : P.second) {
+ if (IsSupportedInstruction(I) && !Operands.contains(I)) {
+ MainOp = I;
+ BestOpcodeNum = P.second.size();
+ break;
+ }
+ }
}
- MainOpcode = MainOp->getOpcode();
+ if (MainOp)
+ MainOpcode = MainOp->getOpcode();
}
/// Returns the idempotent value for the \p MainOp with the detected \p
/// MainOpcode. For Add, returns 0. For Or, it should choose between false and
/// the operand itself, since V or V == V.
Value *selectBestIdempotentValue() const {
- assert(MainOpcode == Instruction::Add && "Unsupported opcode");
+ assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
!MainOp->isCommutative());
}
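For context, here is a minimal sketch of the identity lookup this helper relies on. It is illustrative only (not part of the patch) and identityFor is a hypothetical name: for the commutative Add the identity 0 works on either side, while the non-commutative LShr only has a right-hand identity (a zero shift amount), which is why the AllowRHSConstant flag is derived from commutativity.

#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"

// Hypothetical helper: the constant used to model a copyable lane `x` as
// `x <op> identity` (e.g. `add x, 0` or `lshr x, 0`).
static llvm::Constant *identityFor(unsigned Opcode, llvm::Type *Ty,
                                   bool IsCommutative) {
  // Non-commutative opcodes such as LShr only have a right-hand identity, so
  // allow RHS-only constants for them.
  return llvm::ConstantExpr::getBinOpIdentity(
      Opcode, Ty, /*AllowRHSConstant=*/!IsCommutative);
}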
@@ -10022,13 +10660,8 @@ class InstructionsCompatibilityAnalysis {
return {V, V};
if (!S.isCopyableElement(V))
return convertTo(cast<Instruction>(V), S).second;
- switch (MainOpcode) {
- case Instruction::Add:
- return {V, selectBestIdempotentValue()};
- default:
- break;
- }
- llvm_unreachable("Unsupported opcode");
+ assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
+ return {V, selectBestIdempotentValue()};
}
/// Builds operands for the original instructions.
@@ -10202,16 +10835,10 @@ public:
return S;
if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
return S;
- findAndSetMainInstruction(VL);
+ findAndSetMainInstruction(VL, R);
if (!MainOp)
return InstructionsState::invalid();
S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
- // TODO: Remove this check once support for schulable copyables is landed.
- if (any_of(VL, [&](Value *V) {
- return S.isCopyableElement(V) && !S.isNonSchedulable(V);
- }))
- return InstructionsState::invalid();
-
if (!WithProfitabilityCheck)
return S;
// Check if it is profitable to vectorize the instruction.
@@ -10247,6 +10874,21 @@ public:
}
if (!Res)
return InstructionsState::invalid();
+ constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
+ InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
+ InstructionCost VectorCost;
+ FixedVectorType *VecTy =
+ getWidenedType(S.getMainOp()->getType(), VL.size());
+ switch (MainOpcode) {
+ case Instruction::Add:
+ case Instruction::LShr:
+ VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
+ break;
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ if (VectorCost > ScalarCost)
+ return InstructionsState::invalid();
return S;
}
assert(Operands.size() == 2 && "Unexpected number of operands!");
@@ -10731,7 +11373,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
SetVector<Value *> UniqueValues(llvm::from_range, VL);
std::optional<ScheduleBundle *> BundlePtr =
- BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S);
+ BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
// Make sure we didn't break any internal invariants
BS.verify();
@@ -11991,6 +12633,8 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
}
}
+/// Check if we can convert an fadd/fsub sequence to FMA.
+/// \returns Cost of the FMA if the conversion is possible, invalid cost
+/// otherwise.
static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
const InstructionsState &S,
DominatorTree &DT, const DataLayout &DL,
@@ -12010,7 +12654,8 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
- // TODO: support for copyable elements.
+ if (S.isCopyableElement(I))
+ continue;
Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
continue;
@@ -12028,6 +12673,7 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
if (!OpS.valid())
return InstructionCost::getInvalid();
+
if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
return InstructionCost::getInvalid();
if (!CheckForContractable(Operands.front()))
@@ -12042,15 +12688,19 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
- if (auto *FPCI = dyn_cast<FPMathOperator>(I))
- FMF &= FPCI->getFastMathFlags();
+ if (!S.isCopyableElement(I))
+ if (auto *FPCI = dyn_cast<FPMathOperator>(I))
+ FMF &= FPCI->getFastMathFlags();
FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
}
unsigned NumOps = 0;
for (auto [V, Op] : zip(VL, Operands.front())) {
+ if (S.isCopyableElement(V))
+ continue;
auto *I = dyn_cast<Instruction>(Op);
- if (!I || !I->hasOneUse()) {
- FMACost += TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+ if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
+ if (auto *OpI = dyn_cast<Instruction>(V))
+ FMACost += TTI.getInstructionCost(OpI, CostKind);
if (I)
FMACost += TTI.getInstructionCost(I, CostKind);
continue;
@@ -14687,6 +15337,31 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
})))))
return true;
+ // If the tree contains only a buildvector, 2 non-buildvector nodes (whose
+ // user is the root tree node) and other buildvectors, we can skip it.
+ if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
+ VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
+ VectorizableTree.size() >= Limit &&
+ count_if(ArrayRef(VectorizableTree).drop_front(),
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ return !TE->isGather() && TE->UserTreeIndex.UserTE &&
+ TE->UserTreeIndex.UserTE->Idx == 0;
+ }) == 2)
+ return true;
+
+ // If the tree contains only the vectorization of a phi node feeding the
+ // buildvector - skip it.
+ if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
+ VectorizableTree.size() > 2 &&
+ VectorizableTree.front()->State == TreeEntry::Vectorize &&
+ VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
+ VectorizableTree[1]->State == TreeEntry::Vectorize &&
+ VectorizableTree[1]->getOpcode() == Instruction::PHI &&
+ all_of(
+ ArrayRef(VectorizableTree).drop_front(2),
+ [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
+ return true;
+
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
if (VectorizableTree.size() >= MinTreeSize)
@@ -19234,7 +19909,7 @@ Value *BoUpSLP::vectorizeTree(
EntryToLastInstruction.clear();
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules)
- scheduleBlock(BSIter.second.get());
+ scheduleBlock(*this, BSIter.second.get());
// Cache last instructions for the nodes to avoid side effects, which may
// appear during vectorization, like extra uses, etc.
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
@@ -20041,24 +20716,29 @@ void BoUpSLP::optimizeGatherSequence() {
GatherShuffleExtractSeq.clear();
}
-BoUpSLP::ScheduleBundle &
-BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL,
- const InstructionsState &S) {
+BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
+ ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
auto &BundlePtr =
ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (S.isNonSchedulable(V))
continue;
- if (S.isCopyableElement(V))
+ auto *I = cast<Instruction>(V);
+ if (S.isCopyableElement(V)) {
+ // Add a copyable element model.
+ ScheduleCopyableData &SD =
+ addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
+ // Group the instructions to a bundle.
+ BundlePtr->add(&SD);
continue;
+ }
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember && "no ScheduleData for bundle member "
"(maybe not in same basic block)");
// Group the instructions to a bundle.
BundlePtr->add(BundleMember);
- ScheduledBundles.try_emplace(cast<Instruction>(V))
- .first->getSecond()
- .push_back(BundlePtr.get());
+ ScheduledBundles.try_emplace(I).first->getSecond().push_back(
+ BundlePtr.get());
}
assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
return *BundlePtr;
@@ -20068,7 +20748,8 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL,
// and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S) {
+ const InstructionsState &S,
+ const EdgeInfo &EI) {
// No need to schedule PHIs, insertelement, extractelement and extractvalue
// instructions.
bool HasCopyables = S.areInstructionsWithCopyableElements();
@@ -20078,33 +20759,83 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
return nullptr;
- // TODO Remove once full support for copyables is landed.
- assert(all_of(VL,
- [&](Value *V) {
- return !S.isCopyableElement(V) || S.isNonSchedulable(V);
- }) &&
- "Copyable elements should not be schedulable");
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
+ // Clear deps or recalculate the region if the memory instruction is a
+ // copyable element. It may have memory deps, which must be recalculated.
+ SmallVector<ScheduleData *> ControlDependentMembers;
+ auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
+ SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
+ for (ScheduleEntity *SE : Bundle.getBundle()) {
+ if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
+ if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
+ BundleMember && BundleMember->hasValidDependencies()) {
+ BundleMember->clearDirectDependencies();
+ if (RegionHasStackSave ||
+ !isGuaranteedToTransferExecutionToSuccessor(
+ BundleMember->getInst()))
+ ControlDependentMembers.push_back(BundleMember);
+ }
+ continue;
+ }
+ auto *SD = cast<ScheduleData>(SE);
+ for (const Use &U : SD->getInst()->operands()) {
+ unsigned &NumOps =
+ UserOpToNumOps
+ .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
+ .first->getSecond();
+ ++NumOps;
+ if (auto *Op = dyn_cast<Instruction>(U.get());
+ Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
+ *SLP, NumOps)) {
+ if (ScheduleData *OpSD = getScheduleData(Op)) {
+ OpSD->clearDirectDependencies();
+ if (RegionHasStackSave ||
+ !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
+ ControlDependentMembers.push_back(OpSD);
+ }
+ }
+ }
+ }
+ };
// The scheduling region got new instructions at the lower end (or it is a
// new region for the first bundle). This makes it necessary to
// recalculate all dependencies.
// It is seldom that this needs to be done a second time after adding the
// initial bundle to the region.
if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
- for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- if (ScheduleData *SD = getScheduleData(I))
+ for_each(ScheduleDataMap, [&](auto &P) {
+ if (BB != P.first->getParent())
+ return;
+ ScheduleData *SD = P.second;
+ if (isInSchedulingRegion(*SD))
SD->clearDependencies();
- }
+ });
+ for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
+ for_each(P.second, [&](ScheduleCopyableData *SD) {
+ if (isInSchedulingRegion(*SD))
+ SD->clearDependencies();
+ });
+ });
ReSchedule = true;
}
+ // Check if the bundle data already has deps for copyable elements. In
+ // that case, the deps need to be reset and recalculated.
if (Bundle && !Bundle.getBundle().empty()) {
+ if (S.areInstructionsWithCopyableElements() ||
+ !ScheduleCopyableDataMap.empty())
+ CheckIfNeedToClearDeps(Bundle);
LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
<< BB->getName() << "\n");
- calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP);
+ calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
+ ControlDependentMembers);
+ } else if (!ControlDependentMembers.empty()) {
+ ScheduleBundle Invalid = ScheduleBundle::invalid();
+ calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
+ ControlDependentMembers);
}
if (ReSchedule) {
@@ -20120,7 +20851,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
!ReadyInsts.empty()) {
ScheduleEntity *Picked = ReadyInsts.pop_back_val();
assert(Picked->isReady() && "must be ready to schedule");
- schedule(Picked, ReadyInsts);
+ schedule(*SLP, S, EI, Picked, ReadyInsts);
if (Picked == &Bundle)
break;
}
@@ -20129,7 +20860,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V))
+ if (S.isNonSchedulable(V))
continue;
if (!extendSchedulingRegion(V, S)) {
// If the scheduling region got new instructions at the lower end (or it
@@ -20146,11 +20877,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
bool ReSchedule = false;
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V))
+ if (S.isNonSchedulable(V))
continue;
+ SmallVector<ScheduleCopyableData *> CopyableData =
+ getScheduleCopyableData(cast<Instruction>(V));
+ if (!CopyableData.empty()) {
+ for (ScheduleCopyableData *SD : CopyableData)
+ ReadyInsts.remove(SD);
+ }
ScheduleData *BundleMember = getScheduleData(V);
- assert(BundleMember &&
+ assert((BundleMember || S.isCopyableElement(V)) &&
"no ScheduleData for bundle member (maybe not in same basic block)");
+ if (!BundleMember)
+ continue;
// Make sure we don't leave the pieces of the bundle in the ready list when
// whole bundle might not be ready.
@@ -20161,20 +20900,25 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
ReadyInsts.remove(B);
}
- if (!BundleMember->isScheduled())
+ if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
continue;
// A bundle member was scheduled as single instruction before and now
// needs to be scheduled as part of the bundle. We just get rid of the
// existing schedule.
+ // A bundle member had its deps calculated before it became a copyable
+ // element - need to reschedule.
LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
<< " was already scheduled\n");
ReSchedule = true;
}
- ScheduleBundle &Bundle = buildBundle(VL, S);
+ ScheduleBundle &Bundle = buildBundle(VL, S, EI);
TryScheduleBundleImpl(ReSchedule, Bundle);
if (!Bundle.isReady()) {
- for (ScheduleData *BD : Bundle.getBundle()) {
+ for (ScheduleEntity *BD : Bundle.getBundle()) {
+ // Copyable data scheduling is just removed.
+ if (isa<ScheduleCopyableData>(BD))
+ continue;
if (BD->isReady()) {
ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
if (Bundles.empty()) {
@@ -20187,10 +20931,66 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
}
}
ScheduledBundlesList.pop_back();
+ SmallVector<ScheduleData *> ControlDependentMembers;
+ SmallPtrSet<Instruction *, 4> Visited;
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V))
+ if (S.isNonSchedulable(V))
continue;
- ScheduledBundles.find(cast<Instruction>(V))->getSecond().pop_back();
+ auto *I = cast<Instruction>(V);
+ if (S.isCopyableElement(I)) {
+ // Remove the copyable data from the scheduling region and restore
+ // previous mappings.
+ auto KV = std::make_pair(EI, I);
+ assert(ScheduleCopyableDataMap.contains(KV) &&
+ "no ScheduleCopyableData for copyable element");
+ ScheduleCopyableData *SD =
+ ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
+ ScheduleCopyableDataMapByUsers[I].remove(SD);
+ if (EI.UserTE) {
+ ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
+ const auto *It = find(Op, I);
+ assert(It != Op.end() && "Lane not set");
+ SmallPtrSet<Instruction *, 4> Visited;
+ do {
+ int Lane = std::distance(Op.begin(), It);
+ assert(Lane >= 0 && "Lane not set");
+ if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
+ !EI.UserTE->ReorderIndices.empty())
+ Lane = EI.UserTE->ReorderIndices[Lane];
+ assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
+ "Couldn't find extract lane");
+ auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
+ if (!Visited.insert(In).second) {
+ It = find(make_range(std::next(It), Op.end()), I);
+ break;
+ }
+ ScheduleCopyableDataMapByInstUser
+ [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
+ .pop_back();
+ It = find(make_range(std::next(It), Op.end()), I);
+ } while (It != Op.end());
+ EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
+ if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
+ ScheduleCopyableDataMapByUsers[I].insert(UserCD);
+ }
+ if (ScheduleCopyableDataMapByUsers[I].empty())
+ ScheduleCopyableDataMapByUsers.erase(I);
+ ScheduleCopyableDataMap.erase(KV);
+ // Need to recalculate dependencies for the actual schedule data.
+ if (ScheduleData *OpSD = getScheduleData(I)) {
+ OpSD->clearDirectDependencies();
+ if (RegionHasStackSave ||
+ !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
+ ControlDependentMembers.push_back(OpSD);
+ }
+ continue;
+ }
+ ScheduledBundles.find(I)->getSecond().pop_back();
+ }
+ if (!ControlDependentMembers.empty()) {
+ ScheduleBundle Invalid = ScheduleBundle::invalid();
+ calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
+ ControlDependentMembers);
}
return std::nullopt;
}
@@ -20210,10 +21010,6 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
Value *V, const InstructionsState &S) {
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
- assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
- !doesNotNeedToBeScheduled(I) &&
- "phi nodes/insertelements/extractelements/extractvalues don't need to "
- "be scheduled");
if (getScheduleData(I))
return true;
if (!ScheduleStart) {
@@ -20283,14 +21079,14 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
ScheduleData *CurrentLoadStore = PrevLoadStore;
for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
// No need to allocate data for non-schedulable instructions.
- if (doesNotNeedToBeScheduled(I))
+ if (isa<PHINode>(I))
continue;
ScheduleData *SD = ScheduleDataMap.lookup(I);
if (!SD) {
SD = allocateScheduleDataChunks();
ScheduleDataMap[I] = SD;
}
- assert(!isInSchedulingRegion(SD) &&
+ assert(!isInSchedulingRegion(*SD) &&
"new ScheduleData already in scheduling region");
SD->init(SchedulingRegionID, I);
@@ -20320,34 +21116,128 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
}
}
-void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
- bool InsertInReadyList,
- BoUpSLP *SLP) {
- SmallVector<ScheduleData *> WorkList;
- auto ProcessNode = [&](ScheduleData *BundleMember) {
+void BoUpSLP::BlockScheduling::calculateDependencies(
+ ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
+ ArrayRef<ScheduleData *> ControlDeps) {
+ SmallVector<ScheduleEntity *> WorkList;
+ auto ProcessNode = [&](ScheduleEntity *SE) {
+ if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
+ if (CD->hasValidDependencies())
+ return;
+ LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
+ CD->initDependencies();
+ CD->resetUnscheduledDeps();
+ const EdgeInfo &EI = CD->getEdgeInfo();
+ if (EI.UserTE) {
+ ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
+ const auto *It = find(Op, CD->getInst());
+ assert(It != Op.end() && "Lane not set");
+ SmallPtrSet<Instruction *, 4> Visited;
+ do {
+ int Lane = std::distance(Op.begin(), It);
+ assert(Lane >= 0 && "Lane not set");
+ if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
+ !EI.UserTE->ReorderIndices.empty())
+ Lane = EI.UserTE->ReorderIndices[Lane];
+ assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
+ "Couldn't find extract lane");
+ auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
+ if (EI.UserTE->isCopyableElement(In)) {
+ // We may not have related copyable scheduling data if the
+ // instruction is non-schedulable.
+ if (ScheduleCopyableData *UseSD =
+ getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
+ CD->incDependencies();
+ if (!UseSD->isScheduled())
+ CD->incrementUnscheduledDeps(1);
+ if (!UseSD->hasValidDependencies() ||
+ (InsertInReadyList && UseSD->isReady()))
+ WorkList.push_back(UseSD);
+ }
+ } else if (Visited.insert(In).second) {
+ if (ScheduleData *UseSD = getScheduleData(In)) {
+ CD->incDependencies();
+ if (!UseSD->isScheduled())
+ CD->incrementUnscheduledDeps(1);
+ if (!UseSD->hasValidDependencies() ||
+ (InsertInReadyList && UseSD->isReady()))
+ WorkList.push_back(UseSD);
+ }
+ }
+ It = find(make_range(std::next(It), Op.end()), CD->getInst());
+ } while (It != Op.end());
+ if (CD->isReady() && CD->getDependencies() == 0 &&
+ (EI.UserTE->hasState() &&
+ (EI.UserTE->getMainOp()->getParent() !=
+ CD->getInst()->getParent() ||
+ (isa<PHINode>(EI.UserTE->getMainOp()) &&
+ (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
+ any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
+ auto *IU = dyn_cast<Instruction>(U);
+ if (!IU)
+ return true;
+ return IU->getParent() == EI.UserTE->getMainOp()->getParent();
+ })))))) {
+ // If there are no uses in the block, mark the element as having a
+ // pseudo-use, which cannot be scheduled.
+ // This prevents incorrect def-use tracking between the external user
+ // and the actual instruction.
+ CD->incDependencies();
+ CD->incrementUnscheduledDeps(1);
+ }
+ }
+ return;
+ }
+ auto *BundleMember = cast<ScheduleData>(SE);
if (BundleMember->hasValidDependencies())
return;
LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
BundleMember->initDependencies();
BundleMember->resetUnscheduledDeps();
// Handle def-use chain dependencies.
+ SmallDenseMap<Value *, unsigned> UserToNumOps;
for (User *U : BundleMember->getInst()->users()) {
+ if (isa<PHINode>(U))
+ continue;
if (ScheduleData *UseSD = getScheduleData(U)) {
+ // The operand is a copyable element - skip.
+ unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
+ ++NumOps;
+ if (areAllOperandsReplacedByCopyableData(
+ cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
+ continue;
BundleMember->incDependencies();
if (!UseSD->isScheduled())
BundleMember->incrementUnscheduledDeps(1);
- WorkList.push_back(UseSD);
+ if (!UseSD->hasValidDependencies() ||
+ (InsertInReadyList && UseSD->isReady()))
+ WorkList.push_back(UseSD);
}
}
+ for (ScheduleCopyableData *UseSD :
+ getScheduleCopyableDataUsers(BundleMember->getInst())) {
+ BundleMember->incDependencies();
+ if (!UseSD->isScheduled())
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!UseSD->hasValidDependencies() ||
+ (InsertInReadyList && UseSD->isReady()))
+ WorkList.push_back(UseSD);
+ }
+ SmallPtrSet<const Instruction *, 4> Visited;
auto MakeControlDependent = [&](Instruction *I) {
+ // Do not mark control dependent twice.
+ if (!Visited.insert(I).second)
+ return;
auto *DepDest = getScheduleData(I);
assert(DepDest && "must be in schedule window");
DepDest->addControlDependency(BundleMember);
BundleMember->incDependencies();
if (!DepDest->isScheduled())
BundleMember->incrementUnscheduledDeps(1);
- WorkList.push_back(DepDest);
+ if (!DepDest->hasValidDependencies() ||
+ (InsertInReadyList && DepDest->isReady()))
+ WorkList.push_back(DepDest);
};
// Any instruction which isn't safe to speculate at the beginning of the
@@ -20426,7 +21316,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
for (ScheduleData *DepDest = NextLoadStore; DepDest;
DepDest = DepDest->getNextLoadStore()) {
- assert(isInSchedulingRegion(DepDest) && "Expected to be in region");
+ assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
// We have two limits to reduce the complexity:
// 1) AliasedCheckLimit: It's a small limit to reduce calls to
@@ -20449,7 +21339,9 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
BundleMember->incDependencies();
if (!DepDest->isScheduled())
BundleMember->incrementUnscheduledDeps(1);
- WorkList.push_back(DepDest);
+ if (!DepDest->hasValidDependencies() ||
+ (InsertInReadyList && DepDest->isReady()))
+ WorkList.push_back(DepDest);
}
// Example, explaining the loop break condition: Let's assume our
@@ -20471,13 +21363,25 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
}
};
- WorkList.push_back(Bundle.getBundle().front());
+ assert((Bundle || !ControlDeps.empty()) &&
+ "expected at least one instruction to schedule");
+ if (Bundle)
+ WorkList.push_back(Bundle.getBundle().front());
+ WorkList.append(ControlDeps.begin(), ControlDeps.end());
SmallPtrSet<ScheduleBundle *, 16> Visited;
while (!WorkList.empty()) {
- ScheduleData *SD = WorkList.pop_back_val();
- ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(SD->getInst());
+ ScheduleEntity *SD = WorkList.pop_back_val();
+ SmallVector<ScheduleBundle *, 1> CopyableBundle;
+ ArrayRef<ScheduleBundle *> Bundles;
+ if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
+ CopyableBundle.push_back(&CD->getBundle());
+ Bundles = CopyableBundle;
+ } else {
+ Bundles = getScheduleBundles(SD->getInst());
+ }
if (Bundles.empty()) {
- ProcessNode(SD);
+ if (!SD->hasValidDependencies())
+ ProcessNode(SD);
if (InsertInReadyList && SD->isReady()) {
ReadyInsts.insert(SD);
LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
@@ -20485,7 +21389,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
continue;
}
for (ScheduleBundle *Bundle : Bundles) {
- if (!Visited.insert(Bundle).second || Bundle->hasValidDependencies())
+ if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
continue;
assert(isInSchedulingRegion(*Bundle) &&
"ScheduleData not in scheduling region");
@@ -20508,23 +21412,40 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
void BoUpSLP::BlockScheduling::resetSchedule() {
assert(ScheduleStart &&
"tried to reset schedule on block which has not been scheduled");
- for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- if (ScheduleData *SD = getScheduleData(I)) {
- assert(isInSchedulingRegion(SD) &&
- "ScheduleData not in scheduling region");
+ for_each(ScheduleDataMap, [&](auto &P) {
+ if (BB != P.first->getParent())
+ return;
+ ScheduleData *SD = P.second;
+ if (isInSchedulingRegion(*SD)) {
SD->setScheduled(/*Scheduled=*/false);
SD->resetUnscheduledDeps();
}
- for (ScheduleBundle *Bundle : getScheduleBundles(I)) {
- assert(isInSchedulingRegion(*Bundle) &&
- "ScheduleBundle not in scheduling region");
- Bundle->setScheduled(/*Scheduled=*/false);
+ });
+ for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
+ for_each(P.second, [&](ScheduleCopyableData *SD) {
+ if (isInSchedulingRegion(*SD)) {
+ SD->setScheduled(/*Scheduled=*/false);
+ SD->resetUnscheduledDeps();
+ }
+ });
+ });
+ for_each(ScheduledBundles, [&](auto &P) {
+ for_each(P.second, [&](ScheduleBundle *Bundle) {
+ if (isInSchedulingRegion(*Bundle))
+ Bundle->setScheduled(/*Scheduled=*/false);
+ });
+ });
+ // Reset schedule data for copyable elements.
+ for (auto &P : ScheduleCopyableDataMap) {
+ if (isInSchedulingRegion(*P.second)) {
+ P.second->setScheduled(/*Scheduled=*/false);
+ P.second->resetUnscheduledDeps();
}
}
ReadyInsts.clear();
}
-void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
if (!BS->ScheduleStart)
return;
@@ -20562,15 +21483,45 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
if (!Bundle->hasValidDependencies())
BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
}
+ SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
+ for (ScheduleCopyableData *SD : reverse(SDs)) {
+ ScheduleBundle &Bundle = SD->getBundle();
+ Bundle.setSchedulingPriority(Idx++);
+ if (!Bundle.hasValidDependencies())
+ BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
+ }
continue;
}
+ SmallVector<ScheduleCopyableData *> CopyableData =
+ BS->getScheduleCopyableDataUsers(I);
if (ScheduleData *SD = BS->getScheduleData(I)) {
[[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
- SDTEs.front()->doesNotNeedToSchedule()) &&
+ SDTEs.front()->doesNotNeedToSchedule() ||
+ doesNotNeedToBeScheduled(I)) &&
"scheduler and vectorizer bundle mismatch");
SD->setSchedulingPriority(Idx++);
- continue;
+ if (!SD->hasValidDependencies() &&
+ (!CopyableData.empty() ||
+ any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
+ assert(TE->isGather() && "expected gather node");
+ return TE->hasState() && TE->hasCopyableElements() &&
+ TE->isCopyableElement(I);
+ }))) {
+ // Need to calculate deps for these nodes to correctly handle copyable
+ // dependencies, even if they were cancelled.
+ // If the copyables bundle was cancelled, the deps are cleared and need to
+ // be recalculated.
+ ScheduleBundle Bundle;
+ Bundle.add(SD);
+ BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
+ }
+ }
+ for (ScheduleCopyableData *SD : reverse(CopyableData)) {
+ ScheduleBundle &Bundle = SD->getBundle();
+ Bundle.setSchedulingPriority(Idx++);
+ if (!Bundle.hasValidDependencies())
+ BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
}
}
BS->initialFillReadyList(ReadyInsts);
@@ -20586,9 +21537,12 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
// Move the scheduled instruction(s) to their dedicated places, if not
// there yet.
if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
- for (const ScheduleData *BundleMember : Bundle->getBundle()) {
+ for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
Instruction *PickedInst = BundleMember->getInst();
- if (!Scheduled.insert(PickedInst).second)
+ // If the copyable must be scheduled as part of something else, skip it.
+ bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
+ if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
+ (!IsCopyable && !Scheduled.insert(PickedInst).second))
continue;
if (PickedInst->getNextNode() != LastScheduledInst)
PickedInst->moveAfter(LastScheduledInst->getPrevNode());
@@ -20603,7 +21557,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
PickedInst->moveAfter(LastScheduledInst->getPrevNode());
LastScheduledInst = PickedInst;
}
- BS->schedule(Picked, ReadyInsts);
+ auto Invalid = InstructionsState::invalid();
+ BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
}
// Check that we didn't break any of our invariants.
@@ -20965,9 +21920,11 @@ bool BoUpSLP::collectValuesToDemote(
return all_of(E.Scalars, [&](Value *V) {
if (isa<PoisonValue>(V))
return true;
+ APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+ if (E.isCopyableElement(V))
+ return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
auto *I = cast<Instruction>(V);
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
- APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
return AmtKnownBits.getMaxValue().ult(BitWidth) &&
MaskedValueIsZero(I->getOperand(0), ShiftedBits,
SimplifyQuery(*DL));
@@ -22729,21 +23686,11 @@ public:
/// Try to find a reduction tree.
bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
ScalarEvolution &SE, const DataLayout &DL,
- const TargetLibraryInfo &TLI,
- DominatorTree &DT, TargetTransformInfo &TTI) {
+ const TargetLibraryInfo &TLI) {
RdxKind = HorizontalReduction::getRdxKind(Root);
if (!isVectorizable(RdxKind, Root))
return false;
- // FMA reduction root - skip.
- auto CheckForFMA = [&](Instruction *I) {
- return RdxKind == RecurKind::FAdd &&
- canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI)
- .isValid();
- };
- if (CheckForFMA(Root))
- return false;
-
// Analyze "regular" integer/FP types for reductions - no target-specific
// types or pointers.
Type *Ty = Root->getType();
@@ -22781,7 +23728,7 @@ public:
// Also, do not try to reduce const values, if the operation is not
// foldable.
if (!EdgeInst || Level > RecursionMaxDepth ||
- getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) ||
+ getRdxKind(EdgeInst) != RdxKind ||
IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
!hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
!isVectorizable(RdxKind, EdgeInst) ||
@@ -23530,7 +24477,7 @@ public:
// correct, replace internal uses with undef, and mark for eventual
// deletion.
#ifndef NDEBUG
- SmallSet<Value *, 4> IgnoreSet;
+ SmallPtrSet<Value *, 4> IgnoreSet;
for (ArrayRef<Value *> RdxOps : ReductionOps)
IgnoreSet.insert_range(RdxOps);
#endif
@@ -23843,6 +24790,8 @@ private:
case RecurKind::FMinimum:
// res = vv
break;
+ case RecurKind::Sub:
+ case RecurKind::AddChainWithSubs:
case RecurKind::Mul:
case RecurKind::FMul:
case RecurKind::FMulAdd:
@@ -23982,6 +24931,8 @@ private:
case RecurKind::FMinimum:
// res = vv
return VectorizedValue;
+ case RecurKind::Sub:
+ case RecurKind::AddChainWithSubs:
case RecurKind::Mul:
case RecurKind::FMul:
case RecurKind::FMulAdd:
@@ -24086,6 +25037,8 @@ private:
auto *Scale = ConstantVector::get(Vals);
return Builder.CreateFMul(VectorizedValue, Scale);
}
+ case RecurKind::Sub:
+ case RecurKind::AddChainWithSubs:
case RecurKind::Mul:
case RecurKind::FMul:
case RecurKind::FMulAdd:
@@ -24356,7 +25309,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
if (!isReductionCandidate(Inst))
return nullptr;
HorizontalReduction HorRdx;
- if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI))
+ if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
return nullptr;
return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
};
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
index f32d57f..e414c12 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
@@ -81,6 +81,7 @@ LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes(
case Instruction::Opcode::FPToUI:
case Instruction::Opcode::FPToSI:
case Instruction::Opcode::FPExt:
+ case Instruction::Opcode::PtrToAddr:
case Instruction::Opcode::PtrToInt:
case Instruction::Opcode::IntToPtr:
case Instruction::Opcode::SIToFP:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 73babcc..f972efa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -246,8 +246,7 @@ VPTransformState::VPTransformState(const TargetTransformInfo *TTI,
IRBuilderBase &Builder, VPlan *Plan,
Loop *CurrentParentLoop, Type *CanonicalIVTy)
: TTI(TTI), VF(VF), CFG(DT), LI(LI), AC(AC), Builder(Builder), Plan(Plan),
- CurrentParentLoop(CurrentParentLoop), TypeAnalysis(CanonicalIVTy),
- VPDT(*Plan) {}
+ CurrentParentLoop(CurrentParentLoop), TypeAnalysis(*Plan), VPDT(*Plan) {}
Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) {
if (Def->isLiveIn())
@@ -296,27 +295,11 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
if (hasVectorValue(Def))
return Data.VPV2Vector[Def];
- auto GetBroadcastInstrs = [this, Def](Value *V) {
- bool SafeToHoist =
- !Def->hasDefiningRecipe() ||
- VPDT.properlyDominates(Def->getDefiningRecipe()->getParent(),
- Plan->getVectorPreheader());
-
+ auto GetBroadcastInstrs = [this](Value *V) {
if (VF.isScalar())
return V;
- // Place the code for broadcasting invariant variables in the new preheader.
- IRBuilder<>::InsertPointGuard Guard(Builder);
- if (SafeToHoist) {
- BasicBlock *LoopVectorPreHeader =
- CFG.VPBB2IRBB[Plan->getVectorPreheader()];
- if (LoopVectorPreHeader)
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
- }
-
- // Place the code for broadcasting invariant variables in the new preheader.
// Broadcast the scalar into all locations in the vector.
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
-
return Shuf;
};
@@ -372,6 +355,9 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
set(Def, VectorValue);
} else {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ assert(isa<VPInstruction>(Def) &&
+        "Explicit BuildVector recipes must have handled packing for "
+        "non-VPInstructions.");
// Initialize packing with insertelements to start from poison.
VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
@@ -951,28 +937,6 @@ VPlan::~VPlan() {
delete BackedgeTakenCount;
}
-void VPlan::prepareToExecute(Value *VectorTripCountV, VPTransformState &State) {
- if (!VectorTripCount.getUnderlyingValue())
- VectorTripCount.setUnderlyingValue(VectorTripCountV);
- else
- assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV &&
- "VectorTripCount set earlier must much VectorTripCountV");
-
- IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
- Type *TCTy = VectorTripCountV->getType();
- // FIXME: Model VF * UF computation completely in VPlan.
- unsigned UF = getUF();
- if (VF.getNumUsers()) {
- Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
- VF.setUnderlyingValue(RuntimeVF);
- VFxUF.setUnderlyingValue(
- UF > 1 ? Builder.CreateMul(RuntimeVF, ConstantInt::get(TCTy, UF))
- : RuntimeVF);
- } else {
- VFxUF.setUnderlyingValue(createStepForVF(Builder, TCTy, State.VF, UF));
- }
-}
-
VPIRBasicBlock *VPlan::getExitBlock(BasicBlock *IRBB) const {
auto Iter = find_if(getExitBlocks(), [IRBB](const VPIRBasicBlock *VPIRBB) {
return VPIRBB->getIRBasicBlock() == IRBB;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c42cdd5..46e55be 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1019,7 +1019,11 @@ public:
/// The lane specifies an index into a vector formed by combining all vector
/// operands (all operands after the first one).
ExtractLane,
-
+ /// Explicit user for the resume phi of the canonical induction in the main
+ /// VPlan, used by the epilogue vector loop.
+ ResumeForEpilogue,
+ /// Returns the value for vscale.
+ VScale,
};
private:
@@ -1167,6 +1171,7 @@ public:
switch (VPI->getOpcode()) {
case VPInstruction::WideIVStep:
case VPInstruction::StepVector:
+ case VPInstruction::VScale:
return true;
default:
return false;
@@ -1227,6 +1232,31 @@ public:
return getAsRecipe()->getNumOperands();
}
+ /// Returns an iterator range over the incoming values.
+ VPUser::const_operand_range incoming_values() const {
+ return make_range(getAsRecipe()->op_begin(),
+ getAsRecipe()->op_begin() + getNumIncoming());
+ }
+
+ using const_incoming_blocks_range = iterator_range<mapped_iterator<
+ detail::index_iterator, std::function<const VPBasicBlock *(size_t)>>>;
+
+ /// Returns an iterator range over the incoming blocks.
+ const_incoming_blocks_range incoming_blocks() const {
+ std::function<const VPBasicBlock *(size_t)> GetBlock = [this](size_t Idx) {
+ return getIncomingBlock(Idx);
+ };
+ return map_range(index_range(0, getNumIncoming()), GetBlock);
+ }
+
+ /// Returns an iterator range over pairs of incoming values and corresponding
+ /// incoming blocks.
+ detail::zippy<llvm::detail::zip_first, VPUser::const_operand_range,
+ const_incoming_blocks_range>
+ incoming_values_and_blocks() const {
+ return zip_equal(incoming_values(), incoming_blocks());
+ }
+
/// Removes the incoming value for \p IncomingBlock, which must be a
/// predecessor.
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const;
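A hypothetical usage sketch of the new incoming-value/block ranges follows; it is not part of the patch and assumes access to the in-tree VPlan.h internals as well as the VPPhiAccessors mixin name used for these accessors at the time of writing.

#include "VPlan.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Print every (incoming value, incoming block) pair of a phi-like recipe.
static void dumpIncoming(const VPPhiAccessors &Phi, raw_ostream &OS,
                         VPSlotTracker &SlotTracker) {
  for (const auto &[IncV, IncBB] : Phi.incoming_values_and_blocks()) {
    IncV->printAsOperand(OS, SlotTracker);
    OS << " <- " << IncBB->getName() << "\n";
  }
}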
@@ -2298,6 +2328,11 @@ public:
VPSlotTracker &SlotTracker) const override;
#endif
+ /// Returns the number of incoming values, also number of incoming blocks.
+ /// Note that at the moment, VPWidenPointerInductionRecipe only has a single
+ /// incoming value, its start value.
+ unsigned getNumIncoming() const override { return 2; }
+
/// Returns the recurrence kind of the reduction.
RecurKind getRecurrenceKind() const { return Kind; }
@@ -2408,11 +2443,11 @@ public:
// TODO: extend the masked interleaved-group support to reversed access.
assert((!Mask || !IG->isReverse()) &&
"Reversed masked interleave-group not supported.");
- for (unsigned i = 0; i < IG->getFactor(); ++i)
- if (Instruction *I = IG->getMember(i)) {
- if (I->getType()->isVoidTy())
+ for (unsigned I = 0; I < IG->getFactor(); ++I)
+ if (Instruction *Inst = IG->getMember(I)) {
+ if (Inst->getType()->isVoidTy())
continue;
- new VPValue(I, this);
+ new VPValue(Inst, this);
}
for (auto *SV : StoredValues)
@@ -3076,10 +3111,11 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
/// using the address to load from, the explicit vector length and an optional
/// mask.
struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
- VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
+ VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
+ VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
- {L.getAddr(), &EVL}, L.isConsecutive(),
- L.isReverse(), L, L.getDebugLoc()),
+ {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
+ L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -3157,11 +3193,11 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
/// using the value to store, the address to store to, the explicit vector
/// length and an optional mask.
struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
- VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
+ VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr, VPValue &EVL,
+ VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
- {S.getAddr(), S.getStoredValue(), &EVL},
- S.isConsecutive(), S.isReverse(), S,
- S.getDebugLoc()) {
+ {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
+ S.isReverse(), S, S.getDebugLoc()) {
setMask(Mask);
}
@@ -3968,9 +4004,6 @@ public:
VPBB->setPlan(this);
}
- /// Prepare the plan for execution, setting up the required live-in values.
- void prepareToExecute(Value *VectorTripCount, VPTransformState &State);
-
/// Generate the IR code for this VPlan.
void execute(VPTransformState *State);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 4c3cdda..b39231f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -21,8 +21,7 @@ using namespace llvm;
#define DEBUG_TYPE "vplan"
-VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan)
- : Ctx(Plan.getScalarHeader()->getIRBasicBlock()->getContext()) {
+VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan) : Ctx(Plan.getContext()) {
if (auto LoopRegion = Plan.getVectorLoopRegion()) {
if (const auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(
&LoopRegion->getEntryBasicBlock()->front())) {
@@ -74,6 +73,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case Instruction::ExtractElement:
case Instruction::Freeze:
case VPInstruction::ReductionStartVector:
+ case VPInstruction::ResumeForEpilogue:
return inferScalarType(R->getOperand(0));
case Instruction::Select: {
Type *ResTy = inferScalarType(R->getOperand(1));
@@ -500,7 +500,7 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
- VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
+ VPTypeAnalysis TypeInfo(Plan);
const auto &TTICapture = TTI;
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index cd86d27..c6c4369 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -58,9 +58,6 @@ class VPTypeAnalysis {
Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
public:
- VPTypeAnalysis(Type *CanonicalIVTy)
- : CanonicalIVTy(CanonicalIVTy), Ctx(CanonicalIVTy->getContext()) {}
-
VPTypeAnalysis(const VPlan &Plan);
/// Infer the type of \p V. Returns the scalar type of \p V.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 7e8eff31..b231a84 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -336,12 +336,6 @@ std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() {
return std::move(Plan);
}
-std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(Loop *TheLoop,
- LoopInfo &LI) {
- PlainCFGBuilder Builder(TheLoop, &LI);
- return Builder.buildPlainCFG();
-}
-
/// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it
/// has exactly 2 predecessors (preheader and latch), where the block
/// dominates the latch and the preheader dominates the block. If it is a
@@ -457,10 +451,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
LatchDL);
}
-void VPlanTransforms::prepareForVectorization(
- VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE,
- bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop,
- DebugLoc IVDL, bool HasUncountableEarlyExit, VFRange &Range) {
+static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL,
+ PredicatedScalarEvolution &PSE, Loop *TheLoop) {
VPDominatorTree VPDT;
VPDT.recalculate(Plan);
@@ -486,12 +478,54 @@ void VPlanTransforms::prepareForVectorization(
addCanonicalIVRecipes(Plan, HeaderVPBB, LatchVPBB, InductionTy, IVDL);
- [[maybe_unused]] bool HandledUncountableEarlyExit = false;
+ // Create SCEV and VPValue for the trip count.
+ // We use the symbolic max backedge-taken-count, which works also when
+ // vectorizing loops with uncountable early exits.
+ const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
+ assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
+ "Invalid backedge-taken count");
+ ScalarEvolution &SE = *PSE.getSE();
+ const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
+ InductionTy, TheLoop);
+ Plan.setTripCount(
+ vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE));
+
+ VPBasicBlock *ScalarPH = Plan.createVPBasicBlock("scalar.ph");
+ VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader());
+
+ // The connection order corresponds to the operands of the conditional branch,
+ // with the middle block already connected to the exit block.
+ VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ // Also connect the entry block to the scalar preheader.
+ // TODO: Also introduce a branch recipe together with the minimum trip count
+ // check.
+ VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH);
+ Plan.getEntry()->swapSuccessors();
+}
+
+std::unique_ptr<VPlan>
+VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy,
+ DebugLoc IVDL, PredicatedScalarEvolution &PSE) {
+ PlainCFGBuilder Builder(TheLoop, &LI);
+ std::unique_ptr<VPlan> VPlan0 = Builder.buildPlainCFG();
+ addInitialSkeleton(*VPlan0, InductionTy, IVDL, PSE, TheLoop);
+ return VPlan0;
+}
+
+void VPlanTransforms::handleEarlyExits(VPlan &Plan,
+ bool HasUncountableEarlyExit,
+ VFRange &Range) {
+ auto *MiddleVPBB = cast<VPBasicBlock>(
+ Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]);
+ auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor());
+ VPBlockBase *HeaderVPB = cast<VPBasicBlock>(LatchVPBB->getSuccessors()[1]);
+
// Disconnect all early exits from the loop leaving it with a single exit from
// the latch. Early exits that are countable are left for a scalar epilog. The
// condition of uncountable early exits (currently at most one is supported)
// is fused into the latch exit, and used to branch from middle block to the
// early exit destination.
+ [[maybe_unused]] bool HandledUncountableEarlyExit = false;
for (VPIRBasicBlock *EB : Plan.getExitBlocks()) {
for (VPBlockBase *Pred : to_vector(EB->getPredecessors())) {
if (Pred == MiddleVPBB)
@@ -500,7 +534,8 @@ void VPlanTransforms::prepareForVectorization(
assert(!HandledUncountableEarlyExit &&
"can handle exactly one uncountable early exit");
handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan,
- HeaderVPBB, LatchVPBB, Range);
+ cast<VPBasicBlock>(HeaderVPB), LatchVPBB,
+ Range);
HandledUncountableEarlyExit = true;
} else {
for (VPRecipeBase &R : EB->phis())
@@ -513,36 +548,18 @@ void VPlanTransforms::prepareForVectorization(
assert((!HasUncountableEarlyExit || HandledUncountableEarlyExit) &&
"missed an uncountable exit that must be handled");
+}
- // Create SCEV and VPValue for the trip count.
- // We use the symbolic max backedge-taken-count, which works also when
- // vectorizing loops with uncountable early exits.
- const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
- assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
- "Invalid loop count");
- ScalarEvolution &SE = *PSE.getSE();
- const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
- InductionTy, TheLoop);
- Plan.setTripCount(
- vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE));
-
- VPBasicBlock *ScalarPH = Plan.createVPBasicBlock("scalar.ph");
- VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader());
-
- // The connection order corresponds to the operands of the conditional branch,
- // with the middle block already connected to the exit block.
- VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
- // Also connect the entry block to the scalar preheader.
- // TODO: Also introduce a branch recipe together with the minimum trip count
- // check.
- VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH);
- Plan.getEntry()->swapSuccessors();
-
+void VPlanTransforms::addMiddleCheck(VPlan &Plan,
+ bool RequiresScalarEpilogueCheck,
+ bool TailFolded) {
+ auto *MiddleVPBB = cast<VPBasicBlock>(
+ Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]);
// If MiddleVPBB has a single successor then the original loop does not exit
// via the latch and the single successor must be the scalar preheader.
// There's no need to add a runtime check to MiddleVPBB.
if (MiddleVPBB->getNumSuccessors() == 1) {
- assert(MiddleVPBB->getSingleSuccessor() == ScalarPH &&
+ assert(MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader() &&
"must have ScalarPH as single successor");
return;
}
@@ -564,6 +581,7 @@ void VPlanTransforms::prepareForVectorization(
// the corresponding compare because they may have ended up with different
// line numbers and we want to avoid awkward line stepping while debugging.
// E.g., if the compare has got a line number inside the loop.
+ auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor());
DebugLoc LatchDL = LatchVPBB->getTerminator()->getDebugLoc();
VPBuilder Builder(MiddleVPBB);
VPValue *Cmp;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 4154720c..5ad2ac6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -351,10 +351,10 @@ struct VPCostContext {
TargetTransformInfo::TargetCostKind CostKind;
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
- Type *CanIVTy, LoopVectorizationCostModel &CM,
+ const VPlan &Plan, LoopVectorizationCostModel &CM,
TargetTransformInfo::TargetCostKind CostKind)
- : TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()),
- CM(CM), CostKind(CostKind) {}
+ : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
+ CostKind(CostKind) {}
/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 8818843..9f036fb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -200,15 +200,11 @@ template <typename Ops_t, unsigned Opcode, bool Commutative,
struct Recipe_match {
Ops_t Ops;
- Recipe_match() : Ops() {
- static_assert(std::tuple_size<Ops_t>::value == 0 &&
- "constructor can only be used with zero operands");
- }
- Recipe_match(Ops_t Ops) : Ops(Ops) {}
- template <typename A_t, typename B_t>
- Recipe_match(A_t A, B_t B) : Ops({A, B}) {
- static_assert(std::tuple_size<Ops_t>::value == 2 &&
- "constructor can only be used for binary matcher");
+ template <typename... OpTy> Recipe_match(OpTy... Ops) : Ops(Ops...) {
+ static_assert(std::tuple_size<Ops_t>::value == sizeof...(Ops) &&
+ "number of operands in constructor doesn't match Ops_t");
+ static_assert((!Commutative || std::tuple_size<Ops_t>::value == 2) &&
+ "only binary ops can be commutative");
}
bool match(const VPValue *V) const {
@@ -254,7 +250,6 @@ private:
// Check for recipes that do not have opcodes.
if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value ||
std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value ||
- std::is_same<RecipeTy, VPWidenSelectRecipe>::value ||
std::is_same<RecipeTy, VPDerivedIVRecipe>::value ||
std::is_same<RecipeTy, VPWidenGEPRecipe>::value)
return DefR;
@@ -270,195 +265,128 @@ private:
}
};
-template <unsigned Opcode, typename... RecipeTys>
-using ZeroOpRecipe_match =
- Recipe_match<std::tuple<>, Opcode, false, RecipeTys...>;
-
-template <typename Op0_t, unsigned Opcode, typename... RecipeTys>
-using UnaryRecipe_match =
- Recipe_match<std::tuple<Op0_t>, Opcode, false, RecipeTys...>;
-
-template <typename Op0_t, unsigned Opcode>
-using UnaryVPInstruction_match =
- UnaryRecipe_match<Op0_t, Opcode, VPInstruction>;
+template <unsigned Opcode, typename... OpTys>
+using AllRecipe_match =
+ Recipe_match<std::tuple<OpTys...>, Opcode, /*Commutative*/ false,
+ VPWidenRecipe, VPReplicateRecipe, VPWidenCastRecipe,
+ VPInstruction, VPWidenSelectRecipe>;
-template <unsigned Opcode>
-using ZeroOpVPInstruction_match = ZeroOpRecipe_match<Opcode, VPInstruction>;
+template <unsigned Opcode, typename... OpTys>
+using AllRecipe_commutative_match =
+ Recipe_match<std::tuple<OpTys...>, Opcode, /*Commutative*/ true,
+ VPWidenRecipe, VPReplicateRecipe, VPInstruction>;
-template <typename Op0_t, unsigned Opcode>
-using AllUnaryRecipe_match =
- UnaryRecipe_match<Op0_t, Opcode, VPWidenRecipe, VPReplicateRecipe,
- VPWidenCastRecipe, VPInstruction>;
+template <unsigned Opcode, typename... OpTys>
+using VPInstruction_match = Recipe_match<std::tuple<OpTys...>, Opcode,
+ /*Commutative*/ false, VPInstruction>;
-template <typename Op0_t, typename Op1_t, unsigned Opcode, bool Commutative,
- typename... RecipeTys>
-using BinaryRecipe_match =
- Recipe_match<std::tuple<Op0_t, Op1_t>, Opcode, Commutative, RecipeTys...>;
-
-template <typename Op0_t, typename Op1_t, unsigned Opcode>
-using BinaryVPInstruction_match =
- BinaryRecipe_match<Op0_t, Op1_t, Opcode, /*Commutative*/ false,
- VPInstruction>;
-
-template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode,
- bool Commutative, typename... RecipeTys>
-using TernaryRecipe_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>,
- Opcode, Commutative, RecipeTys...>;
-
-template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode>
-using TernaryVPInstruction_match =
- TernaryRecipe_match<Op0_t, Op1_t, Op2_t, Opcode, /*Commutative*/ false,
- VPInstruction>;
-
-template <typename Op0_t, typename Op1_t, unsigned Opcode,
- bool Commutative = false>
-using AllBinaryRecipe_match =
- BinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative, VPWidenRecipe,
- VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>;
+template <unsigned Opcode, typename... OpTys>
+inline VPInstruction_match<Opcode, OpTys...>
+m_VPInstruction(const OpTys &...Ops) {
+ return VPInstruction_match<Opcode, OpTys...>(Ops...);
+}
/// BuildVector matches only its opcode, without matching its operands, as the
/// number of operands is not fixed.
-inline ZeroOpVPInstruction_match<VPInstruction::BuildVector> m_BuildVector() {
- return ZeroOpVPInstruction_match<VPInstruction::BuildVector>();
-}
-
-template <unsigned Opcode, typename Op0_t>
-inline UnaryVPInstruction_match<Op0_t, Opcode>
-m_VPInstruction(const Op0_t &Op0) {
- return UnaryVPInstruction_match<Op0_t, Opcode>(Op0);
-}
-
-template <unsigned Opcode, typename Op0_t, typename Op1_t>
-inline BinaryVPInstruction_match<Op0_t, Op1_t, Opcode>
-m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1) {
- return BinaryVPInstruction_match<Op0_t, Op1_t, Opcode>(Op0, Op1);
+inline VPInstruction_match<VPInstruction::BuildVector> m_BuildVector() {
+ return m_VPInstruction<VPInstruction::BuildVector>();
}
-template <unsigned Opcode, typename Op0_t, typename Op1_t, typename Op2_t>
-inline TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Opcode>
-m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
- return TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Opcode>(
- {Op0, Op1, Op2});
-}
-
-template <typename Op0_t, typename Op1_t, typename Op2_t, typename Op3_t,
- unsigned Opcode, bool Commutative, typename... RecipeTys>
-using Recipe4Op_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t, Op3_t>,
- Opcode, Commutative, RecipeTys...>;
-
-template <typename Op0_t, typename Op1_t, typename Op2_t, typename Op3_t,
- unsigned Opcode>
-using VPInstruction4Op_match =
- Recipe4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode, /*Commutative*/ false,
- VPInstruction>;
-
-template <unsigned Opcode, typename Op0_t, typename Op1_t, typename Op2_t,
- typename Op3_t>
-inline VPInstruction4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode>
-m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2,
- const Op3_t &Op3) {
- return VPInstruction4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode>(
- {Op0, Op1, Op2, Op3});
-}
template <typename Op0_t>
-inline UnaryVPInstruction_match<Op0_t, Instruction::Freeze>
+inline VPInstruction_match<Instruction::Freeze, Op0_t>
m_Freeze(const Op0_t &Op0) {
return m_VPInstruction<Instruction::Freeze>(Op0);
}
template <typename Op0_t>
-inline UnaryVPInstruction_match<Op0_t, VPInstruction::BranchOnCond>
+inline VPInstruction_match<VPInstruction::BranchOnCond, Op0_t>
m_BranchOnCond(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::BranchOnCond>(Op0);
}
template <typename Op0_t>
-inline UnaryVPInstruction_match<Op0_t, VPInstruction::Broadcast>
+inline VPInstruction_match<VPInstruction::Broadcast, Op0_t>
m_Broadcast(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::Broadcast>(Op0);
}
template <typename Op0_t, typename Op1_t>
-inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask>
+inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t>
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
}
template <typename Op0_t, typename Op1_t>
-inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::BranchOnCount>
+inline VPInstruction_match<VPInstruction::BranchOnCount, Op0_t, Op1_t>
m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
return m_VPInstruction<VPInstruction::BranchOnCount>(Op0, Op1);
}
template <unsigned Opcode, typename Op0_t>
-inline AllUnaryRecipe_match<Op0_t, Opcode> m_Unary(const Op0_t &Op0) {
- return AllUnaryRecipe_match<Op0_t, Opcode>(Op0);
+inline AllRecipe_match<Opcode, Op0_t> m_Unary(const Op0_t &Op0) {
+ return AllRecipe_match<Opcode, Op0_t>(Op0);
}
template <typename Op0_t>
-inline AllUnaryRecipe_match<Op0_t, Instruction::Trunc>
-m_Trunc(const Op0_t &Op0) {
+inline AllRecipe_match<Instruction::Trunc, Op0_t> m_Trunc(const Op0_t &Op0) {
return m_Unary<Instruction::Trunc, Op0_t>(Op0);
}
template <typename Op0_t>
-inline AllUnaryRecipe_match<Op0_t, Instruction::ZExt> m_ZExt(const Op0_t &Op0) {
+inline AllRecipe_match<Instruction::ZExt, Op0_t> m_ZExt(const Op0_t &Op0) {
return m_Unary<Instruction::ZExt, Op0_t>(Op0);
}
template <typename Op0_t>
-inline AllUnaryRecipe_match<Op0_t, Instruction::SExt> m_SExt(const Op0_t &Op0) {
+inline AllRecipe_match<Instruction::SExt, Op0_t> m_SExt(const Op0_t &Op0) {
return m_Unary<Instruction::SExt, Op0_t>(Op0);
}
template <typename Op0_t>
-inline match_combine_or<AllUnaryRecipe_match<Op0_t, Instruction::ZExt>,
- AllUnaryRecipe_match<Op0_t, Instruction::SExt>>
+inline match_combine_or<AllRecipe_match<Instruction::ZExt, Op0_t>,
+ AllRecipe_match<Instruction::SExt, Op0_t>>
m_ZExtOrSExt(const Op0_t &Op0) {
return m_CombineOr(m_ZExt(Op0), m_SExt(Op0));
}
-template <unsigned Opcode, typename Op0_t, typename Op1_t,
- bool Commutative = false>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative>
-m_Binary(const Op0_t &Op0, const Op1_t &Op1) {
- return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative>(Op0, Op1);
+template <unsigned Opcode, typename Op0_t, typename Op1_t>
+inline AllRecipe_match<Opcode, Op0_t, Op1_t> m_Binary(const Op0_t &Op0,
+ const Op1_t &Op1) {
+ return AllRecipe_match<Opcode, Op0_t, Op1_t>(Op0, Op1);
}
template <unsigned Opcode, typename Op0_t, typename Op1_t>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true>
+inline AllRecipe_commutative_match<Opcode, Op0_t, Op1_t>
m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) {
- return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true>(Op0, Op1);
+ return AllRecipe_commutative_match<Opcode, Op0_t, Op1_t>(Op0, Op1);
}
template <typename Op0_t, typename Op1_t>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul>
-m_Mul(const Op0_t &Op0, const Op1_t &Op1) {
+inline AllRecipe_match<Instruction::Mul, Op0_t, Op1_t> m_Mul(const Op0_t &Op0,
+ const Op1_t &Op1) {
return m_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
}
template <typename Op0_t, typename Op1_t>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul,
- /* Commutative =*/true>
+inline AllRecipe_commutative_match<Instruction::Mul, Op0_t, Op1_t>
m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) {
- return m_Binary<Instruction::Mul, Op0_t, Op1_t, true>(Op0, Op1);
+ return m_c_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
}
/// Match a binary OR operation. Note that while conceptually the operands can
/// be matched commutatively, \p Commutative defaults to false in line with the
/// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative
/// version of the matcher.
-template <typename Op0_t, typename Op1_t, bool Commutative = false>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or, Commutative>
+template <typename Op0_t, typename Op1_t>
+inline AllRecipe_match<Instruction::Or, Op0_t, Op1_t>
m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
- return m_Binary<Instruction::Or, Op0_t, Op1_t, Commutative>(Op0, Op1);
+ return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
}
template <typename Op0_t, typename Op1_t>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or,
- /*Commutative*/ true>
+inline AllRecipe_commutative_match<Instruction::Or, Op0_t, Op1_t>
m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
- return m_BinaryOr<Op0_t, Op1_t, /*Commutative*/ true>(Op0, Op1);
+ return m_c_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
}
/// ICmp_match is a variant of BinaryRecipe_match that also binds the comparison
@@ -523,9 +451,9 @@ m_SpecificICmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) {
template <typename Op0_t, typename Op1_t>
using GEPLikeRecipe_match =
- BinaryRecipe_match<Op0_t, Op1_t, Instruction::GetElementPtr, false,
- VPWidenRecipe, VPReplicateRecipe, VPWidenGEPRecipe,
- VPInstruction>;
+ Recipe_match<std::tuple<Op0_t, Op1_t>, Instruction::GetElementPtr,
+ /*Commutative*/ false, VPWidenRecipe, VPReplicateRecipe,
+ VPWidenGEPRecipe, VPInstruction>;
template <typename Op0_t, typename Op1_t>
inline GEPLikeRecipe_match<Op0_t, Op1_t> m_GetElementPtr(const Op0_t &Op0,
@@ -533,22 +461,17 @@ inline GEPLikeRecipe_match<Op0_t, Op1_t> m_GetElementPtr(const Op0_t &Op0,
return GEPLikeRecipe_match<Op0_t, Op1_t>(Op0, Op1);
}
-template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode>
-using AllTernaryRecipe_match =
- Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, Opcode, false,
- VPReplicateRecipe, VPInstruction, VPWidenSelectRecipe>;
-
template <typename Op0_t, typename Op1_t, typename Op2_t>
-inline AllTernaryRecipe_match<Op0_t, Op1_t, Op2_t, Instruction::Select>
+inline AllRecipe_match<Instruction::Select, Op0_t, Op1_t, Op2_t>
m_Select(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
- return AllTernaryRecipe_match<Op0_t, Op1_t, Op2_t, Instruction::Select>(
+ return AllRecipe_match<Instruction::Select, Op0_t, Op1_t, Op2_t>(
{Op0, Op1, Op2});
}
template <typename Op0_t>
-inline match_combine_or<UnaryVPInstruction_match<Op0_t, VPInstruction::Not>,
- AllBinaryRecipe_match<int_pred_ty<is_all_ones>, Op0_t,
- Instruction::Xor, true>>
+inline match_combine_or<VPInstruction_match<VPInstruction::Not, Op0_t>,
+ AllRecipe_commutative_match<
+ Instruction::Xor, int_pred_ty<is_all_ones>, Op0_t>>
m_Not(const Op0_t &Op0) {
return m_CombineOr(m_VPInstruction<VPInstruction::Not>(Op0),
m_c_Binary<Instruction::Xor>(m_AllOnes(), Op0));
@@ -556,9 +479,8 @@ m_Not(const Op0_t &Op0) {
template <typename Op0_t, typename Op1_t>
inline match_combine_or<
- BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::LogicalAnd>,
- AllTernaryRecipe_match<Op0_t, Op1_t, specific_intval<1>,
- Instruction::Select>>
+ VPInstruction_match<VPInstruction::LogicalAnd, Op0_t, Op1_t>,
+ AllRecipe_match<Instruction::Select, Op0_t, Op1_t, specific_intval<1>>>
m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) {
return m_CombineOr(
m_VPInstruction<VPInstruction::LogicalAnd, Op0_t, Op1_t>(Op0, Op1),
@@ -566,15 +488,14 @@ m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) {
}
template <typename Op0_t, typename Op1_t>
-inline AllTernaryRecipe_match<Op0_t, specific_intval<1>, Op1_t,
- Instruction::Select>
+inline AllRecipe_match<Instruction::Select, Op0_t, specific_intval<1>, Op1_t>
m_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) {
return m_Select(Op0, m_True(), Op1);
}
template <typename Op0_t, typename Op1_t, typename Op2_t>
-using VPScalarIVSteps_match =
- TernaryRecipe_match<Op0_t, Op1_t, Op2_t, 0, false, VPScalarIVStepsRecipe>;
+using VPScalarIVSteps_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, 0,
+ false, VPScalarIVStepsRecipe>;
template <typename Op0_t, typename Op1_t, typename Op2_t>
inline VPScalarIVSteps_match<Op0_t, Op1_t, Op2_t>
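As a usage sketch of the consolidated matchers above (editorial; `Def` is a hypothetical VPValue *, not part of this patch): the variadic Recipe_match constructor checks the operand arity at compile time, so a commutative multiply can be matched across all supported recipe kinds with:

using namespace llvm::VPlanPatternMatch;
VPValue *X = nullptr, *Y = nullptr;
// Succeeds for `mul X, Y` or `mul Y, X`, whether widened, replicated or a
// VPInstruction; a wrong operand count fails to compile via the static_assert.
if (match(Def, m_c_Mul(m_VPValue(X), m_VPValue(Y)))) {
  // X and Y are bound to the multiply's operands.
}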
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 862b930..cdadc33 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -238,14 +238,11 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
// optimizations will clean it up.
SmallVector<VPValue *, 2> OperandsWithMask;
- unsigned NumIncoming = PhiR->getNumIncoming();
- for (unsigned In = 0; In < NumIncoming; In++) {
- const VPBasicBlock *Pred = PhiR->getIncomingBlock(In);
- OperandsWithMask.push_back(PhiR->getIncomingValue(In));
- VPValue *EdgeMask = getEdgeMask(Pred, VPBB);
+ for (const auto &[InVPV, InVPBB] : PhiR->incoming_values_and_blocks()) {
+ OperandsWithMask.push_back(InVPV);
+ VPValue *EdgeMask = getEdgeMask(InVPBB, VPBB);
if (!EdgeMask) {
- assert(In == 0 && "Both null and non-null edge masks found");
- assert(all_equal(PhiR->operands()) &&
+ assert(all_equal(PhiR->incoming_values()) &&
"Distinct incoming values with one having a full mask");
break;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index e971ba1..7ca9b23 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -452,6 +452,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
switch (Opcode) {
case VPInstruction::StepVector:
+ case VPInstruction::VScale:
return 0;
case Instruction::Alloca:
case Instruction::ExtractValue:
@@ -459,6 +460,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case Instruction::Load:
case VPInstruction::AnyOf:
case VPInstruction::BranchOnCond:
+ case VPInstruction::BuildStructVector:
+ case VPInstruction::BuildVector:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExplicitVectorLength:
@@ -517,6 +520,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::AnyOf:
+ case VPInstruction::Not:
return true;
default:
return false;
@@ -569,7 +573,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
switch (getOpcode()) {
case VPInstruction::Not: {
- Value *A = State.get(getOperand(0));
+ bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
+ Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
return Builder.CreateNot(A, Name);
}
case Instruction::ExtractElement: {
@@ -810,10 +815,18 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *RdxPart = RdxParts[Part];
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
- else
- ReducedPartRdx = Builder.CreateBinOp(
- (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK),
- RdxPart, ReducedPartRdx, "bin.rdx");
+ else {
+ Instruction::BinaryOps Opcode;
+ // For sub-recurrences, each unrolled part's reduction variable is already
+ // negated, so the parts are combined with Add: reduce.add(-acc_uf0 + -acc_uf1).
+ if (RK == RecurKind::Sub)
+ Opcode = Instruction::Add;
+ else
+ Opcode =
+ (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK);
+ ReducedPartRdx =
+ Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
+ }
}
}
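A worked illustration of the Sub-recurrence combining above (editorial, hypothetical UF = 2): each unrolled part already holds a negated partial sum, so the parts are combined with Add rather than with the recurrence's own Sub opcode.

// Loop: acc -= x[i], interleaved with UF = 2:
//   part0   = -(x[0] + x[2] + ...)     // partial accumulator of part 0
//   part1   = -(x[1] + x[3] + ...)     // partial accumulator of part 1
//   bin.rdx = part0 + part1            // Instruction::Add, as selected above
//   acc     = reduce.add(bin.rdx)      // == -(sum of all x[i])
// Combining with Sub instead would compute part0 - part1, flipping the sign of
// part1's contribution.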
@@ -922,6 +935,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
return Res;
}
+ case VPInstruction::ResumeForEpilogue:
+ return State.get(getOperand(0), true);
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -998,6 +1013,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
+ case VPInstruction::ExtractLastElement: {
+ // Add on the cost of extracting the element.
+ auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+ return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
+ VecTy, Ctx.CostKind, 0);
+ }
case VPInstruction::ExtractPenultimateElement:
if (VF == ElementCount::getScalable(1))
return InstructionCost::getInvalid();
@@ -1027,6 +1048,8 @@ bool VPInstruction::isSingleScalar() const {
switch (getOpcode()) {
case Instruction::PHI:
case VPInstruction::ExplicitVectorLength:
+ case VPInstruction::ResumeForEpilogue:
+ case VPInstruction::VScale:
return true;
default:
return isScalarCast();
@@ -1076,6 +1099,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select:
+ case Instruction::PHI:
case VPInstruction::AnyOf:
case VPInstruction::BuildStructVector:
case VPInstruction::BuildVector:
@@ -1093,6 +1117,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::WidePtrAdd:
case VPInstruction::StepVector:
case VPInstruction::ReductionStartVector:
+ case VPInstruction::VScale:
return false;
default:
return true;
@@ -1116,6 +1141,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case Instruction::Select:
case Instruction::Or:
case Instruction::Freeze:
+ case VPInstruction::Not:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
case VPInstruction::ActiveLaneMask:
@@ -1251,6 +1277,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ReductionStartVector:
O << "reduction-start-vector";
break;
+ case VPInstruction::ResumeForEpilogue:
+ O << "resume-for-epilogue";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1281,6 +1310,12 @@ void VPInstructionWithType::execute(VPTransformState &State) {
State.set(this, StepVector);
break;
}
+ case VPInstruction::VScale: {
+ Value *VScale = State.Builder.CreateVScale(ResultTy);
+ State.set(this, VScale, true);
+ break;
+ }
+
default:
llvm_unreachable("opcode not implemented yet");
}
@@ -1301,6 +1336,9 @@ void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::StepVector:
O << "step-vector " << *ResultTy;
break;
+ case VPInstruction::VScale:
+ O << "vscale " << *ResultTy;
+ break;
default:
assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
O << Instruction::getOpcodeName(getOpcode()) << " ";
@@ -1434,12 +1472,12 @@ void VPIRPhi::print(raw_ostream &O, const Twine &Indent,
if (getNumOperands() != 0) {
O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
- interleaveComma(
- enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
- Op.value()->printAsOperand(O, SlotTracker);
- O << " from ";
- getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
- });
+ interleaveComma(incoming_values_and_blocks(), O,
+ [&O, &SlotTracker](auto Op) {
+ std::get<0>(Op)->printAsOperand(O, SlotTracker);
+ O << " from ";
+ std::get<1>(Op)->printAsOperand(O);
+ });
O << ")";
}
}
@@ -2934,7 +2972,6 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
// transform, avoid computing their cost multiple times for now.
Ctx.SkipCostComputation.insert(UI);
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *ResultTy = Ctx.Types.inferScalarType(this);
switch (UI->getOpcode()) {
case Instruction::GetElementPtr:
@@ -2943,6 +2980,24 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
// is scalarized or not. Therefore, we handle GEPs with the memory
// instruction cost.
return 0;
+ case Instruction::Call: {
+ if (!isSingleScalar()) {
+ // TODO: Handle remaining call costs here as well.
+ if (VF.isScalable())
+ return InstructionCost::getInvalid();
+ break;
+ }
+
+ auto *CalledFn =
+ cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
+ if (CalledFn->isIntrinsic())
+ break;
+
+ SmallVector<Type *, 4> Tys;
+ for (VPValue *ArgOp : drop_end(operands()))
+ Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
+ return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+ }
case Instruction::Add:
case Instruction::Sub:
case Instruction::FAdd:
@@ -2960,7 +3015,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
auto Op2Info = Ctx.getOperandInfo(getOperand(1));
SmallVector<const Value *, 4> Operands(UI->operand_values());
return Ctx.TTI.getArithmeticInstrCost(
- UI->getOpcode(), ResultTy, CostKind,
+ UI->getOpcode(), ResultTy, Ctx.CostKind,
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
Op2Info, Operands, UI, &Ctx.TLI) *
(isSingleScalar() ? 1 : VF.getFixedValue());
@@ -3097,9 +3152,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
// Currently, ARM will use the underlying IR to calculate gather/scatter
// instruction cost.
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
+ Type *PtrTy = toVectorTy(Ptr->getType(), VF);
assert(!Reverse &&
"Inconsecutive memory access should not have the order.");
- return Ctx.TTI.getAddressComputationCost(Ty) +
+ return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
+ Ctx.CostKind) +
Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
Ctx.CostKind, &Ingredient);
}
@@ -3445,6 +3502,8 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "Interleave group being replicated.");
+ assert((!NeedsMaskForGaps || !State.VF.isScalable()) &&
+ "Masking gaps for scalable vectors is not yet supported.");
const InterleaveGroup<Instruction> *Group = IG;
Instruction *Instr = Group->getInsertPos();
@@ -3562,8 +3621,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) &&
"Mismatch between NeedsMaskForGaps and MaskForGaps");
- assert((!MaskForGaps || !State.VF.isScalable()) &&
- "masking gaps for scalable vectors is not yet supported.");
ArrayRef<VPValue *> StoredValues = getStoredValues();
// Collect the stored vector from each member.
SmallVector<Value *, 4> StoredVecs;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1c8bd6c..cff43c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -517,10 +517,7 @@ static void removeRedundantCanonicalIVs(VPlan &Plan) {
// everything WidenNewIV's users need. That is, WidenOriginalIV will
// generate a vector phi or all users of WidenNewIV demand the first lane
// only.
- if (any_of(WidenOriginalIV->users(),
- [WidenOriginalIV](VPUser *U) {
- return !U->usesScalars(WidenOriginalIV);
- }) ||
+ if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
vputils::onlyFirstLaneUsed(WidenNewIV)) {
WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
WidenNewIV->eraseFromParent();
@@ -553,8 +550,22 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
// The recipes in the block are processed in reverse order, to catch chains
// of dead recipes.
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
- if (isDeadRecipe(R))
+ if (isDeadRecipe(R)) {
R.eraseFromParent();
+ continue;
+ }
+
+ // Check if R is part of a dead VPPhi <-> update cycle and remove it.
+ auto *PhiR = dyn_cast<VPPhi>(&R);
+ if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)
+ continue;
+ VPValue *Incoming = PhiR->getOperand(1);
+ if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||
+ Incoming->getNumUsers() != 1)
+ continue;
+ PhiR->replaceAllUsesWith(PhiR->getOperand(0));
+ PhiR->eraseFromParent();
+ Incoming->getDefiningRecipe()->eraseFromParent();
}
}
}
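For reference, a minimal illustration (editorial VPlan-style pseudo-IR, not from this patch) of the phi/update cycle that the new check above removes:

//   %p      = phi [ %start, preheader ], [ %p.next, latch ]  ; only user: %p.next
//   %p.next = add %p, 1                                      ; only user: %p
// Neither value is used anywhere else, so uses of %p are rewired to %start and
// both recipes are erased.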
@@ -571,8 +582,7 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
// Truncate base induction if needed.
- Type *CanonicalIVType = CanonicalIV->getScalarType();
- VPTypeAnalysis TypeInfo(CanonicalIVType);
+ VPTypeAnalysis TypeInfo(Plan);
Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
if (TruncI) {
Type *TruncTy = TruncI->getType();
@@ -868,7 +878,7 @@ optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo,
void VPlanTransforms::optimizeInductionExitUsers(
VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) {
VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
- VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
+ VPTypeAnalysis TypeInfo(Plan);
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
for (VPRecipeBase &R : ExitVPBB->phis()) {
auto *ExitIRI = cast<VPIRPhi>(&R);
@@ -970,10 +980,11 @@ static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],
Ops[1],
cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
- case Instruction::InsertElement:
- return Folder.FoldInsertElement(Ops[0], Ops[1], Ops[2]);
+ // An extract of a live-in is an extract of a broadcast, so return the
+ // broadcasted element.
case Instruction::ExtractElement:
- return Folder.FoldExtractElement(Ops[0], Ops[1]);
+ assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
+ return Ops[0];
}
return nullptr;
}
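Illustration of the fold above (editorial): a live-in is a uniform scalar, so after broadcasting every lane holds the same value and any extract of it folds back to the live-in.

//   extractelement (broadcast %live.in), <any index>  -->  %live.in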
@@ -1041,7 +1052,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
#ifndef NDEBUG
// Verify that the cached type info for both A and its users is still
// accurate by comparing it to freshly computed types.
- VPTypeAnalysis TypeInfo2(Plan->getCanonicalIV()->getScalarType());
+ VPTypeAnalysis TypeInfo2(*Plan);
assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
for (VPUser *U : A->users()) {
auto *R = cast<VPRecipeBase>(U);
@@ -1202,9 +1213,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- if (match(Def,
- m_VPInstruction<VPInstruction::ExtractLastElement>(
- m_VPInstruction<VPInstruction::Broadcast>(m_VPValue(A))))) {
+ if (match(Def, m_VPInstruction<VPInstruction::ExtractLastElement>(
+ m_Broadcast(m_VPValue(A))))) {
Def->replaceAllUsesWith(A);
return;
}
@@ -1218,10 +1228,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
}
}
-void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
+void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
Plan.getEntry());
- VPTypeAnalysis TypeInfo(&CanonicalIVTy);
+ VPTypeAnalysis TypeInfo(Plan);
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
simplifyRecipe(R, TypeInfo);
@@ -1251,9 +1261,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
// scalar results used. In the latter case, we would introduce extra
// broadcasts.
if (!vputils::isSingleScalar(RepOrWidenR) ||
- any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
- return !U->usesScalars(RepOrWidenR);
- }))
+ !vputils::onlyScalarValuesUsed(RepOrWidenR))
continue;
auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
@@ -1485,7 +1493,6 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
// the region, otherwise replace the terminator controlling the latch with
// (BranchOnCond true).
auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
- auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
if (all_of(Header->phis(),
IsaPred<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
VPFirstOrderRecurrencePHIRecipe, VPPhi>)) {
@@ -1505,7 +1512,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
VPBlockUtils::connectBlocks(Preheader, Header);
VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
- VPlanTransforms::simplifyRecipes(Plan, *CanIVTy);
+ VPlanTransforms::simplifyRecipes(Plan);
} else {
// The vector region contains header phis for which we cannot remove the
// loop region yet.
@@ -1748,7 +1755,8 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
if (!PhiR)
continue;
RecurKind RK = PhiR->getRecurrenceKind();
- if (RK != RecurKind::Add && RK != RecurKind::Mul)
+ if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
+ RK != RecurKind::AddChainWithSubs)
continue;
for (VPUser *U : collectUsersRecursively(PhiR))
@@ -1799,8 +1807,7 @@ void VPlanTransforms::truncateToMinimalBitwidths(
// other uses have different types for their operands, making them invalidly
// typed.
DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
- Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
- VPTypeAnalysis TypeInfo(CanonicalIVType);
+ VPTypeAnalysis TypeInfo(Plan);
VPBasicBlock *PH = Plan.getVectorPreheader();
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
@@ -1828,8 +1835,7 @@ void VPlanTransforms::truncateToMinimalBitwidths(
assert(OldResTy->isIntegerTy() && "only integer types supported");
(void)OldResSizeInBits;
- LLVMContext &Ctx = CanonicalIVType->getContext();
- auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
+ auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
// Any wrapping introduced by shrinking this operation shouldn't be
// considered undefined behavior. So, we can't unconditionally copy
@@ -1920,13 +1926,13 @@ void VPlanTransforms::optimize(VPlan &Plan) {
runPass(removeRedundantCanonicalIVs, Plan);
runPass(removeRedundantInductionCasts, Plan);
- runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
+ runPass(simplifyRecipes, Plan);
runPass(simplifyBlends, Plan);
runPass(removeDeadRecipes, Plan);
runPass(narrowToSingleScalarRecipes, Plan);
runPass(legalizeAndOptimizeInductions, Plan);
runPass(removeRedundantExpandSCEVRecipes, Plan);
- runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
+ runPass(simplifyRecipes, Plan);
runPass(removeBranchOnConst, Plan);
runPass(removeDeadRecipes, Plan);
@@ -2039,11 +2045,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
return LaneMaskPhi;
}
-/// Collect all VPValues representing a header mask through the (ICMP_ULE,
-/// WideCanonicalIV, backedge-taken-count) pattern.
+/// Find the single header mask matching the pattern:
+/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
/// TODO: Introduce explicit recipe for header-mask instead of searching
/// for the header-mask pattern manually.
-static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
+static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
SmallVector<VPValue *> WideCanonicalIVs;
auto *FoundWidenCanonicalIVUser =
find_if(Plan.getCanonicalIV()->users(),
@@ -2067,21 +2073,22 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
WideCanonicalIVs.push_back(WidenOriginalIV);
}
- // Walk users of wide canonical IVs and collect to all compares of the form
+ // Walk users of wide canonical IVs and find the single compare of the form
// (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
- SmallVector<VPValue *> HeaderMasks;
+ VPSingleDefRecipe *HeaderMask = nullptr;
for (auto *Wide : WideCanonicalIVs) {
for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
- auto *HeaderMask = dyn_cast<VPInstruction>(U);
- if (!HeaderMask || !vputils::isHeaderMask(HeaderMask, Plan))
+ auto *VPI = dyn_cast<VPInstruction>(U);
+ if (!VPI || !vputils::isHeaderMask(VPI, Plan))
continue;
- assert(HeaderMask->getOperand(0) == Wide &&
+ assert(VPI->getOperand(0) == Wide &&
"WidenCanonicalIV must be the first operand of the compare");
- HeaderMasks.push_back(HeaderMask);
+ assert(!HeaderMask && "Multiple header masks found?");
+ HeaderMask = VPI;
}
}
- return HeaderMasks;
+ return HeaderMask;
}
void VPlanTransforms::addActiveLaneMask(
@@ -2097,6 +2104,7 @@ void VPlanTransforms::addActiveLaneMask(
[](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
assert(FoundWidenCanonicalIVUser &&
"Must have widened canonical IV when tail folding!");
+ VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
auto *WideCanonicalIV =
cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
VPSingleDefRecipe *LaneMask;
@@ -2110,11 +2118,11 @@ void VPlanTransforms::addActiveLaneMask(
"active.lane.mask");
}
- // Walk users of WideCanonicalIV and replace all compares of the form
- // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
- // active-lane-mask.
- for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
- HeaderMask->replaceAllUsesWith(LaneMask);
+ // Walk users of WideCanonicalIV and replace the header mask of the form
+ // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
+ // removing the old one to ensure there is always only a single header mask.
+ HeaderMask->replaceAllUsesWith(LaneMask);
+ HeaderMask->eraseFromParent();
}
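A worked example of the replacement performed above (editorial, hypothetical trip count 10 and VF = 4): on the iteration covering elements 8..11, the wide canonical IV is <8,9,10,11> and the backedge-taken count is 9.

//   Header mask:       icmp ule <8,9,10,11>, 9   -> <1,1,0,0>
//   Active lane mask:  active.lane.mask(8, 10)   -> <1,1,0,0>  (lane i set iff 8 + i < 10)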
/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
@@ -2130,6 +2138,8 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPRecipeBase &CurRecipe,
VPTypeAnalysis &TypeInfo,
VPValue &AllOneMask, VPValue &EVL) {
+ // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
+ // header mask.
auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
assert(OrigMask && "Unmasked recipe when folding tail");
// HeaderMask will be handled using EVL.
@@ -2139,14 +2149,35 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return HeaderMask == OrigMask ? nullptr : OrigMask;
};
+ /// Adjust any end pointers so that they point to the end of EVL lanes, not VF.
+ auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
+ auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
+ if (!EndPtr)
+ return Addr;
+ assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
+ "VPVectorEndPointerRecipe with non-VF VF operand?");
+ assert(
+ all_of(EndPtr->users(),
+ [](VPUser *U) {
+ return cast<VPWidenMemoryRecipe>(U)->isReverse();
+ }) &&
+ "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
+ VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
+ EVLAddr->insertBefore(&CurRecipe);
+ EVLAddr->setOperand(1, &EVL);
+ return EVLAddr;
+ };
+
return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
.Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
VPValue *NewMask = GetNewMask(L->getMask());
- return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
+ VPValue *NewAddr = GetNewAddr(L->getAddr());
+ return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
})
.Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
VPValue *NewMask = GetNewMask(S->getMask());
- return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
+ VPValue *NewAddr = GetNewAddr(S->getAddr());
+ return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
})
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
VPValue *NewMask = GetNewMask(Red->getCondOp());
@@ -2172,9 +2203,7 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
/// Replace recipes with their EVL variants.
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
- Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
- VPTypeAnalysis TypeInfo(CanonicalIVType);
- LLVMContext &Ctx = CanonicalIVType->getContext();
+ VPTypeAnalysis TypeInfo(Plan);
VPValue *AllOneMask = Plan.getTrue();
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
@@ -2183,7 +2212,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
VPWidenIntOrFpInductionRecipe>) &&
"User of VF that we can't transform to EVL.");
- Plan.getVF().replaceAllUsesWith(&EVL);
+ Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
+ return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
+ });
assert(all_of(Plan.getVFxUF().users(),
[&Plan](VPUser *U) {
@@ -2213,9 +2244,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPValue *MaxEVL = &Plan.getVF();
// Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
VPBuilder Builder(LoopRegion->getPreheaderVPBB());
- MaxEVL = Builder.createScalarZExtOrTrunc(MaxEVL, Type::getInt32Ty(Ctx),
- TypeInfo.inferScalarType(MaxEVL),
- DebugLoc());
+ MaxEVL = Builder.createScalarZExtOrTrunc(
+ MaxEVL, Type::getInt32Ty(Plan.getContext()),
+ TypeInfo.inferScalarType(MaxEVL), DebugLoc());
Builder.setInsertPoint(Header, Header->getFirstNonPhi());
VPValue *PrevEVL =
@@ -2230,7 +2261,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
m_VPValue(V1), m_VPValue(V2))))
continue;
VPValue *Imm = Plan.getOrAddLiveIn(
- ConstantInt::getSigned(Type::getInt32Ty(Ctx), -1));
+ ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_splice,
{V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
@@ -2242,47 +2273,51 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
}
+ VPValue *HeaderMask = findHeaderMask(Plan);
+ if (!HeaderMask)
+ return;
+
+ // Replace header masks with a mask equivalent to predicating by EVL:
+ //
+ // icmp ule widen-canonical-iv backedge-taken-count
+ // ->
+ // icmp ult step-vector, EVL
+ VPRecipeBase *EVLR = EVL.getDefiningRecipe();
+ VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
+ Type *EVLType = TypeInfo.inferScalarType(&EVL);
+ VPValue *EVLMask = Builder.createICmp(
+ CmpInst::ICMP_ULT,
+ Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
+ HeaderMask->replaceAllUsesWith(EVLMask);
+ ToErase.push_back(HeaderMask->getDefiningRecipe());
+
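// Editorial illustration of the equivalence (hypothetical VF = 4, 10 elements,
// iteration starting at element 8, so EVL = 2):
//   old header mask: icmp ule <8,9,10,11>, 9            -> <1,1,0,0>
//   new mask:        icmp ult step-vector <0,1,2,3>, 2  -> <1,1,0,0>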
// Try to optimize header mask recipes away to their EVL variants.
- for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
- // TODO: Split optimizeMaskToEVL out and move into
- // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
- // tryToBuildVPlanWithVPRecipes beforehand.
- for (VPUser *U : collectUsersRecursively(HeaderMask)) {
- auto *CurRecipe = cast<VPRecipeBase>(U);
- VPRecipeBase *EVLRecipe =
- optimizeMaskToEVL(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
- if (!EVLRecipe)
- continue;
+ // TODO: Split optimizeMaskToEVL out and move into
+ // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
+ // tryToBuildVPlanWithVPRecipes beforehand.
+ for (VPUser *U : collectUsersRecursively(EVLMask)) {
+ auto *CurRecipe = cast<VPRecipeBase>(U);
+ VPRecipeBase *EVLRecipe =
+ optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
+ if (!EVLRecipe)
+ continue;
- [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
- assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
- "New recipe must define the same number of values as the "
- "original.");
- assert(
- NumDefVal <= 1 &&
- "Only supports recipes with a single definition or without users.");
- EVLRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
- VPValue *CurVPV = CurRecipe->getVPSingleValue();
- CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
- }
- ToErase.push_back(CurRecipe);
+ [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
+ assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
+ "New recipe must define the same number of values as the "
+ "original.");
+ assert(NumDefVal <= 1 &&
+ "Only supports recipes with a single definition or without users.");
+ EVLRecipe->insertBefore(CurRecipe);
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
+ VPValue *CurVPV = CurRecipe->getVPSingleValue();
+ CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
}
-
- // Replace header masks with a mask equivalent to predicating by EVL:
- //
- // icmp ule widen-canonical-iv backedge-taken-count
- // ->
- // icmp ult step-vector, EVL
- VPRecipeBase *EVLR = EVL.getDefiningRecipe();
- VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
- Type *EVLType = TypeInfo.inferScalarType(&EVL);
- VPValue *EVLMask = Builder.createICmp(
- CmpInst::ICMP_ULT,
- Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
- HeaderMask->replaceAllUsesWith(EVLMask);
- ToErase.push_back(HeaderMask->getDefiningRecipe());
+ ToErase.push_back(CurRecipe);
}
+ // Remove dead EVL mask.
+ if (EVLMask->getNumUsers() == 0)
+ ToErase.push_back(EVLMask->getDefiningRecipe());
for (VPRecipeBase *R : reverse(ToErase)) {
SmallVector<VPValue *> PossiblyDead(R->operands());
@@ -2368,7 +2403,7 @@ void VPlanTransforms::addExplicitVectorLength(
Builder.setInsertPoint(CanonicalIVIncrement);
VPValue *OpVPEVL = VPEVL;
- auto *I32Ty = Type::getInt32Ty(CanIVTy->getContext());
+ auto *I32Ty = Type::getInt32Ty(Plan.getContext());
OpVPEVL = Builder.createScalarZExtOrTrunc(
OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
@@ -2579,10 +2614,10 @@ void VPlanTransforms::createInterleaveGroups(
auto *InsertPos =
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
- bool InBounds = false;
+ GEPNoWrapFlags NW = GEPNoWrapFlags::none();
if (auto *Gep = dyn_cast<GetElementPtrInst>(
getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
- InBounds = Gep->isInBounds();
+ NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
// Get or create the start address for the interleave group.
auto *Start =
@@ -2606,8 +2641,7 @@ void VPlanTransforms::createInterleaveGroups(
VPValue *OffsetVPV =
Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
VPBuilder B(InsertPos);
- Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
- : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
+ Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
}
// If the group is reverse, adjust the index to refer to the last vector
// lane instead of the first. We adjust the index from the first vector
@@ -2616,9 +2650,7 @@ void VPlanTransforms::createInterleaveGroups(
if (IG->isReverse()) {
auto *ReversePtr = new VPVectorEndPointerRecipe(
Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
- -(int64_t)IG->getFactor(),
- InBounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(),
- InsertPos->getDebugLoc());
+ -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
ReversePtr->insertBefore(InsertPos);
Addr = ReversePtr;
}
@@ -2711,7 +2743,7 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
// Construct the initial value of the vector IV in the vector loop preheader.
Type *IVIntTy =
- IntegerType::get(StepTy->getContext(), StepTy->getScalarSizeInBits());
+ IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
if (StepTy->isFloatingPointTy())
Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
@@ -2838,9 +2870,8 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
R->dissolveToCFGLoop();
}
-void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
- Type &CanonicalIVTy) {
- VPTypeAnalysis TypeInfo(&CanonicalIVTy);
+void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
+ VPTypeAnalysis TypeInfo(Plan);
SmallVector<VPRecipeBase *> ToRemove;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
@@ -3204,8 +3235,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
auto *VectorPreheader = Plan.getVectorPreheader();
for (VPValue *VPV : VPValues) {
- if (all_of(VPV->users(),
- [VPV](VPUser *U) { return U->usesScalars(VPV); }) ||
+ if (vputils::onlyScalarValuesUsed(VPV) ||
(VPV->isLiveIn() && VPV->getLiveInIRValue() &&
isa<Constant>(VPV->getLiveInIRValue())))
continue;
@@ -3278,6 +3308,149 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
BTC->replaceAllUsesWith(TCMO);
}
+void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
+ if (Plan.hasScalarVFOnly())
+ return;
+
+ VPTypeAnalysis TypeInfo(Plan);
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getEntry()));
+ auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(LoopRegion->getEntry()));
+ // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
+ // excluding ones in replicate regions. Those are not materialized explicitly
+ // yet; their vector users are still handled in VPReplicateRecipe::execute(),
+ // via shouldPack().
+ // TODO: materialize build vectors for replicating recipes in replicate
+ // regions.
+ // TODO: materialize build vectors for VPInstructions.
+ for (VPBasicBlock *VPBB :
+ concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+ auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) {
+ VPRegionBlock *ParentRegion =
+ cast<VPRecipeBase>(U)->getParent()->getParent();
+ return !U->usesScalars(RepR) || ParentRegion != LoopRegion;
+ };
+ if (!RepR || RepR->isSingleScalar() ||
+ none_of(RepR->users(), UsesVectorOrInsideReplicateRegion))
+ continue;
+
+ Type *ScalarTy = TypeInfo.inferScalarType(RepR);
+ unsigned Opcode = ScalarTy->isStructTy()
+ ? VPInstruction::BuildStructVector
+ : VPInstruction::BuildVector;
+ auto *BuildVector = new VPInstruction(Opcode, {RepR});
+ BuildVector->insertAfter(RepR);
+
+ RepR->replaceUsesWithIf(
+ BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
+ VPUser &U, unsigned) {
+ return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
+ });
+ }
+ }
+}
+
+void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
+ VPBasicBlock *VectorPHVPBB,
+ bool TailByMasking,
+ bool RequiresScalarEpilogue) {
+ VPValue &VectorTC = Plan.getVectorTripCount();
+ assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
+ // There's nothing to do if the vector trip count has no users or its IR value
+ // has already been set.
+ if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
+ return;
+
+ VPValue *TC = Plan.getTripCount();
+ Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
+ VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
+ VPValue *Step = &Plan.getVFxUF();
+
+ // If the tail is to be folded by masking, round the number of iterations N
+ // up to a multiple of Step instead of rounding down. This is done by first
+ // adding Step-1 and then rounding down. Note that it's ok if this addition
+ // overflows: the vector induction variable will eventually wrap to zero given
+ // that it starts at zero and its Step is a power of two; the loop will then
+ // exit, with the last early-exit vector comparison also producing all-true.
+ // For scalable vectors the VF is not guaranteed to be a power of 2, but this
+ // is accounted for in emitIterationCountCheck that adds an overflow check.
+ if (TailByMasking) {
+ TC = Builder.createNaryOp(
+ Instruction::Add,
+ {TC, Builder.createNaryOp(
+ Instruction::Sub,
+ {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
+ DebugLoc::getCompilerGenerated(), "n.rnd.up");
+ }
+
+ // Now we need to generate the expression for the part of the loop that the
+ // vectorized body will execute. This is equal to N - (N % Step) if scalar
+ // iterations are not required for correctness, or N - Step, otherwise. Step
+ // is equal to the vectorization factor (number of SIMD elements) times the
+ // unroll factor (number of SIMD instructions).
+ VPValue *R =
+ Builder.createNaryOp(Instruction::URem, {TC, Step},
+ DebugLoc::getCompilerGenerated(), "n.mod.vf");
+
+ // There are cases where we *must* run at least one iteration in the remainder
+ // loop. See the cost model for when this can happen. If the step evenly
+ // divides the trip count, we set the remainder to be equal to the step. If
+ // the step does not evenly divide the trip count, no adjustment is necessary
+ // since there will already be scalar iterations. Note that the minimum
+ // iterations check ensures that N >= Step.
+ if (RequiresScalarEpilogue) {
+ assert(!TailByMasking &&
+ "requiring scalar epilogue is not supported with fail folding");
+ VPValue *IsZero = Builder.createICmp(
+ CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
+ R = Builder.createSelect(IsZero, Step, R);
+ }
+
+ VPValue *Res = Builder.createNaryOp(
+ Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
+ VectorTC.replaceAllUsesWith(Res);
+}
+
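A scalar model of the trip-count expression materialized above may help; this is an editorial sketch (the function name and numbers are illustrative, not part of the patch):

#include <cstdint>

// Mirrors the n.rnd.up / n.mod.vf / n.vec sequence built from VPInstructions.
uint64_t vectorTripCount(uint64_t N, uint64_t Step, bool TailByMasking,
                         bool RequiresScalarEpilogue) {
  if (TailByMasking)
    N += Step - 1;           // n.rnd.up: round N up to a multiple of Step
  uint64_t R = N % Step;     // n.mod.vf
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                // force at least one scalar (epilogue) iteration
  return N - R;              // n.vec
}
// E.g. N = 23, Step = VF * UF = 8: plain -> 16; tail-folded -> 24 (three masked
// vector iterations); required scalar epilogue -> 16 (7 scalar iterations).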
+void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
+ ElementCount VFEC) {
+ VPBuilder Builder(VectorPH, VectorPH->begin());
+ Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
+ VPValue &VF = Plan.getVF();
+ VPValue &VFxUF = Plan.getVFxUF();
+ // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
+ // used.
+ // TODO: Assert that they aren't used.
+
+ // If there are no users of the runtime VF, compute VFxUF by constant folding
+ // the multiplication of VF and UF.
+ if (VF.getNumUsers() == 0) {
+ VPValue *RuntimeVFxUF =
+ Builder.createElementCount(TCTy, VFEC * Plan.getUF());
+ VFxUF.replaceAllUsesWith(RuntimeVFxUF);
+ return;
+ }
+
+ // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
+ // vscale) * UF.
+ VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
+ if (!vputils::onlyScalarValuesUsed(&VF)) {
+ VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
+ VF.replaceUsesWithIf(
+ BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
+ }
+ VF.replaceAllUsesWith(RuntimeVF);
+
+ VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
+ VPValue *MulByUF = Plan.getUF() == 1 ? RuntimeVF
+ : Builder.createNaryOp(Instruction::Mul,
+ {RuntimeVF, UF});
+ VFxUF.replaceAllUsesWith(MulByUF);
+}
+
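As an illustration of the two paths above (editorial, hypothetical VF = vscale x 4 and UF = 2):

// * Plan.getVF() has no users: VFxUF is emitted directly as the element count
//   vscale x 8 (VFEC * UF); no separate runtime-VF computation is needed.
// * Plan.getVF() has users: RuntimeVF = 4 * vscale is materialized first,
//   broadcast where a vector value is needed, and VFxUF = RuntimeVF * 2.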
/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
@@ -3346,9 +3519,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
if (VF.isScalable() || !VectorLoop)
return;
- VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
- Type *CanonicalIVType = CanonicalIV->getScalarType();
- VPTypeAnalysis TypeInfo(CanonicalIVType);
+ VPTypeAnalysis TypeInfo(Plan);
unsigned FixedVF = VF.getFixedValue();
SmallVector<VPInterleaveRecipe *> StoreGroups;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index cc50c75..5b3d18b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -54,21 +54,30 @@ struct VPlanTransforms {
verifyVPlanIsValid(Plan);
}
- LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan> buildPlainCFG(Loop *TheLoop,
- LoopInfo &LI);
-
- /// Prepare the plan for vectorization. It will introduce a dedicated
- /// VPBasicBlock for the vector pre-header as well as a VPBasicBlock as exit
- /// block of the main vector loop (middle.block). If a check is needed to
- /// guard executing the scalar epilogue loop, it will be added to the middle
- /// block, together with VPBasicBlocks for the scalar preheader and exit
- /// blocks. \p InductionTy is the type of the canonical induction and used for
- /// related values, like the trip count expression. It also creates a VPValue
- /// expression for the original trip count.
- LLVM_ABI_FOR_TEST static void prepareForVectorization(
- VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE,
- bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop,
- DebugLoc IVDL, bool HasUncountableExit, VFRange &Range);
+ /// Create a base VPlan0, serving as the common starting point for all later
+ /// candidates. It consists of an initial plain CFG loop with loop blocks from
+ /// \p TheLoop being translated directly to VPBasicBlocks with VPInstructions
+ /// corresponding to the input IR.
+ ///
+ /// The created loop is wrapped in an initial skeleton to facilitate
+ /// vectorization, consisting of a vector pre-header, an exit block for the
+ /// main vector loop (middle.block) and a new block as preheader of the scalar
+ /// loop (scalar.ph). It also adds a canonical IV and its increment, using \p
+ /// InductionTy and \p IVDL, and creates a VPValue expression for the original
+ /// trip count.
+ LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
+ buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
+ PredicatedScalarEvolution &PSE);
+
+ /// Update \p Plan to account for all early exits.
+ LLVM_ABI_FOR_TEST static void
+ handleEarlyExits(VPlan &Plan, bool HasUncountableExit, VFRange &Range);
+
+ /// If a check is needed to guard executing the scalar epilogue loop, it will
+ /// be added to the middle block.
+ LLVM_ABI_FOR_TEST static void addMiddleCheck(VPlan &Plan,
+ bool RequiresScalarEpilogueCheck,
+ bool TailFolded);
/// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
/// flat CFG into a hierarchical CFG.
@@ -113,7 +122,7 @@ struct VPlanTransforms {
static void clearReductionWrapFlags(VPlan &Plan);
/// Explicitly unroll \p Plan by \p UF.
- static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx);
+ static void unrollByUF(VPlan &Plan, unsigned UF);
/// Replace each VPReplicateRecipe outside of any replicate region in \p Plan
/// with \p VF single-scalar recipes.
@@ -220,9 +229,8 @@ struct VPlanTransforms {
/// EVLIVInc, TripCount).
static void canonicalizeEVLLoops(VPlan &Plan);
- /// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p
- /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
- static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);
+ /// Lower abstract recipes to concrete ones that can be codegen'd.
+ static void convertToConcreteRecipes(VPlan &Plan);
/// This function converts initial recipes to the abstract recipes and clamps
/// \p Range based on cost model for following optimizations and cost
@@ -231,9 +239,8 @@ struct VPlanTransforms {
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
VFRange &Range);
- /// Perform instcombine-like simplifications on recipes in \p Plan. Use \p
- /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
- static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy);
+ /// Perform instcombine-like simplifications on recipes in \p Plan.
+ static void simplifyRecipes(VPlan &Plan);
/// Remove BranchOnCond recipes with true or false conditions together with
/// removing dead edges to their successors.
@@ -256,11 +263,25 @@ struct VPlanTransforms {
unsigned BestUF,
PredicatedScalarEvolution &PSE);
+ /// Materialize vector trip count computations to a set of VPInstructions.
+ static void materializeVectorTripCount(VPlan &Plan,
+ VPBasicBlock *VectorPHVPBB,
+ bool TailByMasking,
+ bool RequiresScalarEpilogue);
+
/// Materialize the backedge-taken count to be computed explicitly using
/// VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan,
VPBasicBlock *VectorPH);
+ /// Add explicit Build[Struct]Vector recipes that combine multiple scalar
+ /// values into single vectors.
+ static void materializeBuildVectors(VPlan &Plan);
+
+ /// Materialize VF and VFxUF to be computed explicitly using VPInstructions.
+ static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
+ ElementCount VF);
+
/// Try to convert a plan with interleave groups with VF elements to a plan
/// with the interleave groups replaced by wide loads and stores processing VF
/// elements, if all transformed interleave groups access the full vector
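
A hypothetical caller-side sketch of the header changes above, assuming the three new declarations replace the former single prepareForVectorization entry point and are invoked in this order; the wrapper name and argument variables are illustrative and this relies on the in-tree VPlan headers:

static std::unique_ptr<VPlan>
buildInitialVPlan(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
                  PredicatedScalarEvolution &PSE, bool HasUncountableExit,
                  VFRange &Range, bool RequiresScalarEpilogueCheck,
                  bool TailFolded) {
  // Build the shared starting point: plain CFG plus the initial skeleton.
  auto Plan = VPlanTransforms::buildVPlan0(TheLoop, LI, InductionTy, IVDL, PSE);
  // Account for early exits, then guard the scalar epilogue if required.
  VPlanTransforms::handleEarlyExits(*Plan, HasUncountableExit, Range);
  VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
                                  TailFolded);
  return Plan;
}
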
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index fc072de..62fd83a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -74,8 +74,7 @@ class UnrollState {
}
public:
- UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
- : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}
+ UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF), TypeInfo(Plan) {}
void unrollBlock(VPBlockBase *VPB);
@@ -409,7 +408,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
}
}
-void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
+void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
assert(UF > 0 && "Unroll factor must be positive");
Plan.setUF(UF);
auto Cleanup = make_scope_exit([&Plan]() {
@@ -431,7 +430,7 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
return;
}
- UnrollState Unroller(Plan, UF, Ctx);
+ UnrollState Unroller(Plan, UF);
// Iterate over all blocks in the plan starting from Entry, and unroll
// recipes inside them. This includes the vector preheader and middle blocks,
@@ -465,10 +464,12 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
VPlanTransforms::removeDeadRecipes(Plan);
}
-/// Create a single-scalar clone of \p RepR for lane \p Lane.
-static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
- Type *IdxTy, VPReplicateRecipe *RepR,
- VPLane Lane) {
+/// Create a single-scalar clone of \p RepR for lane \p Lane. Use \p
+/// Def2LaneDefs to look up scalar definitions for operands of \p RepR.
+static VPReplicateRecipe *
+cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
+ VPReplicateRecipe *RepR, VPLane Lane,
+ const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
// Collect the operands at Lane, creating extracts as needed.
SmallVector<VPValue *> NewOps;
for (VPValue *Op : RepR->operands()) {
@@ -481,6 +482,14 @@ static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
continue;
}
+ // If Op is a definition that has been unrolled, directly use the clone for
+ // the corresponding lane.
+ auto LaneDefs = Def2LaneDefs.find(Op);
+ if (LaneDefs != Def2LaneDefs.end()) {
+ NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
+ continue;
+ }
+
// Look through buildvector to avoid unnecessary extracts.
if (match(Op, m_BuildVector())) {
NewOps.push_back(
@@ -513,6 +522,13 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
auto VPBBsToUnroll =
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
+ // A mapping of current VPValue definitions to collections of new VPValues
+ // defined per lane. Serves to hook up potential users of a current VPValue
+ // definition that are replicated per VF later.
+ DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs;
+ // The removal of current recipes being replaced by new ones needs to be
+ // delayed until after Def2LaneDefs is no longer in use.
+ SmallVector<VPRecipeBase *> ToRemove;
for (VPBasicBlock *VPBB : VPBBsToUnroll) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
@@ -524,12 +540,12 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
vputils::isSingleScalar(RepR->getOperand(1))) {
// Stores to invariant addresses need to store the last lane only.
- cloneForLane(Plan, Builder, IdxTy, RepR,
- VPLane::getLastLaneForVF(VF));
+ cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
+ Def2LaneDefs);
} else {
// Create single-scalar version of RepR for all lanes.
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
- cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I));
+ cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
}
RepR->eraseFromParent();
continue;
@@ -537,23 +553,33 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
/// Create single-scalar version of RepR for all lanes.
SmallVector<VPValue *> LaneDefs;
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
- LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));
+ LaneDefs.push_back(
+ cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs));
+ Def2LaneDefs[RepR] = LaneDefs;
/// Users that only demand the first lane can use the definition for lane
/// 0.
RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
return U.onlyFirstLaneUsed(RepR);
});
- // If needed, create a Build(Struct)Vector recipe to insert the scalar
- // lane values into a vector.
- Type *ResTy = RepR->getUnderlyingInstr()->getType();
- VPValue *VecRes = Builder.createNaryOp(
- ResTy->isStructTy() ? VPInstruction::BuildStructVector
- : VPInstruction::BuildVector,
- LaneDefs);
- RepR->replaceAllUsesWith(VecRes);
- RepR->eraseFromParent();
+ // Update each Build(Struct)Vector user that currently has RepR as its only
+ // operand to have all LaneDefs as its operands.
+ for (VPUser *U : to_vector(RepR->users())) {
+ auto *VPI = dyn_cast<VPInstruction>(U);
+ if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
+ VPI->getOpcode() != VPInstruction::BuildStructVector))
+ continue;
+ assert(VPI->getNumOperands() == 1 &&
+ "Build(Struct)Vector must have a single operand before "
+ "replicating by VF");
+ VPI->setOperand(0, LaneDefs[0]);
+ for (VPValue *LaneDef : drop_begin(LaneDefs))
+ VPI->addOperand(LaneDef);
+ }
+ ToRemove.push_back(RepR);
}
}
+ for (auto *R : reverse(ToRemove))
+ R->eraseFromParent();
}
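
To make the Def2LaneDefs bookkeeping in replicateByVF above concrete, here is a self-contained sketch in plain C++ (std::map and strings stand in for the DenseMap of VPValues; all names are illustrative): once a definition has been split into per-lane clones, a later per-lane user looks up the clone for its lane instead of extracting lane values from a vector.

#include <cassert>
#include <map>
#include <string>
#include <vector>

using Value = std::string; // stand-in for a VPValue

int main() {
  const unsigned VF = 4;
  std::map<Value, std::vector<Value>> Def2LaneDefs;

  // "Replicate" a definition %d into VF single-lane clones %d.0 .. %d.3.
  Value Def = "%d";
  for (unsigned Lane = 0; Lane != VF; ++Lane)
    Def2LaneDefs[Def].push_back(Def + "." + std::to_string(Lane));

  // A later replicated user of %d at lane 2 uses the clone for lane 2.
  unsigned Lane = 2;
  auto It = Def2LaneDefs.find(Def);
  Value Operand = It != Def2LaneDefs.end() ? It->second[Lane]
                                           : Def; // fall back to extracting
  assert(Operand == "%d.2");
  return 0;
}
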
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 14f20c6..b2230c4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -24,6 +24,11 @@ bool vputils::onlyFirstPartUsed(const VPValue *Def) {
[Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); });
}
+bool vputils::onlyScalarValuesUsed(const VPValue *Def) {
+ return all_of(Def->users(),
+ [Def](const VPUser *U) { return U->usesScalars(Def); });
+}
+
VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
ScalarEvolution &SE) {
if (auto *Expanded = Plan.getSCEVExpansion(Expr))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 8dcd57f..3cf02b6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -25,6 +25,9 @@ bool onlyFirstLaneUsed(const VPValue *Def);
/// Returns true if only the first part of \p Def is used.
bool onlyFirstPartUsed(const VPValue *Def);
+/// Returns true if only scalar values of \p Def are used by all users.
+bool onlyScalarValuesUsed(const VPValue *Def);
+
/// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p
/// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in
/// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 3417e1c..e25ffe1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -183,6 +183,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
case Instruction::ZExt:
case Instruction::Mul:
case Instruction::FMul:
+ case VPInstruction::Broadcast:
// Opcodes above can only use EVL after wide inductions have been
// expanded.
if (!VerifyLate) {
@@ -250,17 +251,15 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
for (const VPUser *U : V->users()) {
auto *UI = cast<VPRecipeBase>(U);
if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) {
- for (unsigned Idx = 0; Idx != Phi->getNumIncoming(); ++Idx) {
- VPValue *IncomingVPV = Phi->getIncomingValue(Idx);
+ for (const auto &[IncomingVPV, IncomingVPBB] :
+ Phi->incoming_values_and_blocks()) {
if (IncomingVPV != V)
continue;
- const VPBasicBlock *IncomingVPBB = Phi->getIncomingBlock(Idx);
if (VPDT.dominates(VPBB, IncomingVPBB))
continue;
- errs() << "Incoming def at index " << Idx
- << " does not dominate incoming block!\n";
+ errs() << "Incoming def does not dominate incoming block!\n";
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
VPSlotTracker Tracker(VPBB->getPlan());
IncomingVPV->getDefiningRecipe()->print(errs(), " ", Tracker);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 6345b18..1275d53 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -29,11 +30,13 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
+#include <optional>
#include <queue>
#include <set>
@@ -74,7 +77,7 @@ public:
const DataLayout *DL, TTI::TargetCostKind CostKind,
bool TryEarlyFoldsOnly)
: F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
- DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind),
+ DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
bool run();
@@ -88,6 +91,7 @@ private:
AssumptionCache &AC;
const DataLayout *DL;
TTI::TargetCostKind CostKind;
+ const SimplifyQuery SQ;
/// If true, only perform beneficial early IR transforms. Do not introduce new
/// vector operations.
@@ -107,10 +111,8 @@ private:
const Instruction &I,
ExtractElementInst *&ConvertToShuffle,
unsigned PreferredExtractIndex);
- void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
- Instruction &I);
- void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
- Instruction &I);
+ Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
+ Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
bool foldExtractExtract(Instruction &I);
bool foldInsExtFNeg(Instruction &I);
bool foldInsExtBinop(Instruction &I);
@@ -137,8 +139,10 @@ private:
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
bool shrinkType(Instruction &I);
+ bool shrinkLoadForShuffles(Instruction &I);
+ bool shrinkPhiOfShuffles(Instruction &I);
- void replaceValue(Value &Old, Value &New) {
+ void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
LLVM_DEBUG(dbgs() << " With: " << New << '\n');
Old.replaceAllUsesWith(&New);
@@ -147,7 +151,11 @@ private:
Worklist.pushUsersToWorkList(*NewI);
Worklist.pushValue(NewI);
}
- Worklist.pushValue(&Old);
+ if (Erase && isInstructionTriviallyDead(&Old)) {
+ eraseInstruction(Old);
+ } else {
+ Worklist.push(&Old);
+ }
}
void eraseInstruction(Instruction &I) {
@@ -158,11 +166,23 @@ private:
// Push remaining users of the operands and then the operand itself - allows
// further folds that were hindered by OneUse limits.
- for (Value *Op : Ops)
- if (auto *OpI = dyn_cast<Instruction>(Op)) {
- Worklist.pushUsersToWorkList(*OpI);
- Worklist.pushValue(OpI);
+ SmallPtrSet<Value *, 4> Visited;
+ for (Value *Op : Ops) {
+ if (Visited.insert(Op).second) {
+ if (auto *OpI = dyn_cast<Instruction>(Op)) {
+ if (RecursivelyDeleteTriviallyDeadInstructions(
+ OpI, nullptr, nullptr, [this](Value *V) {
+ if (auto I = dyn_cast<Instruction>(V)) {
+ LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
+ Worklist.remove(I);
+ }
+ }))
+ continue;
+ Worklist.pushUsersToWorkList(*OpI);
+ Worklist.pushValue(OpI);
+ }
}
+ }
}
};
} // namespace
@@ -546,9 +566,8 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
/// the source vector (shift the scalar element) to a NewIndex for extraction.
/// Return null if the input can be constant folded, so that we are not creating
/// unnecessary instructions.
-static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
- unsigned NewIndex,
- IRBuilderBase &Builder) {
+static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
+ IRBuilderBase &Builder) {
// Shufflevectors can only be created for fixed-width vectors.
Value *X = ExtElt->getVectorOperand();
if (!isa<FixedVectorType>(X->getType()))
@@ -563,52 +582,43 @@ static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
NewIndex, Builder);
- return dyn_cast<ExtractElementInst>(
- Builder.CreateExtractElement(Shuf, NewIndex));
+ return Shuf;
}
/// Try to reduce extract element costs by converting scalar compares to vector
/// compares followed by extract.
-/// cmp (ext0 V0, C), (ext1 V1, C)
-void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
- ExtractElementInst *Ext1, Instruction &I) {
+/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
+Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
+ Instruction &I) {
assert(isa<CmpInst>(&I) && "Expected a compare");
- assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
- cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
- "Expected matching constant extract indexes");
- // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
+ // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
+ // --> extelt (cmp Pred V0, V1), ExtIndex
++NumVecCmp;
CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
- Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
- Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
- replaceValue(I, *NewExt);
+ return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
}
/// Try to reduce extract element costs by converting scalar binops to vector
/// binops followed by extract.
-/// bo (ext0 V0, C), (ext1 V1, C)
-void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
- ExtractElementInst *Ext1, Instruction &I) {
+/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
+Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
+ Instruction &I) {
assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
- assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
- cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
- "Expected matching constant extract indexes");
- // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
+ // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
+ // --> extelt (bo V0, V1), ExtIndex
++NumVecBO;
- Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
- Value *VecBO =
- Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);
+ Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
+ V1, "foldExtExtBinop");
// All IR flags are safe to back-propagate because any potential poison
// created in unused vector elements is discarded by the extract.
if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
VecBOInst->copyIRFlags(&I);
- Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
- replaceValue(I, *NewExt);
+ return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
}
/// Match an instruction with extracted vector operands.
@@ -647,25 +657,29 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
return false;
+ Value *ExtOp0 = Ext0->getVectorOperand();
+ Value *ExtOp1 = Ext1->getVectorOperand();
+
if (ExtractToChange) {
unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
- ExtractElementInst *NewExtract =
+ Value *NewExtOp =
translateExtract(ExtractToChange, CheapExtractIdx, Builder);
- if (!NewExtract)
+ if (!NewExtOp)
return false;
if (ExtractToChange == Ext0)
- Ext0 = NewExtract;
+ ExtOp0 = NewExtOp;
else
- Ext1 = NewExtract;
+ ExtOp1 = NewExtOp;
}
- if (Pred != CmpInst::BAD_ICMP_PREDICATE)
- foldExtExtCmp(Ext0, Ext1, I);
- else
- foldExtExtBinop(Ext0, Ext1, I);
-
+ Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
+ : Ext0->getIndexOperand();
+ Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
+ ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
+ : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
Worklist.push(Ext0);
Worklist.push(Ext1);
+ replaceValue(I, *NewExt);
return true;
}
@@ -1232,17 +1246,18 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
// Fold the vector constants in the original vectors into a new base vector to
// get more accurate cost modelling.
Value *NewVecC = nullptr;
- TargetFolder Folder(*DL);
if (CI)
- NewVecC = Folder.FoldCmp(CI->getPredicate(), VecCs[0], VecCs[1]);
+ NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
else if (UO)
NewVecC =
- Folder.FoldUnOpFMF(UO->getOpcode(), VecCs[0], UO->getFastMathFlags());
+ simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
else if (BO)
- NewVecC = Folder.FoldBinOp(BO->getOpcode(), VecCs[0], VecCs[1]);
- else if (II->arg_size() == 2)
- NewVecC = Folder.FoldBinaryIntrinsic(II->getIntrinsicID(), VecCs[0],
- VecCs[1], II->getType(), &I);
+ NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
+ else if (II)
+ NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
+
+ if (!NewVecC)
+ return false;
// Get cost estimate for the insert element. This cost will factor into
// both sequences.
@@ -1250,6 +1265,7 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
InstructionCost NewCost =
ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
CostKind, *Index, NewVecC);
+
for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
II->getIntrinsicID(), Idx, &TTI)))
@@ -1294,15 +1310,6 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
ScalarInst->copyIRFlags(&I);
- // Create a new base vector if the constant folding failed.
- if (!NewVecC) {
- if (CI)
- NewVecC = Builder.CreateCmp(CI->getPredicate(), VecCs[0], VecCs[1]);
- else if (UO || BO)
- NewVecC = Builder.CreateNAryOp(Opcode, VecCs);
- else
- NewVecC = Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), VecCs);
- }
Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
replaceValue(I, *Insert);
return true;
@@ -1790,7 +1797,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
ScalarizedCost +=
TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
Align(1), LI->getPointerAddressSpace(), CostKind);
- ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType());
+ ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
+ nullptr, nullptr, CostKind);
}
LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I
@@ -1804,6 +1812,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
// erased in the correct order.
Worklist.push(LI);
+ Type *ElemType = VecTy->getElementType();
+
// Replace extracts with narrow scalar loads.
for (User *U : LI->users()) {
auto *EI = cast<ExtractElementInst>(U);
@@ -1817,14 +1827,20 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
Builder.SetInsertPoint(EI);
Value *GEP =
Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
- auto *NewLoad = cast<LoadInst>(Builder.CreateLoad(
- VecTy->getElementType(), GEP, EI->getName() + ".scalar"));
+ auto *NewLoad = cast<LoadInst>(
+ Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
- Align ScalarOpAlignment = computeAlignmentAfterScalarization(
- LI->getAlign(), VecTy->getElementType(), Idx, *DL);
+ Align ScalarOpAlignment =
+ computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
NewLoad->setAlignment(ScalarOpAlignment);
- replaceValue(*EI, *NewLoad);
+ if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
+ size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
+ AAMDNodes OldAAMD = LI->getAAMetadata();
+ NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
+ }
+
+ replaceValue(*EI, *NewLoad, false);
}
FailureGuard.release();
@@ -1856,15 +1872,15 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) {
unsigned ExtCnt = 0;
bool ExtLane0 = false;
for (User *U : Ext->users()) {
- const APInt *Idx;
- if (!match(U, m_ExtractElt(m_Value(), m_APInt(Idx))))
+ uint64_t Idx;
+ if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
return false;
if (cast<Instruction>(U)->use_empty())
continue;
ExtCnt += 1;
- ExtLane0 |= Idx->isZero();
+ ExtLane0 |= !Idx;
VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
- CostKind, Idx->getZExtValue(), U);
+ CostKind, Idx, U);
}
InstructionCost ScalarCost =
@@ -2910,7 +2926,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if (!IL.first)
return true;
Value *V = IL.first->get();
- if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUse())
+ if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
return false;
if (V->getValueID() != FrontV->getValueID())
return false;
@@ -3112,7 +3128,7 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
replaceValue(*Shuffle, *NewShuffle);
- MadeChanges = true;
+ return true;
}
// See if we can re-use foldSelectShuffle, getting it to reduce the size of
@@ -3608,7 +3624,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
Builder.SetInsertPoint(Shuffles[S]);
Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
- replaceValue(*Shuffles[S], *NSV);
+ replaceValue(*Shuffles[S], *NSV, false);
}
Worklist.pushValue(NSV0A);
@@ -3861,6 +3877,228 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
return true;
}
+// Attempt to shrink loads that are only used by shufflevector instructions.
+bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
+ auto *OldLoad = dyn_cast<LoadInst>(&I);
+ if (!OldLoad || !OldLoad->isSimple())
+ return false;
+
+ auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
+ if (!OldLoadTy)
+ return false;
+
+ unsigned const OldNumElements = OldLoadTy->getNumElements();
+
+ // Search all uses of the load. If all uses are shufflevector instructions
+ // whose second operands are undef or poison, find the minimum and maximum
+ // indices of the vector elements referenced by all shuffle masks. Otherwise
+ // return `std::nullopt`.
+ using IndexRange = std::pair<int, int>;
+ auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
+ IndexRange OutputRange = IndexRange(OldNumElements, -1);
+ for (llvm::Use &Use : I.uses()) {
+ // Ensure all uses match the required pattern.
+ User *Shuffle = Use.getUser();
+ ArrayRef<int> Mask;
+
+ if (!match(Shuffle,
+ m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
+ return std::nullopt;
+
+ // Ignore shufflevector instructions that have no uses.
+ if (Shuffle->use_empty())
+ continue;
+
+ // Find the min and max indices used by the shufflevector instruction.
+ for (int Index : Mask) {
+ if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
+ OutputRange.first = std::min(Index, OutputRange.first);
+ OutputRange.second = std::max(Index, OutputRange.second);
+ }
+ }
+ }
+
+ if (OutputRange.second < OutputRange.first)
+ return std::nullopt;
+
+ return OutputRange;
+ };
+
+ // Get the range of vector elements used by shufflevector instructions.
+ if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
+ unsigned const NewNumElements = Indices->second + 1u;
+
+ // If the range of vector elements is smaller than the full load, attempt
+ // to create a smaller load.
+ if (NewNumElements < OldNumElements) {
+ IRBuilder Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ // Calculate costs of old and new ops.
+ Type *ElemTy = OldLoadTy->getElementType();
+ FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
+ Value *PtrOp = OldLoad->getPointerOperand();
+
+ InstructionCost OldCost = TTI.getMemoryOpCost(
+ Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
+ OldLoad->getPointerAddressSpace(), CostKind);
+ InstructionCost NewCost =
+ TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
+ OldLoad->getPointerAddressSpace(), CostKind);
+
+ using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
+ SmallVector<UseEntry, 4u> NewUses;
+ unsigned const MaxIndex = NewNumElements * 2u;
+
+ for (llvm::Use &Use : I.uses()) {
+ auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
+ ArrayRef<int> OldMask = Shuffle->getShuffleMask();
+
+ // Create entry for new use.
+ NewUses.push_back({Shuffle, OldMask});
+
+ // Validate mask indices.
+ for (int Index : OldMask) {
+ if (Index >= static_cast<int>(MaxIndex))
+ return false;
+ }
+
+ // Update costs.
+ OldCost +=
+ TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
+ OldLoadTy, OldMask, CostKind);
+ NewCost +=
+ TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
+ NewLoadTy, OldMask, CostKind);
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Found a load used only by shufflevector instructions: "
+ << I << "\n OldCost: " << OldCost
+ << " vs NewCost: " << NewCost << "\n");
+
+ if (OldCost < NewCost || !NewCost.isValid())
+ return false;
+
+ // Create new load of smaller vector.
+ auto *NewLoad = cast<LoadInst>(
+ Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
+ NewLoad->copyMetadata(I);
+
+ // Replace all uses.
+ for (UseEntry &Use : NewUses) {
+ ShuffleVectorInst *Shuffle = Use.first;
+ std::vector<int> &NewMask = Use.second;
+
+ Builder.SetInsertPoint(Shuffle);
+ Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
+ Value *NewShuffle = Builder.CreateShuffleVector(
+ NewLoad, PoisonValue::get(NewLoadTy), NewMask);
+
+ replaceValue(*Shuffle, *NewShuffle, false);
+ }
+
+ return true;
+ }
+ }
+ return false;
+}
+
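
A self-contained sketch of the index-range computation that drives shrinkLoadForShuffles above (plain C++; the masks and vector widths are illustrative): scan every shuffle mask that uses the load, track the highest referenced source lane, and narrow the load to max + 1 elements when that is smaller than the original width.

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  const int OldNumElements = 8;
  // Two shuffle masks using only the first few lanes of the loaded vector
  // (-1 denotes a poison/undef lane).
  std::vector<std::vector<int>> Masks = {{0, 1, 2, -1}, {2, 3, 0, 1}};

  int MinIdx = OldNumElements, MaxIdx = -1;
  for (const auto &Mask : Masks)
    for (int Index : Mask)
      if (Index >= 0 && Index < OldNumElements) {
        MinIdx = std::min(MinIdx, Index);
        MaxIdx = std::max(MaxIdx, Index);
      }

  assert(MaxIdx >= MinIdx && "no in-range lanes used");
  int NewNumElements = MaxIdx + 1; // 4: only a <4 x ...> load is needed
  assert(NewNumElements < OldNumElements);
  return 0;
}
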
+// Attempt to narrow a phi of shufflevector instructions where the two incoming
+// values have the same operands but different masks. If the two shuffle masks
+// are constant offsets of one another, we can rotate the incoming vector on
+// one branch and perform one larger shuffle after the phi.
+bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
+ auto *Phi = dyn_cast<PHINode>(&I);
+ if (!Phi || Phi->getNumIncomingValues() != 2u)
+ return false;
+
+ Value *Op = nullptr;
+ ArrayRef<int> Mask0;
+ ArrayRef<int> Mask1;
+
+ if (!match(Phi->getOperand(0u),
+ m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
+ !match(Phi->getOperand(1u),
+ m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
+ return false;
+
+ auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
+
+ // Ensure result vectors are wider than the argument vector.
+ auto *InputVT = cast<FixedVectorType>(Op->getType());
+ auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
+ auto const InputNumElements = InputVT->getNumElements();
+
+ if (InputNumElements >= ResultVT->getNumElements())
+ return false;
+
+ // Take the difference of the two shuffle masks at each index. Ignore poison
+ // values at the same index in both masks.
+ SmallVector<int, 16> NewMask;
+ NewMask.reserve(Mask0.size());
+
+ for (auto [M0, M1] : zip(Mask0, Mask1)) {
+ if (M0 >= 0 && M1 >= 0)
+ NewMask.push_back(M0 - M1);
+ else if (M0 == -1 && M1 == -1)
+ continue;
+ else
+ return false;
+ }
+
+ // Ensure all elements of the new mask are equal. If the difference between
+ // the incoming mask elements is the same, the two must be constant offsets
+ // of one another.
+ if (NewMask.empty() || !all_equal(NewMask))
+ return false;
+
+ // Create new mask using difference of the two incoming masks.
+ int MaskOffset = NewMask[0u];
+ unsigned Index = (InputNumElements - MaskOffset) % InputNumElements;
+ NewMask.clear();
+
+ for (unsigned I = 0u; I < InputNumElements; ++I) {
+ NewMask.push_back(Index);
+ Index = (Index + 1u) % InputNumElements;
+ }
+
+ // Calculate costs for worst cases and compare.
+ auto const Kind = TTI::SK_PermuteSingleSrc;
+ auto OldCost =
+ std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
+ TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
+ auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
+ TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
+
+ LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
+ << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
+ << "\n");
+
+ if (NewCost > OldCost)
+ return false;
+
+ // Create new shuffles and narrowed phi.
+ auto Builder = IRBuilder(Shuf);
+ Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
+ auto *PoisonVal = PoisonValue::get(InputVT);
+ auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
+ Worklist.push(cast<Instruction>(NewShuf0));
+
+ Builder.SetInsertPoint(Phi);
+ Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
+ auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
+ NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
+ NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
+
+ Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
+ PoisonVal = PoisonValue::get(NewPhi->getType());
+ auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
+
+ replaceValue(*Phi, *NewShuf1);
+ return true;
+}
+
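
A self-contained worked example of the mask arithmetic in shrinkPhiOfShuffles above (plain C++; the element count and masks are illustrative): the two incoming masks differ by a constant offset, so one incoming value is rotated by a cyclic shuffle and a single wider shuffle with Mask1 is applied after the phi.

#include <cassert>
#include <vector>

int main() {
  const unsigned N = 4; // InputNumElements
  // Result masks are wider than the input vector; -1 marks poison lanes that
  // appear at the same index in both masks and are ignored.
  std::vector<int> Mask0 = {2, 3, 2, 3, -1, -1, 2, 3};
  std::vector<int> Mask1 = {0, 1, 0, 1, -1, -1, 0, 1};

  // Element-wise difference; here it is 2 at every defined index.
  int MaskOffset = Mask0[0] - Mask1[0];

  // Rotation mask starting at (N - MaskOffset) % N, wrapping around.
  std::vector<int> RotMask;
  unsigned Index = (N - MaskOffset) % N; // 2
  for (unsigned I = 0; I != N; ++I) {
    RotMask.push_back(Index);
    Index = (Index + 1) % N;
  }
  // RotMask == {2, 3, 0, 1}: for this example, applying Mask1 to the rotated
  // vector selects the same input lanes as applying Mask0 to the original.
  assert(RotMask == (std::vector<int>{2, 3, 0, 1}));

  // Check one result lane: Mask1[0] picks rotated lane 0, which holds original
  // lane RotMask[0] == 2 == Mask0[0].
  assert(RotMask[Mask1[0]] == Mask0[0]);
  return 0;
}
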
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@@ -3873,8 +4111,7 @@ bool VectorCombine::run() {
LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
- bool MadeChange = false;
- auto FoldInst = [this, &MadeChange](Instruction &I) {
+ auto FoldInst = [this](Instruction &I) {
Builder.SetInsertPoint(&I);
bool IsVectorType = isa<VectorType>(I.getType());
bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
@@ -3889,10 +4126,12 @@ bool VectorCombine::run() {
if (IsFixedVectorType) {
switch (Opcode) {
case Instruction::InsertElement:
- MadeChange |= vectorizeLoadInsert(I);
+ if (vectorizeLoadInsert(I))
+ return true;
break;
case Instruction::ShuffleVector:
- MadeChange |= widenSubvectorLoad(I);
+ if (widenSubvectorLoad(I))
+ return true;
break;
default:
break;
@@ -3902,19 +4141,25 @@ bool VectorCombine::run() {
// This transform works with scalable and fixed vectors
// TODO: Identify and allow other scalable transforms
if (IsVectorType) {
- MadeChange |= scalarizeOpOrCmp(I);
- MadeChange |= scalarizeLoadExtract(I);
- MadeChange |= scalarizeExtExtract(I);
- MadeChange |= scalarizeVPIntrinsic(I);
- MadeChange |= foldInterleaveIntrinsics(I);
+ if (scalarizeOpOrCmp(I))
+ return true;
+ if (scalarizeLoadExtract(I))
+ return true;
+ if (scalarizeExtExtract(I))
+ return true;
+ if (scalarizeVPIntrinsic(I))
+ return true;
+ if (foldInterleaveIntrinsics(I))
+ return true;
}
if (Opcode == Instruction::Store)
- MadeChange |= foldSingleElementStore(I);
+ if (foldSingleElementStore(I))
+ return true;
// If this is an early pipeline invocation of this pass, we are done.
if (TryEarlyFoldsOnly)
- return;
+ return false;
// Otherwise, try folds that improve codegen but may interfere with
// early IR canonicalizations.
@@ -3923,56 +4168,87 @@ bool VectorCombine::run() {
if (IsFixedVectorType) {
switch (Opcode) {
case Instruction::InsertElement:
- MadeChange |= foldInsExtFNeg(I);
- MadeChange |= foldInsExtBinop(I);
- MadeChange |= foldInsExtVectorToShuffle(I);
+ if (foldInsExtFNeg(I))
+ return true;
+ if (foldInsExtBinop(I))
+ return true;
+ if (foldInsExtVectorToShuffle(I))
+ return true;
break;
case Instruction::ShuffleVector:
- MadeChange |= foldPermuteOfBinops(I);
- MadeChange |= foldShuffleOfBinops(I);
- MadeChange |= foldShuffleOfSelects(I);
- MadeChange |= foldShuffleOfCastops(I);
- MadeChange |= foldShuffleOfShuffles(I);
- MadeChange |= foldShuffleOfIntrinsics(I);
- MadeChange |= foldSelectShuffle(I);
- MadeChange |= foldShuffleToIdentity(I);
+ if (foldPermuteOfBinops(I))
+ return true;
+ if (foldShuffleOfBinops(I))
+ return true;
+ if (foldShuffleOfSelects(I))
+ return true;
+ if (foldShuffleOfCastops(I))
+ return true;
+ if (foldShuffleOfShuffles(I))
+ return true;
+ if (foldShuffleOfIntrinsics(I))
+ return true;
+ if (foldSelectShuffle(I))
+ return true;
+ if (foldShuffleToIdentity(I))
+ return true;
+ break;
+ case Instruction::Load:
+ if (shrinkLoadForShuffles(I))
+ return true;
break;
case Instruction::BitCast:
- MadeChange |= foldBitcastShuffle(I);
+ if (foldBitcastShuffle(I))
+ return true;
break;
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- MadeChange |= foldBitOpOfCastops(I);
+ if (foldBitOpOfCastops(I))
+ return true;
+ break;
+ case Instruction::PHI:
+ if (shrinkPhiOfShuffles(I))
+ return true;
break;
default:
- MadeChange |= shrinkType(I);
+ if (shrinkType(I))
+ return true;
break;
}
} else {
switch (Opcode) {
case Instruction::Call:
- MadeChange |= foldShuffleFromReductions(I);
- MadeChange |= foldCastFromReductions(I);
+ if (foldShuffleFromReductions(I))
+ return true;
+ if (foldCastFromReductions(I))
+ return true;
break;
case Instruction::ICmp:
case Instruction::FCmp:
- MadeChange |= foldExtractExtract(I);
+ if (foldExtractExtract(I))
+ return true;
break;
case Instruction::Or:
- MadeChange |= foldConcatOfBoolMasks(I);
+ if (foldConcatOfBoolMasks(I))
+ return true;
[[fallthrough]];
default:
if (Instruction::isBinaryOp(Opcode)) {
- MadeChange |= foldExtractExtract(I);
- MadeChange |= foldExtractedCmps(I);
- MadeChange |= foldBinopOfReductions(I);
+ if (foldExtractExtract(I))
+ return true;
+ if (foldExtractedCmps(I))
+ return true;
+ if (foldBinopOfReductions(I))
+ return true;
}
break;
}
}
+ return false;
};
+ bool MadeChange = false;
for (BasicBlock &BB : F) {
// Ignore unreachable basic blocks.
if (!DT.isReachableFromEntry(&BB))
@@ -3981,7 +4257,7 @@ bool VectorCombine::run() {
for (Instruction &I : make_early_inc_range(BB)) {
if (I.isDebugOrPseudoInst())
continue;
- FoldInst(I);
+ MadeChange |= FoldInst(I);
}
}
@@ -3995,7 +4271,7 @@ bool VectorCombine::run() {
continue;
}
- FoldInst(*I);
+ MadeChange |= FoldInst(*I);
}
return MadeChange;