Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp21
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp12
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h4
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrFormats.td21
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.h6
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp142
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp23
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp7
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td18
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td19
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp57
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp92
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h14
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp87
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp43
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp173
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h8
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp69
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h13
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h11
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp91
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td59
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td17
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td25
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td12
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td95
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td263
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td28
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp34
-rw-r--r--llvm/lib/Target/ARM/ARMTargetMachine.cpp11
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp18
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h3
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp10
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp11
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h3
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp29
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp19
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h3
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp3
-rw-r--r--llvm/lib/Target/DirectX/DXContainerGlobals.cpp2
-rw-r--r--llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp15
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp4
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp14
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td27
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp49
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h3
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp11
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp15
-rw-r--r--llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp7
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp10
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h3
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp9
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp2
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp12
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.cpp3
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td38
-rw-r--r--llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp20
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp7
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp10
-rw-r--r--llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp20
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp9
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.h2
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp51
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h3
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp5
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h21
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp19
-rw-r--r--llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp5
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp196
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h6
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td7
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td55
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZb.td14
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td12
-rw-r--r--llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp93
-rw-r--r--llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h6
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetMachine.cpp6
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp10
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h4
-rw-r--r--llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp25
-rw-r--r--llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp3
-rw-r--r--llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp2
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp65
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp6
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp5
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp13
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp11
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp13
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp11
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp29
-rw-r--r--llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp11
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp21
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp14
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp8
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp15
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp2
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp72
-rw-r--r--llvm/lib/Target/X86/X86ISelLoweringCall.cpp47
-rw-r--r--llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp11
135 files changed, 1915 insertions, 1015 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index e8d3161..ad8368e 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -597,6 +597,14 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
return Thunk;
}
+std::optional<std::string> getArm64ECMangledFunctionName(GlobalValue &GV) {
+ if (!GV.hasName()) {
+ GV.setName("__unnamed");
+ }
+
+ return llvm::getArm64ECMangledFunctionName(GV.getName());
+}
+
// Builds the "guest exit thunk", a helper to call a function which may or may
// not be an exit thunk. (We optimistically assume non-dllimport function
// declarations refer to functions defined in AArch64 code; if the linker
@@ -608,7 +616,7 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) {
getThunkType(F->getFunctionType(), F->getAttributes(),
Arm64ECThunkType::GuestExit, NullThunkName, Arm64Ty, X64Ty,
ArgTranslations);
- auto MangledName = getArm64ECMangledFunctionName(F->getName().str());
+ auto MangledName = getArm64ECMangledFunctionName(*F);
assert(MangledName && "Can't guest exit to function that's already native");
std::string ThunkName = *MangledName;
if (ThunkName[0] == '?' && ThunkName.find("@") != std::string::npos) {
@@ -727,9 +735,6 @@ AArch64Arm64ECCallLowering::buildPatchableThunk(GlobalAlias *UnmangledAlias,
// Lower an indirect call with inline code.
void AArch64Arm64ECCallLowering::lowerCall(CallBase *CB) {
- assert(CB->getModule()->getTargetTriple().isOSWindows() &&
- "Only applicable for Windows targets");
-
IRBuilder<> B(CB);
Value *CalledOperand = CB->getCalledOperand();
@@ -790,7 +795,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
if (!F)
continue;
if (std::optional<std::string> MangledName =
- getArm64ECMangledFunctionName(A.getName().str())) {
+ getArm64ECMangledFunctionName(A)) {
F->addMetadata("arm64ec_unmangled_name",
*MDNode::get(M->getContext(),
MDString::get(M->getContext(), A.getName())));
@@ -807,7 +812,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
cast<GlobalValue>(F.getPersonalityFn()->stripPointerCasts());
if (PersFn->getValueType() && PersFn->getValueType()->isFunctionTy()) {
if (std::optional<std::string> MangledName =
- getArm64ECMangledFunctionName(PersFn->getName().str())) {
+ getArm64ECMangledFunctionName(*PersFn)) {
PersFn->setName(MangledName.value());
}
}
@@ -821,7 +826,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
// Rename hybrid patchable functions and change callers to use a global
// alias instead.
if (std::optional<std::string> MangledName =
- getArm64ECMangledFunctionName(F.getName().str())) {
+ getArm64ECMangledFunctionName(F)) {
std::string OrigName(F.getName());
F.setName(MangledName.value() + HybridPatchableTargetSuffix);
@@ -927,7 +932,7 @@ bool AArch64Arm64ECCallLowering::processFunction(
// FIXME: Handle functions with weak linkage?
if (!F.hasLocalLinkage() || F.hasAddressTaken()) {
if (std::optional<std::string> MangledName =
- getArm64ECMangledFunctionName(F.getName().str())) {
+ getArm64ECMangledFunctionName(F)) {
F.addMetadata("arm64ec_unmangled_name",
*MDNode::get(M->getContext(),
MDString::get(M->getContext(), F.getName())));
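The new overload at the top of this diff assigns a stable placeholder name to anonymous globals before mangling, which lets every caller in the file pass the GlobalValue itself instead of repeating `.getName().str()`. A minimal standalone sketch of the same pattern, using a hypothetical `Global` type and a placeholder mangling rule that is not ARM64EC's real scheme:

#include <optional>
#include <string>

// Hypothetical stand-ins for llvm::GlobalValue and the string-based mangler;
// the "#" prefix rule below is a placeholder, not ARM64EC's real mangling.
struct Global {
  std::string Name;
  bool hasName() const { return !Name.empty(); }
  void setName(const std::string &N) { Name = N; }
  const std::string &getName() const { return Name; }
};

std::optional<std::string> mangleArm64EC(const std::string &Name) {
  if (!Name.empty() && Name[0] == '#')
    return std::nullopt; // treat as already mangled
  return "#" + Name;
}

// Same shape as the new overload: name unnamed globals, then mangle by name.
std::optional<std::string> mangleArm64EC(Global &GV) {
  if (!GV.hasName())
    GV.setName("__unnamed");
  return mangleArm64EC(GV.getName());
}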
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2b6ea86..018c16d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -28609,14 +28609,16 @@ Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
- if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
+ RTLIB::LibcallImpl SecurityCheckCookieLibcall =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported) {
// MSVC CRT has a global variable holding security cookie.
M.getOrInsertGlobal("__security_cookie",
PointerType::getUnqual(M.getContext()));
// MSVC CRT has a function to validate security cookie.
FunctionCallee SecurityCheckCookie =
- M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
+ M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
Type::getVoidTy(M.getContext()),
PointerType::getUnqual(M.getContext()));
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
@@ -28637,8 +28639,10 @@ Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
- if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
- return M.getFunction(Subtarget->getSecurityCheckCookieName());
+ RTLIB::LibcallImpl SecurityCheckCookieLibcall =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported)
+ return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall));
return TargetLowering::getSSPStackGuardCheck(M);
}
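The change above swaps a hard-coded triple check for a runtime-library query: if RTLIB::SECURITY_CHECK_COOKIE maps to an implementation, the MSVC-style cookie check is available and its symbol name (plain or Arm64EC-mangled) comes from the libcall table rather than from the subtarget. A rough standalone model of that lookup, with illustrative enum values and helper names that are not LLVM's:

#include <cstdio>

// Illustrative stand-ins for the RTLIB libcall tables.
enum class LibcallImpl { Unsupported, SecurityCheckCookie, SecurityCheckCookieArm64EC };

LibcallImpl getLibcallImpl(bool IsWindowsMSVC, bool IsArm64EC) {
  if (!IsWindowsMSVC)
    return LibcallImpl::Unsupported;
  return IsArm64EC ? LibcallImpl::SecurityCheckCookieArm64EC
                   : LibcallImpl::SecurityCheckCookie;
}

const char *getLibcallImplName(LibcallImpl Impl) {
  switch (Impl) {
  case LibcallImpl::SecurityCheckCookie:        return "__security_check_cookie";
  case LibcallImpl::SecurityCheckCookieArm64EC: return "#__security_check_cookie_arm64ec";
  default:                                      return nullptr;
  }
}

int main() {
  LibcallImpl Impl = getLibcallImpl(/*IsWindowsMSVC=*/true, /*IsArm64EC=*/true);
  if (Impl != LibcallImpl::Unsupported)
    std::printf("cookie check: %s\n", getLibcallImplName(Impl));
}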
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ea63edd8..8887657 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -887,6 +887,10 @@ private:
bool shouldScalarizeBinop(SDValue VecOp) const override {
return VecOp.getOpcode() == ISD::SETCC;
}
+
+ bool hasMultipleConditionRegisters(EVT VT) const override {
+ return VT.isScalableVector();
+ }
};
namespace AArch64 {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index ba7cbcc..5a537f2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6484,7 +6484,9 @@ class BaseSIMDThreeSameVectorDot<bit Q, bit U, bits<2> sz, bits<4> opc, string a
(OpNode (AccumType RegType:$Rd),
(InputType RegType:$Rn),
(InputType RegType:$Rm)))]> {
- let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
+
+ let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 #
+ "|" # kind1 # "\t$Rd, $Rn, $Rm}");
}
multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> {
@@ -6507,7 +6509,8 @@ class BaseSIMDThreeSameVectorFML<bit Q, bit U, bit b13, bits<3> size, string asm
(OpNode (AccumType RegType:$Rd),
(InputType RegType:$Rn),
(InputType RegType:$Rm)))]> {
- let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
+ let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 #
+ "|" # kind1 # "\t$Rd, $Rn, $Rm}");
let Inst{13} = b13;
}
@@ -8986,7 +8989,8 @@ class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,
(InputType RegType:$Rm)))]> {
let AsmString = !strconcat(asm,
"{\t$Rd" # kind1 # ", $Rn" # kind2 #
- ", $Rm" # kind2 # "}");
+ ", $Rm" # kind2 #
+ "|" # kind1 # "\t$Rd, $Rn, $Rm}");
}
multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
@@ -9032,7 +9036,7 @@ class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
(v8bf16 V128:$Rn),
(v8bf16 V128:$Rm)))]> {
- let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
+ let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h|.4s\t$Rd, $Rn, $Rm}");
}
let mayRaiseFPException = 1, Uses = [FPCR] in
@@ -9071,8 +9075,7 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
(int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
(v8bf16 V128:$Rn),
(v8bf16 V128:$Rm)))]> {
- let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
- ", $Rm", ".8h", "}");
+ let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h|.4s\t$Rd, $Rn, $Rm}");
}
let mayRaiseFPException = 1, Uses = [FPCR] in
@@ -9143,7 +9146,7 @@ class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNo
[(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]> {
- let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}";
+ let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b|.4s\t$Rd, $Rn, $Rm}";
}
//----------------------------------------------------------------------------
@@ -13344,8 +13347,8 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> {
class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind>
: BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101,
V128, asm, ".16b", []> {
- let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn", ".16b",
- ", $Rm", ".16b", "}");
+ let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b",
+ "|", kind, "\t$Rd, $Rn, $Rm}");
}
multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{
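The recurring AsmString edits in this file add a second syntax to each `{...|...}` group: the text before `|` is emitted for asm variant 0 and the text after it for variant 1, moving the `.4s`/`.8h`-style arrangement suffix from the registers onto the mnemonic. A rough standalone sketch of how such a template resolves for a given variant index (a simplified model; the real selection happens inside LLVM's asm printer, and nesting/escapes are ignored):

#include <cassert>
#include <iostream>
#include <string>

// Resolve "{a|b}" groups in an asm template for one variant index.
// Assumes well-formed input: every '{' has a '}' and enough alternatives.
std::string resolveVariant(const std::string &Tmpl, unsigned Variant) {
  std::string Out;
  for (size_t I = 0; I < Tmpl.size();) {
    if (Tmpl[I] != '{') { Out += Tmpl[I++]; continue; }
    size_t End = Tmpl.find('}', I);
    assert(End != std::string::npos && "unterminated group");
    std::string Group = Tmpl.substr(I + 1, End - I - 1);
    // Pick the Variant'th '|'-separated alternative.
    size_t Begin = 0;
    for (unsigned V = 0; V < Variant; ++V)
      Begin = Group.find('|', Begin) + 1;
    size_t Stop = Group.find('|', Begin);
    Out += Group.substr(Begin, Stop == std::string::npos ? Stop : Stop - Begin);
    I = End + 1;
  }
  return Out;
}

int main() {
  std::string T = "bfmmla{\t$Rd.4s, $Rn.8h, $Rm.8h|.4s\t$Rd, $Rn, $Rm}";
  std::cout << resolveVariant(T, 0) << "\n"; // bfmmla  $Rd.4s, $Rn.8h, $Rm.8h
  std::cout << resolveVariant(T, 1) << "\n"; // bfmmla.4s  $Rd, $Rn, $Rm
}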
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 061ed61..d00e447 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -451,12 +451,6 @@ public:
return "__chkstk";
}
- const char* getSecurityCheckCookieName() const {
- if (isWindowsArm64EC())
- return "#__security_check_cookie_arm64ec";
- return "__security_check_cookie";
- }
-
/// Choose a method of checking LR before performing a tail call.
AArch64PAuth::AuthCheckMethod
getAuthenticatedLRCheckMethod(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e1adc0b..9f05add 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3092,6 +3092,13 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
return AdjustCost(
BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
+ // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
+ // we use fcvtx under SVE2. Give them invalid costs.
+ if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
+ ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
+ DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
+ return InstructionCost::getInvalid();
+
static const TypeConversionCostTblEntry BF16Tbl[] = {
{ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
{ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
@@ -3100,6 +3107,12 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
{ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
{ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
+ {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
+ {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
+ {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
+ {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
+ {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
+ {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
};
if (ST->hasBF16())
@@ -3508,11 +3521,21 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
{ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
+ // Truncate from nxvmf32 to nxvmbf16.
+ {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
+ {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
+ {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
+
// Truncate from nxvmf64 to nxvmf16.
{ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
{ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
{ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
+ // Truncate from nxvmf64 to nxvmbf16.
+ {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
+ {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
+ {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
+
// Truncate from nxvmf64 to nxvmf32.
{ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
{ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
@@ -3523,11 +3546,21 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
{ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
+ // Extend from nxvmbf16 to nxvmf32.
+ {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
+ {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
+ {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
+
// Extend from nxvmf16 to nxvmf64.
{ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
{ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
{ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
+ // Extend from nxvmbf16 to nxvmf64.
+ {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
+ {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
+ {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
+
// Extend from nxvmf32 to nxvmf64.
{ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
{ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
@@ -4282,10 +4315,9 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
TTI::OperandValueInfo Op2Info, const Instruction *I) const {
- int ISD = TLI->InstructionOpcodeToISD(Opcode);
// We don't lower some vector selects well that are wider than the register
// width. TODO: Improve this with different cost kinds.
- if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
+ if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
// We would need this many instructions to hide the scalarization happening.
const int AmortizationCost = 20;
@@ -4315,55 +4347,72 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
return LT.first;
}
- static const TypeConversionCostTblEntry
- VectorSelectTbl[] = {
- { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
- { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
- { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
- { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
- { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
- { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
- { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
- { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
- { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
- { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
- { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
- };
+ static const TypeConversionCostTblEntry VectorSelectTbl[] = {
+ {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
+ {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
+ {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
+ {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
+ {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
+ {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
+ {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
+ {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
+ {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
+ {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
+ {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
EVT SelCondTy = TLI->getValueType(DL, CondTy);
EVT SelValTy = TLI->getValueType(DL, ValTy);
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
- if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
+ if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
SelCondTy.getSimpleVT(),
SelValTy.getSimpleVT()))
return Entry->Cost;
}
}
- if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
- Type *ValScalarTy = ValTy->getScalarType();
- if ((ValScalarTy->isHalfTy() && !ST->hasFullFP16()) ||
- ValScalarTy->isBFloatTy()) {
- auto *ValVTy = cast<FixedVectorType>(ValTy);
-
- // Without dedicated instructions we promote [b]f16 compares to f32.
- auto *PromotedTy =
- VectorType::get(Type::getFloatTy(ValTy->getContext()), ValVTy);
-
- InstructionCost Cost = 0;
- // Promote operands to float vectors.
- Cost += 2 * getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy,
- TTI::CastContextHint::None, CostKind);
- // Compare float vectors.
+ if (Opcode == Instruction::FCmp) {
+ // Without dedicated instructions we promote f16 + bf16 compares to f32.
+ if ((!ST->hasFullFP16() && ValTy->getScalarType()->isHalfTy()) ||
+ ValTy->getScalarType()->isBFloatTy()) {
+ Type *PromotedTy =
+ ValTy->getWithNewType(Type::getFloatTy(ValTy->getContext()));
+ InstructionCost Cost =
+ getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy,
+ TTI::CastContextHint::None, CostKind);
+ if (!Op1Info.isConstant() && !Op2Info.isConstant())
+ Cost *= 2;
Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind,
Op1Info, Op2Info);
- // During codegen we'll truncate the vector result from i32 to i16.
- Cost +=
- getCastInstrCost(Instruction::Trunc, VectorType::getInteger(ValVTy),
- VectorType::getInteger(PromotedTy),
- TTI::CastContextHint::None, CostKind);
+ if (ValTy->isVectorTy())
+ Cost += getCastInstrCost(
+ Instruction::Trunc, VectorType::getInteger(cast<VectorType>(ValTy)),
+ VectorType::getInteger(cast<VectorType>(PromotedTy)),
+ TTI::CastContextHint::None, CostKind);
return Cost;
}
+
+ auto LT = getTypeLegalizationCost(ValTy);
+ // Model unknown fp compares as a libcall.
+ if (LT.second.getScalarType() != MVT::f64 &&
+ LT.second.getScalarType() != MVT::f32 &&
+ LT.second.getScalarType() != MVT::f16)
+ return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
+ {ValTy, ValTy}, CostKind);
+
+ // Some comparison operators require expanding to multiple compares + or.
+ unsigned Factor = 1;
+ if (!CondTy->isVectorTy() &&
+ (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
+ Factor = 2; // fcmp with 2 selects
+ else if (isa<FixedVectorType>(ValTy) &&
+ (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
+ VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
+ Factor = 3; // fcmxx+fcmyy+or
+ else if (isa<ScalableVectorType>(ValTy) &&
+ (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
+ Factor = 3; // fcmxx+fcmyy+or
+
+ return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
}
// Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
@@ -4371,7 +4420,7 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
// comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
// providing it will not cause performance regressions.
if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
- ISD == ISD::SETCC && I && !CmpInst::isUnsigned(VecPred) &&
+ Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
if (match(I->getOperand(1), m_Zero()))
@@ -6235,10 +6284,17 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
}
}
- auto ShouldSinkCondition = [](Value *Cond) -> bool {
+ auto ShouldSinkCondition = [](Value *Cond,
+ SmallVectorImpl<Use *> &Ops) -> bool {
+ if (!isa<IntrinsicInst>(Cond))
+ return false;
auto *II = dyn_cast<IntrinsicInst>(Cond);
- return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
- isa<ScalableVectorType>(II->getOperand(0)->getType());
+ if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
+ !isa<ScalableVectorType>(II->getOperand(0)->getType()))
+ return false;
+ if (isa<CmpInst>(II->getOperand(0)))
+ Ops.push_back(&II->getOperandUse(0));
+ return true;
};
switch (I->getOpcode()) {
@@ -6254,7 +6310,7 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
}
break;
case Instruction::Select: {
- if (!ShouldSinkCondition(I->getOperand(0)))
+ if (!ShouldSinkCondition(I->getOperand(0), Ops))
return false;
Ops.push_back(&I->getOperandUse(0));
@@ -6264,7 +6320,7 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
if (cast<BranchInst>(I)->isUnconditional())
return false;
- if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
+ if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
return false;
Ops.push_back(&I->getOperandUse(0));
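The rewritten FCmp path above prices predicates by how they legalize: ONE and UEQ on scalars become two compares combined with selects, while ONE/UEQ on any vector and ORD/UNO on fixed vectors become two compares plus an `or`. A compact standalone restatement of the factor selection, with an illustrative predicate enum rather than LLVM's:

#include <cstdio>

enum class Pred { OEQ, ONE, UEQ, ORD, UNO, OLT /* ... */ };

// Mirror of the Factor selection in getCmpSelInstrCost, simplified:
// scalar ONE/UEQ -> fcmp with 2 selects; fixed-vector ONE/UEQ/ORD/UNO and
// scalable ONE/UEQ -> fcmxx + fcmyy + or.
unsigned fcmpExpansionFactor(Pred P, bool IsVector, bool IsScalable) {
  bool NeedsTwo = (P == Pred::ONE || P == Pred::UEQ);
  if (!IsVector)
    return NeedsTwo ? 2 : 1;
  bool OrdUno = (P == Pred::ORD || P == Pred::UNO);
  if ((IsScalable && NeedsTwo) || (!IsScalable && (NeedsTwo || OrdUno)))
    return 3;
  return 1;
}

int main() {
  std::printf("scalar one: %u\n", fcmpExpansionFactor(Pred::ONE, false, false)); // 2
  std::printf("v4f32 uno:  %u\n", fcmpExpansionFactor(Pred::UNO, true, false));  // 3
}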
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 6912caf..7a2b679 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -79,8 +79,7 @@ public:
}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool fixupNeedsRelaxation(const MCFixup &Fixup,
uint64_t Value) const override;
@@ -421,9 +420,8 @@ static bool shouldForceRelocation(const MCFixup &Fixup) {
}
void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (shouldForceRelocation(Fixup))
IsResolved = false;
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
@@ -460,8 +458,8 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// Used to point to big endian bytes.
unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind());
@@ -471,15 +469,16 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
if (FulleSizeInBytes == 0) {
// Handle as little-endian
for (unsigned i = 0; i != NumBytes; ++i) {
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[i] |= uint8_t((Value >> (i * 8)) & 0xff);
}
} else {
// Handle as big-endian
- assert((Offset + FulleSizeInBytes) <= Data.size() && "Invalid fixup size!");
+ assert(Fixup.getOffset() + FulleSizeInBytes <= F.getSize() &&
+ "Invalid fixup size!");
assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!");
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = FulleSizeInBytes - 1 - i;
- Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
@@ -492,9 +491,9 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// If the immediate is negative, generate MOVN else MOVZ.
// (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ.
if (SignedValue < 0)
- Data[Offset + 3] &= ~(1 << 6);
+ Data[3] &= ~(1 << 6);
else
- Data[Offset + 3] |= (1 << 6);
+ Data[3] |= (1 << 6);
}
}
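After the signature change above, `Data` already points at the fixup's offset inside the fragment, so the byte loops index from zero. The little-endian case reduces to OR-ing the shifted value into consecutive bytes; a minimal standalone sketch of that loop:

#include <cstdint>
#include <cstdio>

// OR a fixup value into an instruction's bytes, little-endian.
// Data points at the fixup offset, as in the new applyFixup signature.
void applyFixupLE(uint8_t *Data, uint64_t Value, unsigned NumBytes) {
  for (unsigned I = 0; I != NumBytes; ++I)
    Data[I] |= uint8_t((Value >> (I * 8)) & 0xff);
}

int main() {
  uint8_t Inst[4] = {0x00, 0x00, 0x80, 0xd2};     // example instruction skeleton
  applyFixupLE(Inst, 0x2a << 5, 4);               // patch in an immediate field
  for (uint8_t B : Inst) std::printf("%02x ", B); // 40 05 80 d2
  std::printf("\n");
}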
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index 7618a57..45ac023 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -96,8 +96,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup,
case AArch64::S_TPREL:
case AArch64::S_TLSDESC:
case AArch64::S_TLSDESC_AUTH:
- if (auto *SA = Target.getAddSym())
- cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
+ if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
break;
default:
break;
@@ -488,7 +488,8 @@ bool AArch64ELFObjectWriter::needsRelocateWithSymbol(const MCValue &Val,
// this global needs to be tagged. In addition, the linker needs to know
// whether to emit a special addend when relocating `end` symbols, and this
// can only be determined by the attributes of the symbol itself.
- if (Val.getAddSym() && cast<MCSymbolELF>(Val.getAddSym())->isMemtag())
+ if (Val.getAddSym() &&
+ static_cast<const MCSymbolELF *>(Val.getAddSym())->isMemtag())
return true;
if ((Val.getSpecifier() & AArch64::S_GOT) == AArch64::S_GOT)
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 6257e99..14547e3 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -418,7 +418,8 @@ private:
}
MCSymbol *emitMappingSymbol(StringRef Name) {
- auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
+ auto *Symbol =
+ static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name));
emitLabel(Symbol);
return Symbol;
}
@@ -455,7 +456,7 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
void AArch64TargetELFStreamer::emitDirectiveVariantPCS(MCSymbol *Symbol) {
getStreamer().getAssembler().registerSymbol(*Symbol);
- cast<MCSymbolELF>(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS);
+ static_cast<MCSymbolELF *>(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS);
}
void AArch64TargetELFStreamer::finish() {
@@ -541,7 +542,7 @@ void AArch64TargetELFStreamer::finish() {
MCSectionELF *MemtagSec = nullptr;
for (const MCSymbol &Symbol : Asm.symbols()) {
- const auto &Sym = cast<MCSymbolELF>(Symbol);
+ auto &Sym = static_cast<const MCSymbolELF &>(Symbol);
if (Sym.isMemtag()) {
MemtagSec = Ctx.getELFSection(".memtag.globals.static",
ELF::SHT_AARCH64_MEMTAG_GLOBALS_STATIC, 0);
@@ -556,7 +557,7 @@ void AArch64TargetELFStreamer::finish() {
S.switchSection(MemtagSec);
const auto *Zero = MCConstantExpr::create(0, Ctx);
for (const MCSymbol &Symbol : Asm.symbols()) {
- const auto &Sym = cast<MCSymbolELF>(Symbol);
+ auto &Sym = static_cast<const MCSymbolELF &>(Symbol);
if (!Sym.isMemtag())
continue;
auto *SRE = MCSymbolRefExpr::create(&Sym, Ctx);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8a0c4ac..d84f512 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1160,6 +1160,12 @@ def FeatureTanhInsts : SubtargetFeature<"tanh-insts",
"Has v_tanh_f32/f16 instructions"
>;
+def FeatureTensorCvtLutInsts : SubtargetFeature<"tensor-cvt-lut-insts",
+ "HasTensorCvtLutInsts",
+ "true",
+ "Has v_perm_pk16* instructions"
+>;
+
def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
"HasTransposeLoadF4F6Insts",
"true",
@@ -1359,6 +1365,13 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
>;
+def FeatureGloballyAddressableScratch : SubtargetFeature<
+ "globally-addressable-scratch",
+ "HasGloballyAddressableScratch",
+ "true",
+ "FLAT instructions can access scratch memory for any thread in any wave"
+>;
+
// FIXME: Remove after all users are migrated to attribute.
def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
"DynamicVGPR",
@@ -2030,6 +2043,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
FeatureTanhInsts,
+ FeatureTensorCvtLutInsts,
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
@@ -2048,6 +2062,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureFlatBufferGlobalAtomicFaddF64Inst,
FeatureMemoryAtomicFAddF32DenormalSupport,
+ FeatureGloballyAddressableScratch,
FeatureKernargPreload,
FeatureVmemPrefInsts,
FeatureLshlAddU64Inst,
@@ -2785,6 +2800,9 @@ def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">,
AssemblerPredicate<(all_of FeatureTanhInsts)>;
+def HasTensorCvtLutInsts : Predicate<"Subtarget->hasTensorCvtLutInsts()">,
+ AssemblerPredicate<(all_of FeatureTensorCvtLutInsts)>;
+
def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 992572f..394a143 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -51,18 +51,6 @@ def gi_vop3pmodsdot :
GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">,
GIComplexPatternEquiv<VOP3PModsDOT>;
-def gi_vop3pmodsneg :
- GIComplexOperandMatcher<s32, "selectVOP3PModsNeg">,
- GIComplexPatternEquiv<VOP3PModsNeg>;
-
-def gi_vop3pmodsnegs :
- GIComplexOperandMatcher<s32, "selectVOP3PModsNegs">,
- GIComplexPatternEquiv<VOP3PModsNegs>;
-
-def gi_dotiuvop3pmodsnegabs :
- GIComplexOperandMatcher<s32, "selectVOP3PModsNegAbs">,
- GIComplexPatternEquiv<VOP3PModsNegAbs>;
-
def gi_wmmaopselvop3pmods :
GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;
@@ -452,6 +440,13 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
GISDNodeXFormEquiv<as_hw_round_mode>;
+def gi_VOP3PModsNeg : GICustomOperandRenderer<"renderVOP3PModsNeg">,
+ GISDNodeXFormEquiv<VOP3PModsNeg>;
+def gi_VOP3PModsNegs : GICustomOperandRenderer<"renderVOP3PModsNegs">,
+ GISDNodeXFormEquiv<VOP3PModsNegs>;
+def gi_VOP3PModsNegAbs : GICustomOperandRenderer<"renderVOP3PModsNegAbs">,
+ GISDNodeXFormEquiv<VOP3PModsNegAbs>;
+
def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">,
GISDNodeXFormEquiv<PrefetchLoc>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 39b4200..fb83388 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3449,63 +3449,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
return SelectVOP3PMods(In, Src, SrcMods, true);
}
-// Select neg_lo from the i1 immediate operand.
-bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
- const ConstantSDNode *C = cast<ConstantSDNode>(In);
- // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
- // 1 promotes packed values to signed, 0 treats them as unsigned.
- assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
-
- unsigned Mods = SISrcMods::OP_SEL_1;
- unsigned SrcSign = C->getZExtValue();
- if (SrcSign == 1)
- Mods ^= SISrcMods::NEG;
-
- Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
- return true;
-}
-
-// Select both neg_lo and neg_hi from the i1 immediate operand. This is
-// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies
-// to matrix's even k elements, and neg_hi applies to matrix's odd k elements.
-bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegs(SDValue In, SDValue &Src) const {
- const ConstantSDNode *C = cast<ConstantSDNode>(In);
- // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
- // 1 promotes packed values to signed, 0 treats them as unsigned.
- assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
-
- unsigned Mods = SISrcMods::OP_SEL_1;
- unsigned SrcSign = C->getZExtValue();
- if (SrcSign == 1)
- Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
-
- Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
- return true;
-}
-
-// Select neg, abs, or both neg and abs from the i16 immediate operans.
-bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const {
- const ConstantSDNode *C = cast<ConstantSDNode>(In);
- unsigned Mods = SISrcMods::OP_SEL_1;
- unsigned SrcMod = C->getZExtValue();
- switch (SrcMod) {
- default: // Any other value will be silently ignored (considered as 0).
- break;
- case 1:
- Mods ^= SISrcMods::NEG;
- break;
- case 2:
- Mods ^= SISrcMods::ABS;
- break;
- case 3:
- Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
- break;
- }
-
- Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
- return true;
-}
-
bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
SDValue &Src) const {
const ConstantSDNode *C = cast<ConstantSDNode>(In);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 983f1aa..16388e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -241,9 +241,6 @@ private:
bool IsDOT = false) const;
bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const;
- bool SelectVOP3PModsNegs(SDValue In, SDValue &Src) const;
- bool SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const;
bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;
bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 31c4f62..64e68ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -589,14 +589,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);
- // FIXME: This is only partially true. If we have to do vector compares, any
- // SGPR pair can be a condition register. If we have a uniform condition, we
- // are better off doing SALU operations, where there is only one SCC. For now,
- // we don't have a way of knowing during instruction selection if a condition
- // will be uniform and we always use vector compares. Assume we are using
- // vector compares until that is fixed.
- setHasMultipleConditionRegisters(true);
-
setMinCmpXchgSizeInBits(32);
setSupportsUnalignedAtomics(false);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 39bb0ad..fd5d5b8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -388,6 +388,16 @@ public:
MVT getFenceOperandTy(const DataLayout &DL) const override {
return MVT::i32;
}
+
+ bool hasMultipleConditionRegisters(EVT VT) const override {
+ // FIXME: This is only partially true. If we have to do vector compares, any
+ // SGPR pair can be a condition register. If we have a uniform condition, we
+ // are better off doing SALU operations, where there is only one SCC. For
+ // now, we don't have a way of knowing during instruction selection if a
+ // condition will be uniform and we always use vector compares. Assume we
+ // are using vector compares until that is fixed.
+ return true;
+ }
};
namespace AMDGPUISD {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index f2207ff..4fe5d00 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1694,7 +1694,9 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
NewII->takeName(&II);
return IC.replaceInstUsesWith(II, NewII);
}
- case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
+ case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
+ case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
Value *Src0 = II.getArgOperand(1);
Value *Src1 = II.getArgOperand(3);
unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b0d3b12..b7fd131 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4988,66 +4988,6 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
return selectVOP3PRetHelper(Root, true);
}
-// Select neg_lo from the i1 immediate operand.
-InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
- // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
- // Value is in Imm operand as i1 sign extended to int64_t.
- // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
- assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
- "expected i1 value");
- unsigned Mods = SISrcMods::OP_SEL_1;
- if (Root.getImm() == -1)
- Mods ^= SISrcMods::NEG;
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
- }};
-}
-
-// Select both neg_lo and neg_hi from the i1 immediate operand. This is
-// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies
-// to matrix's even k elements, and neg_hi applies to matrix's odd k elements.
-InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3PModsNegs(MachineOperand &Root) const {
- // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
- // Value is in Imm operand as i1 sign extended to int64_t.
- // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
- assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
- "expected i1 value");
- unsigned Mods = SISrcMods::OP_SEL_1;
- if (Root.getImm() == -1)
- Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
- }};
-}
-
-// Select neg, abs, or both neg and abs from the i16 immediate operans.
-InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3PModsNegAbs(MachineOperand &Root) const {
-
- assert(Root.isImm() && "Modifier for C must be an immediate");
-
- unsigned Mods = SISrcMods::OP_SEL_1;
- switch (Root.getImm()) {
- default: // Any other value will be silently ignored (considered as 0).
- break;
- case 1:
- Mods ^= SISrcMods::NEG;
- break;
- case 2:
- Mods ^= SISrcMods::ABS;
- break;
- case 3:
- Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
- break;
- }
-
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
- }};
-}
-
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
MachineOperand &Root) const {
@@ -7102,6 +7042,38 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
}
+void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ if (MI.getOperand(OpIdx).getImm())
+ Mods ^= SISrcMods::NEG;
+ MIB.addImm((int64_t)Mods);
+}
+
+void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ if (MI.getOperand(OpIdx).getImm())
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ MIB.addImm((int64_t)Mods);
+}
+
+void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ unsigned Val = MI.getOperand(OpIdx).getImm();
+ unsigned Mods = SISrcMods::OP_SEL_1; // default: none
+ if (Val == 1) // neg
+ Mods ^= SISrcMods::NEG;
+ if (Val == 2) // abs
+ Mods ^= SISrcMods::ABS;
+ if (Val == 3) // neg and abs
+ Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
+ MIB.addImm((int64_t)Mods);
+}
+
void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
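The three renderers above replace complex-pattern selectors with plain immediate renderers, each folding a small intrinsic immediate into SISrcMods bits. A standalone restatement of the neg/abs mapping, with illustrative bit values rather than the real SIDefines.h encoding:

#include <cstdio>

// Illustrative modifier bits; the real values live in SIDefines.h.
enum SrcMods : unsigned { NEG = 1, ABS = 2, NEG_HI = 4, OP_SEL_1 = 8 };

// 0 = none, 1 = neg, 2 = abs, 3 = neg+abs; any other value means "none".
unsigned vop3pModsNegAbs(unsigned Val) {
  unsigned Mods = OP_SEL_1; // default
  if (Val == 1) Mods ^= NEG;
  if (Val == 2) Mods ^= ABS;
  if (Val == 3) Mods ^= (NEG | ABS);
  return Mods;
}

int main() {
  for (unsigned V = 0; V < 4; ++V)
    std::printf("imm %u -> mods 0x%x\n", V, vop3pModsNegAbs(V));
}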
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 140e753..c9da419 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -200,13 +200,6 @@ private:
selectVOP3PModsDOT(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
- selectVOP3PModsNeg(MachineOperand &Root) const;
- InstructionSelector::ComplexRendererFns
- selectVOP3PModsNegs(MachineOperand &Root) const;
- InstructionSelector::ComplexRendererFns
- selectVOP3PModsNegAbs(MachineOperand &Root) const;
-
- InstructionSelector::ComplexRendererFns
selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
@@ -419,6 +412,13 @@ private:
void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+ void renderVOP3PModsNeg(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+ void renderVOP3PModsNegs(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+ void renderVOP3PModsNegAbs(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+
void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 1fdf272..a6e4a63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2271,6 +2271,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
? AMDGPU::SRC_SHARED_BASE
: AMDGPU::SRC_PRIVATE_BASE;
+ assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
+ !ST.hasGloballyAddressableScratch()) &&
+ "Cannot use src_private_base with globally addressable scratch!");
// FIXME: It would be more natural to emit a COPY here, but then copy
// coalescing would kick in and it would think it's okay to use the "HI"
// subregister (instead of extracting the HI 32 bits) which is an artificial
@@ -2396,11 +2399,30 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
+ auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ ST.hasGloballyAddressableScratch()) {
+ // flat -> private with globally addressable scratch: subtract
+ // src_flat_scratch_base_lo.
+ const LLT S32 = LLT::scalar(32);
+ Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
+ Register FlatScratchBaseLo =
+ B.buildInstr(AMDGPU::S_MOV_B32, {S32},
+ {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
+ .getReg(0);
+ MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
+ Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
+ return B.buildIntToPtr(Dst, Sub).getReg(0);
+ }
+
+ // Extract low 32-bits of the pointer.
+ return B.buildExtract(Dst, Src, 0).getReg(0);
+ };
+
// For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
// G_ADDRSPACE_CAST we need to guess.
if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
- // Extract low 32-bits of the pointer.
- B.buildExtract(Dst, Src, 0);
+ castFlatToLocalOrPrivate(Dst);
MI.eraseFromParent();
return true;
}
@@ -2411,7 +2433,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
auto FlatNull = B.buildConstant(SrcTy, 0);
// Extract low 32-bits of the pointer.
- auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
+ auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
auto CmpRes =
B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
@@ -2425,14 +2447,45 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
- Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
- if (!ApertureReg.isValid())
- return false;
-
// Coerce the type of the low half of the result so we can use
// merge_values.
Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
+ if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ ST.hasGloballyAddressableScratch()) {
+ // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
+ // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
+ Register AllOnes = B.buildConstant(S32, -1).getReg(0);
+ Register ThreadID = B.buildConstant(S32, 0).getReg(0);
+ ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
+ .addUse(AllOnes)
+ .addUse(ThreadID)
+ .getReg(0);
+ if (ST.isWave64()) {
+ ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
+ .addUse(AllOnes)
+ .addUse(ThreadID)
+ .getReg(0);
+ }
+ Register ShAmt =
+ B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
+ Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
+ Register CvtPtr =
+ B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
+ // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
+ // 64-bit hi:lo value.
+ Register FlatScratchBase =
+ B.buildInstr(AMDGPU::S_MOV_B64, {S64},
+ {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
+ .getReg(0);
+ MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
+ return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
+ }
+
+ Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
+ if (!ApertureReg.isValid())
+ return false;
+
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
@@ -5788,11 +5841,25 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
unsigned AddrSpace) const {
- Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
- auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
+ const LLT S32 = LLT::scalar(32);
+ auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
Register Hi32 = Unmerge.getReg(1);
- B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
+ ST.hasGloballyAddressableScratch()) {
+ Register FlatScratchBaseHi =
+ B.buildInstr(AMDGPU::S_MOV_B32, {S32},
+ {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
+ .getReg(0);
+ MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
+ // Test bits 63..58 against the aperture address.
+ Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
+ B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
+ B.buildConstant(S32, 1u << 26));
+ } else {
+ Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
+ B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
+ }
MI.eraseFromParent();
return true;
}
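With globally addressable scratch, the casts above turn a private pointer into a flat one by packing the thread ID into the top bits and adding the 64-bit flat scratch base; the flat -> private direction inverts this by subtracting src_flat_scratch_base_lo from the low half. A standalone model of the 64-bit address construction, with the special-register reads replaced by plain parameters:

#include <cstdint>
#include <cstdio>

// Flat address for a private (scratch) offset with globally addressable
// scratch: wave32 packs TID[4:0] at bit 52, wave64 packs TID[5:0] at bit 51,
// matching the shift amount 57 - 32 - WavefrontSizeLog2 used on the hi word.
uint64_t privateToFlat(uint32_t PrivateAddr, uint32_t Tid,
                       unsigned WavefrontSizeLog2, // 5 = wave32, 6 = wave64
                       uint64_t FlatScratchBase) {
  unsigned HiShift = 57 - 32 - WavefrontSizeLog2;       // shift within hi word
  uint64_t CvtPtr = ((uint64_t)(Tid << HiShift) << 32) | PrivateAddr;
  return CvtPtr + FlatScratchBase;
}

int main() {
  uint64_t A = privateToFlat(0x100, 7, 6, 0x7f00000000000000ull);
  std::printf("flat = 0x%016llx\n", (unsigned long long)A);
}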
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index d443f4e..2d8f259 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -236,7 +236,7 @@ cl::opt<LoweringKind> LoweringKindLoc(
"Lower via mixture of above strategies")));
template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
- llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
+ llvm::sort(V, [](const auto *L, const auto *R) {
return L->getName() < R->getName();
});
return {std::move(V)};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5aa0ebf..868b1a2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4603,6 +4603,42 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8:
case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8:
case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4:
+ case Intrinsic::amdgcn_cvt_scale_pk16_f16_fp6:
+ case Intrinsic::amdgcn_cvt_scale_pk16_bf16_fp6:
+ case Intrinsic::amdgcn_cvt_scale_pk16_f16_bf6:
+ case Intrinsic::amdgcn_cvt_scale_pk16_bf16_bf6:
+ case Intrinsic::amdgcn_cvt_scale_pk16_f32_fp6:
+ case Intrinsic::amdgcn_cvt_scale_pk16_f32_bf6:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
@@ -4762,7 +4798,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8:
case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:
case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
+ case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
+ case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
+ case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4:
+ case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4:
case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
@@ -4777,6 +4817,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8:
case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
+ case Intrinsic::amdgcn_perm_pk16_b4_u4:
+ case Intrinsic::amdgcn_perm_pk16_b6_u4:
+ case Intrinsic::amdgcn_perm_pk16_b8_u4:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index a83caa0..ff8efd2 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -178,6 +178,10 @@ public:
ImmTyBitOp3,
ImmTyMatrixAFMT,
ImmTyMatrixBFMT,
+ ImmTyMatrixAScale,
+ ImmTyMatrixBScale,
+ ImmTyMatrixAScaleFmt,
+ ImmTyMatrixBScaleFmt,
ImmTyMatrixAReuse,
ImmTyMatrixBReuse,
ImmTyScaleSel,
@@ -428,6 +432,10 @@ public:
bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); }
bool isMatrixAFMT() const { return isImmTy(ImmTyMatrixAFMT); }
bool isMatrixBFMT() const { return isImmTy(ImmTyMatrixBFMT); }
+ bool isMatrixAScale() const { return isImmTy(ImmTyMatrixAScale); }
+ bool isMatrixBScale() const { return isImmTy(ImmTyMatrixBScale); }
+ bool isMatrixAScaleFmt() const { return isImmTy(ImmTyMatrixAScaleFmt); }
+ bool isMatrixBScaleFmt() const { return isImmTy(ImmTyMatrixBScaleFmt); }
bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); }
bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
@@ -1183,6 +1191,10 @@ public:
case ImmTyBitOp3: OS << "BitOp3"; break;
case ImmTyMatrixAFMT: OS << "ImmTyMatrixAFMT"; break;
case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break;
+ case ImmTyMatrixAScale: OS << "ImmTyMatrixAScale"; break;
+ case ImmTyMatrixBScale: OS << "ImmTyMatrixBScale"; break;
+ case ImmTyMatrixAScaleFmt: OS << "ImmTyMatrixAScaleFmt"; break;
+ case ImmTyMatrixBScaleFmt: OS << "ImmTyMatrixBScaleFmt"; break;
case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break;
case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break;
    case ImmTyScaleSel: OS << "ScaleSel"; break;
@@ -1608,6 +1620,10 @@ public:
return getFeatureBits()[AMDGPU::FeaturePartialNSAEncoding];
}
+ bool hasGloballyAddressableScratch() const {
+ return getFeatureBits()[AMDGPU::FeatureGloballyAddressableScratch];
+ }
+
unsigned getNSAMaxSize(bool HasSampler = false) const {
return AMDGPU::getNSAMaxSize(getSTI(), HasSampler);
}
@@ -1728,6 +1744,14 @@ public:
AMDGPUOperand::ImmTy Type);
ParseStatus parseMatrixAFMT(OperandVector &Operands);
ParseStatus parseMatrixBFMT(OperandVector &Operands);
+ ParseStatus tryParseMatrixScale(OperandVector &Operands, StringRef Name,
+ AMDGPUOperand::ImmTy Type);
+ ParseStatus parseMatrixAScale(OperandVector &Operands);
+ ParseStatus parseMatrixBScale(OperandVector &Operands);
+ ParseStatus tryParseMatrixScaleFmt(OperandVector &Operands, StringRef Name,
+ AMDGPUOperand::ImmTy Type);
+ ParseStatus parseMatrixAScaleFmt(OperandVector &Operands);
+ ParseStatus parseMatrixBScaleFmt(OperandVector &Operands);
ParseStatus parseDfmtNfmt(int64_t &Format);
ParseStatus parseUfmt(int64_t &Format);
@@ -2739,46 +2763,48 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
static MCRegister getSpecialRegForName(StringRef RegName) {
return StringSwitch<unsigned>(RegName)
- .Case("exec", AMDGPU::EXEC)
- .Case("vcc", AMDGPU::VCC)
- .Case("flat_scratch", AMDGPU::FLAT_SCR)
- .Case("xnack_mask", AMDGPU::XNACK_MASK)
- .Case("shared_base", AMDGPU::SRC_SHARED_BASE)
- .Case("src_shared_base", AMDGPU::SRC_SHARED_BASE)
- .Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT)
- .Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT)
- .Case("private_base", AMDGPU::SRC_PRIVATE_BASE)
- .Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE)
- .Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
- .Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
- .Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
- .Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
- .Case("lds_direct", AMDGPU::LDS_DIRECT)
- .Case("src_lds_direct", AMDGPU::LDS_DIRECT)
- .Case("m0", AMDGPU::M0)
- .Case("vccz", AMDGPU::SRC_VCCZ)
- .Case("src_vccz", AMDGPU::SRC_VCCZ)
- .Case("execz", AMDGPU::SRC_EXECZ)
- .Case("src_execz", AMDGPU::SRC_EXECZ)
- .Case("scc", AMDGPU::SRC_SCC)
- .Case("src_scc", AMDGPU::SRC_SCC)
- .Case("tba", AMDGPU::TBA)
- .Case("tma", AMDGPU::TMA)
- .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
- .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
- .Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO)
- .Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI)
- .Case("vcc_lo", AMDGPU::VCC_LO)
- .Case("vcc_hi", AMDGPU::VCC_HI)
- .Case("exec_lo", AMDGPU::EXEC_LO)
- .Case("exec_hi", AMDGPU::EXEC_HI)
- .Case("tma_lo", AMDGPU::TMA_LO)
- .Case("tma_hi", AMDGPU::TMA_HI)
- .Case("tba_lo", AMDGPU::TBA_LO)
- .Case("tba_hi", AMDGPU::TBA_HI)
- .Case("pc", AMDGPU::PC_REG)
- .Case("null", AMDGPU::SGPR_NULL)
- .Default(AMDGPU::NoRegister);
+ .Case("exec", AMDGPU::EXEC)
+ .Case("vcc", AMDGPU::VCC)
+ .Case("flat_scratch", AMDGPU::FLAT_SCR)
+ .Case("xnack_mask", AMDGPU::XNACK_MASK)
+ .Case("shared_base", AMDGPU::SRC_SHARED_BASE)
+ .Case("src_shared_base", AMDGPU::SRC_SHARED_BASE)
+ .Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT)
+ .Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT)
+ .Case("private_base", AMDGPU::SRC_PRIVATE_BASE)
+ .Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE)
+ .Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
+ .Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
+ .Case("src_flat_scratch_base_lo", AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)
+ .Case("src_flat_scratch_base_hi", AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)
+ .Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
+ .Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
+ .Case("lds_direct", AMDGPU::LDS_DIRECT)
+ .Case("src_lds_direct", AMDGPU::LDS_DIRECT)
+ .Case("m0", AMDGPU::M0)
+ .Case("vccz", AMDGPU::SRC_VCCZ)
+ .Case("src_vccz", AMDGPU::SRC_VCCZ)
+ .Case("execz", AMDGPU::SRC_EXECZ)
+ .Case("src_execz", AMDGPU::SRC_EXECZ)
+ .Case("scc", AMDGPU::SRC_SCC)
+ .Case("src_scc", AMDGPU::SRC_SCC)
+ .Case("tba", AMDGPU::TBA)
+ .Case("tma", AMDGPU::TMA)
+ .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
+ .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
+ .Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO)
+ .Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI)
+ .Case("vcc_lo", AMDGPU::VCC_LO)
+ .Case("vcc_hi", AMDGPU::VCC_HI)
+ .Case("exec_lo", AMDGPU::EXEC_LO)
+ .Case("exec_hi", AMDGPU::EXEC_HI)
+ .Case("tma_lo", AMDGPU::TMA_LO)
+ .Case("tma_hi", AMDGPU::TMA_HI)
+ .Case("tba_lo", AMDGPU::TBA_LO)
+ .Case("tba_hi", AMDGPU::TBA_HI)
+ .Case("pc", AMDGPU::PC_REG)
+ .Case("null", AMDGPU::SGPR_NULL)
+ .Default(AMDGPU::NoRegister);
}
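
The getSpecialRegForName change is mostly reindentation; the functional addition is the two src_flat_scratch_base_{lo,hi} entries. For reference, the StringSwitch idiom it relies on, reduced to a toy mapping (real llvm::StringSwitch API, hypothetical values):

#include "llvm/ADT/StringSwitch.h"

// First matching Case wins; Default supplies the fallback value.
unsigned classifyHalf(llvm::StringRef Name) {
  return llvm::StringSwitch<unsigned>(Name)
      .Case("src_flat_scratch_base_lo", 0)
      .Case("src_flat_scratch_base_hi", 1)
      .Default(~0u);
}
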
bool AMDGPUAsmParser::ParseRegister(MCRegister &RegNo, SMLoc &StartLoc,
@@ -6724,6 +6750,9 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
case SRC_PRIVATE_LIMIT_LO:
case SRC_PRIVATE_LIMIT:
return isGFX9Plus();
+ case SRC_FLAT_SCRATCH_BASE_LO:
+ case SRC_FLAT_SCRATCH_BASE_HI:
+ return hasGloballyAddressableScratch();
case SRC_POPS_EXITING_WAVE_ID:
return isGFX9Plus() && !isGFX11Plus();
case TBA:
@@ -7356,6 +7385,42 @@ ParseStatus AMDGPUAsmParser::parseMatrixBFMT(OperandVector &Operands) {
AMDGPUOperand::ImmTyMatrixBFMT);
}
+ParseStatus AMDGPUAsmParser::tryParseMatrixScale(OperandVector &Operands,
+ StringRef Name,
+ AMDGPUOperand::ImmTy Type) {
+ return parseStringOrIntWithPrefix(
+ Operands, Name, {"MATRIX_SCALE_ROW0", "MATRIX_SCALE_ROW1"}, Type);
+}
+
+ParseStatus AMDGPUAsmParser::parseMatrixAScale(OperandVector &Operands) {
+ return tryParseMatrixScale(Operands, "matrix_a_scale",
+ AMDGPUOperand::ImmTyMatrixAScale);
+}
+
+ParseStatus AMDGPUAsmParser::parseMatrixBScale(OperandVector &Operands) {
+ return tryParseMatrixScale(Operands, "matrix_b_scale",
+ AMDGPUOperand::ImmTyMatrixBScale);
+}
+
+ParseStatus AMDGPUAsmParser::tryParseMatrixScaleFmt(OperandVector &Operands,
+ StringRef Name,
+ AMDGPUOperand::ImmTy Type) {
+ return parseStringOrIntWithPrefix(
+ Operands, Name,
+ {"MATRIX_SCALE_FMT_E8", "MATRIX_SCALE_FMT_E5M3", "MATRIX_SCALE_FMT_E4M3"},
+ Type);
+}
+
+ParseStatus AMDGPUAsmParser::parseMatrixAScaleFmt(OperandVector &Operands) {
+ return tryParseMatrixScaleFmt(Operands, "matrix_a_scale_fmt",
+ AMDGPUOperand::ImmTyMatrixAScaleFmt);
+}
+
+ParseStatus AMDGPUAsmParser::parseMatrixBScaleFmt(OperandVector &Operands) {
+ return tryParseMatrixScaleFmt(Operands, "matrix_b_scale_fmt",
+ AMDGPUOperand::ImmTyMatrixBScaleFmt);
+}
+
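
Each of the new parsers accepts the named prefix followed by either a listed symbolic value or a bare integer, e.g. matrix_a_scale:MATRIX_SCALE_ROW1 or matrix_a_scale:1. A sketch of what parseStringOrIntWithPrefix appears to accept for the value part, inferred from its use here rather than from the helper itself:

#include <cstdint>
#include <cstdlib>
#include <optional>
#include <string>
#include <vector>

// Symbolic names map to their index in the list; otherwise the token must
// be a plain integer. Anything else is a parse failure.
std::optional<int64_t> parseScaleValue(const std::string &Tok,
                                       const std::vector<std::string> &Values) {
  for (size_t I = 0; I != Values.size(); ++I)
    if (Tok == Values[I])
      return static_cast<int64_t>(I);
  char *End = nullptr;
  long long V = std::strtoll(Tok.c_str(), &End, 0);
  if (End != Tok.c_str() && End && *End == '\0')
    return static_cast<int64_t>(V);
  return std::nullopt;
}
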
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
// values to live in a joint format operand in the MCInst encoding.
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
@@ -9489,6 +9554,34 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
AMDGPUOperand::ImmTyMatrixBFMT, 0);
}
+ int MatrixAScaleIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_scale);
+ if (MatrixAScaleIdx != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyMatrixAScale, 0);
+ }
+
+ int MatrixBScaleIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_scale);
+ if (MatrixBScaleIdx != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyMatrixBScale, 0);
+ }
+
+ int MatrixAScaleFmtIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_scale_fmt);
+ if (MatrixAScaleFmtIdx != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyMatrixAScaleFmt, 0);
+ }
+
+ int MatrixBScaleFmtIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_scale_fmt);
+ if (MatrixBScaleFmtIdx != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyMatrixBScaleFmt, 0);
+ }
+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse))
addOptionalImmOperand(Inst, Operands, OptIdx,
AMDGPUOperand::ImmTyMatrixAReuse, 0);
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ffe6b06..fb7d634 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -598,6 +598,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
// encodings
+ if (isGFX1250() && Bytes.size() >= 16) {
+ DecoderUInt128 DecW = eat16Bytes(Bytes);
+ if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS))
+ break;
+ Bytes = Bytes_.slice(0, MaxInstBytesNum);
+ }
+
if (isGFX11Plus() && Bytes.size() >= 12) {
DecoderUInt128 DecW = eat12Bytes(Bytes);
@@ -1907,6 +1914,8 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
return isGFX11Plus() ? createRegOperand(M0) : createRegOperand(SGPR_NULL);
case 126: return createRegOperand(EXEC_LO);
case 127: return createRegOperand(EXEC_HI);
+ case 230: return createRegOperand(SRC_FLAT_SCRATCH_BASE_LO);
+ case 231: return createRegOperand(SRC_FLAT_SCRATCH_BASE_HI);
case 235: return createRegOperand(SRC_SHARED_BASE_LO);
case 236: return createRegOperand(SRC_SHARED_LIMIT_LO);
case 237: return createRegOperand(SRC_PRIVATE_BASE_LO);
@@ -1940,6 +1949,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
return createRegOperand(SGPR_NULL);
break;
case 126: return createRegOperand(EXEC);
+ case 230: return createRegOperand(SRC_FLAT_SCRATCH_BASE_LO);
case 235: return createRegOperand(SRC_SHARED_BASE);
case 236: return createRegOperand(SRC_SHARED_LIMIT);
case 237: return createRegOperand(SRC_PRIVATE_BASE);
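
The decoder addition tries a 128-bit decode first on GFX1250: sixteen bytes are consumed into a DecoderUInt128 and matched against the 128-bit table, with Bytes restored on failure so the existing 96-bit GFX11+ path can retry. A sketch of the word assembly, with a pair of u64s standing in for DecoderUInt128 and byte order assumed little-endian, as elsewhere in the target:

#include <cstddef>
#include <cstdint>

struct U128 { uint64_t Lo = 0, Hi = 0; };  // stand-in for DecoderUInt128

// Consume 16 bytes, least significant first, in the spirit of eat16Bytes.
U128 eat16Bytes(const uint8_t *&Bytes) {
  U128 W;
  for (size_t I = 0; I != 8; ++I)
    W.Lo |= static_cast<uint64_t>(Bytes[I]) << (8 * I);
  for (size_t I = 0; I != 8; ++I)
    W.Hi |= static_cast<uint64_t>(Bytes[8 + I]) << (8 * I);
  Bytes += 16;
  return W;
}
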
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 6fe3abc..5530886 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -236,6 +236,7 @@ protected:
bool Has64BitLiterals = false;
bool HasBitOp3Insts = false;
bool HasTanhInsts = false;
+ bool HasTensorCvtLutInsts = false;
bool HasTransposeLoadF4F6Insts = false;
bool HasPrngInst = false;
bool HasBVHDualAndBVH8Insts = false;
@@ -280,6 +281,7 @@ protected:
bool RequiresCOV6 = false;
bool UseBlockVGPROpsForCSR = false;
+ bool HasGloballyAddressableScratch = false;
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;
@@ -1324,6 +1326,10 @@ public:
bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
+ bool hasGloballyAddressableScratch() const {
+ return HasGloballyAddressableScratch;
+ }
+
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
@@ -1411,6 +1417,8 @@ public:
bool hasTanhInsts() const { return HasTanhInsts; }
+ bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; }
+
bool hasAddPC64Inst() const { return GFX1250Insts; }
bool hasMinimum3Maximum3PKF16() const {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 86d56855..4e4660c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -33,8 +33,7 @@ public:
AMDGPUAsmBackend(const Target &T) : MCAsmBackend(llvm::endianness::little) {}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool fixupNeedsRelaxation(const MCFixup &Fixup,
uint64_t Value) const override;
@@ -129,9 +128,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
void AMDGPUAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (Target.getSpecifier())
IsResolved = false;
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
@@ -148,13 +146,13 @@ void AMDGPUAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
Value <<= Info.TargetOffset;
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- uint32_t Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the bits from
// the fixup value.
for (unsigned i = 0; i != NumBytes; ++i)
- Data[Offset + i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
+ Data[i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
}
std::optional<MCFixupKind>
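
The applyFixup signature change hands Data already offset to the fixup position, which is why the OR loop now indexes from zero and the fragment offset survives only in the assert. The masking loop on its own, assuming Value has already been adjusted and shifted by TargetOffset:

#include <cstdint>

// OR the NumBytes low-order bytes of Value into the buffer, little-endian,
// mirroring the loop in AMDGPUAsmBackend::applyFixup after the change.
void applyFixupBytes(uint8_t *Data, uint64_t Value, unsigned NumBytes) {
  for (unsigned I = 0; I != NumBytes; ++I)
    Data[I] |= static_cast<uint8_t>((Value >> (I * 8)) & 0xff);
}
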
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 42c4d8b..ee8683a 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1393,6 +1393,75 @@ void AMDGPUInstPrinter::printMatrixBFMT(const MCInst *MI, unsigned OpNo,
printMatrixFMT(MI, OpNo, STI, O, 'b');
}
+void AMDGPUInstPrinter::printMatrixScale(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O, char AorB) {
+ auto Imm = MI->getOperand(OpNo).getImm() & 1;
+ if (Imm == 0)
+ return;
+
+ O << " matrix_" << AorB << "_scale:";
+ switch (Imm) {
+ default:
+ O << Imm;
+ break;
+ case WMMA::MatrixScale::MATRIX_SCALE_ROW0:
+ O << "MATRIX_SCALE_ROW0";
+ break;
+ case WMMA::MatrixScale::MATRIX_SCALE_ROW1:
+ O << "MATRIX_SCALE_ROW1";
+ break;
+ }
+}
+
+void AMDGPUInstPrinter::printMatrixAScale(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printMatrixScale(MI, OpNo, STI, O, 'a');
+}
+
+void AMDGPUInstPrinter::printMatrixBScale(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printMatrixScale(MI, OpNo, STI, O, 'b');
+}
+
+void AMDGPUInstPrinter::printMatrixScaleFmt(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O, char AorB) {
+ auto Imm = MI->getOperand(OpNo).getImm() & 3;
+ if (Imm == 0)
+ return;
+
+ O << " matrix_" << AorB << "_scale_fmt:";
+ switch (Imm) {
+ default:
+ O << Imm;
+ break;
+ case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E8:
+ O << "MATRIX_SCALE_FMT_E8";
+ break;
+ case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E5M3:
+ O << "MATRIX_SCALE_FMT_E5M3";
+ break;
+ case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E4M3:
+ O << "MATRIX_SCALE_FMT_E4M3";
+ break;
+ }
+}
+
+void AMDGPUInstPrinter::printMatrixAScaleFmt(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printMatrixScaleFmt(MI, OpNo, STI, O, 'a');
+}
+
+void AMDGPUInstPrinter::printMatrixBScaleFmt(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printMatrixScaleFmt(MI, OpNo, STI, O, 'b');
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index f6739b14..be32061c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -140,6 +140,19 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printMatrixBFMT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMatrixScale(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O, char AorB);
+ void printMatrixAScale(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMatrixBScale(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMatrixScaleFmt(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O,
+ char AorB);
+ void printMatrixAScaleFmt(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMatrixBScaleFmt(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpAttr(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index ffdac8b..fa0c95f 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -75,8 +75,9 @@ unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const {
if (STI->hasFeature(AMDGPU::FeatureNSAEncoding))
return 20;
- // VOP3PX encoding.
- if (STI->hasFeature(AMDGPU::FeatureGFX950Insts))
+ // VOP3PX/VOP3PX2 encoding.
+ if (STI->hasFeature(AMDGPU::FeatureGFX950Insts) ||
+ STI->hasFeature(AMDGPU::FeatureGFX1250Insts))
return 16;
// 64-bit instruction with 32-bit literal.
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 43ca548..68302f0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -872,14 +872,14 @@ void AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(AMDGPUMCKernelCodeT &Header) {
void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
unsigned Type) {
- MCSymbolELF *Symbol = cast<MCSymbolELF>(
+ auto *Symbol = static_cast<MCSymbolELF *>(
getStreamer().getContext().getOrCreateSymbol(SymbolName));
Symbol->setType(Type);
}
void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
Align Alignment) {
- MCSymbolELF *SymbolELF = cast<MCSymbolELF>(Symbol);
+ auto *SymbolELF = static_cast<MCSymbolELF *>(Symbol);
SymbolELF->setType(ELF::STT_OBJECT);
if (!SymbolELF->isBindingSet())
@@ -974,9 +974,9 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
auto &Streamer = getStreamer();
auto &Context = Streamer.getContext();
- MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>(
- Context.getOrCreateSymbol(Twine(KernelName)));
- MCSymbolELF *KernelDescriptorSymbol = cast<MCSymbolELF>(
+ auto *KernelCodeSymbol =
+ static_cast<MCSymbolELF *>(Context.getOrCreateSymbol(Twine(KernelName)));
+ auto *KernelDescriptorSymbol = static_cast<MCSymbolELF *>(
Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd")));
// Copy kernel descriptor symbol's binding, other and visibility from the
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index c564145..deadb7a 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1018,6 +1018,17 @@ enum MatrixFMT : unsigned {
MATRIX_FMT_BF6 = 3,
MATRIX_FMT_FP4 = 4
};
+
+enum MatrixScale : unsigned {
+ MATRIX_SCALE_ROW0 = 0,
+ MATRIX_SCALE_ROW1 = 1,
+};
+
+enum MatrixScaleFmt : unsigned {
+ MATRIX_SCALE_FMT_E8 = 0,
+ MATRIX_SCALE_FMT_E5M3 = 1,
+ MATRIX_SCALE_FMT_E4M3 = 2
+};
} // namespace WMMA
namespace VOP3PEncoding {
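
These enumerators are shared between the assembler and the printer; the zero values (MATRIX_SCALE_ROW0, MATRIX_SCALE_FMT_E8) are the defaults, which is why the printer hunks above return early and print nothing for them. A standalone rendering of the reverse mapping in the spirit of printMatrixScaleFmt (not the LLVM code itself):

#include <cstdint>
#include <string>

enum MatrixScaleFmt : unsigned {
  MATRIX_SCALE_FMT_E8 = 0,
  MATRIX_SCALE_FMT_E5M3 = 1,
  MATRIX_SCALE_FMT_E4M3 = 2
};

// Empty string for the default (0): the operand is simply not printed.
// Unknown encodings fall back to the raw number, as in the printer.
std::string scaleFmtToString(uint64_t Imm) {
  switch (Imm & 3) {
  case MATRIX_SCALE_FMT_E8:   return "";
  case MATRIX_SCALE_FMT_E5M3: return "MATRIX_SCALE_FMT_E5M3";
  case MATRIX_SCALE_FMT_E4M3: return "MATRIX_SCALE_FMT_E4M3";
  default:                    return std::to_string(Imm & 3);
  }
}
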
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index e934152..0c653b1 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1169,11 +1169,18 @@ void SIFoldOperandsImpl::foldOperand(
// Grab the use operands first
SmallVector<MachineOperand *, 4> UsesToProcess(
llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
- for (auto *RSUse : UsesToProcess) {
+ for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
+ MachineOperand *RSUse = UsesToProcess[I];
MachineInstr *RSUseMI = RSUse->getParent();
unsigned OpNo = RSUseMI->getOperandNo(RSUse);
if (SplatRC) {
+ if (RSUseMI->isCopy()) {
+ Register DstReg = RSUseMI->getOperand(0).getReg();
+ append_range(UsesToProcess,
+ make_pointer_range(MRI->use_nodbg_operands(DstReg)));
+ continue;
+ }
if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
FoldableDef SplatDef(SplatVal, SplatRC);
appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
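
The fold loop switches from range-for to index-based iteration because the body now appends to UsesToProcess (uses reached through copies join the same worklist), and a range-for would be invalidated when the vector reallocates. The general pattern, sketched standalone:

#include <vector>

// Worklist that grows during traversal: index-based access stays valid even
// when push_back reallocates, unlike iterators or range-for.
template <typename T, typename Visit>
void drainWorklist(std::vector<T> &Work, Visit V) {
  for (size_t I = 0; I != Work.size(); ++I) {
    T Item = Work[I];  // copy first: the buffer may move on push_back
    V(Item, Work);     // the visitor may append new items to Work
  }
}
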
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4d67e4a..63826b7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2098,10 +2098,17 @@ bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
- // Flat -> private/local is a simple truncate.
- // Flat -> global is no-op
- if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ // Flat -> private requires subtracting src_flat_scratch_base_lo.
+ return false;
+ }
+
+ // Flat -> private/local is a simple truncate.
+    // Flat -> global is a no-op.
return true;
+ }
const GCNTargetMachine &TM =
static_cast<const GCNTargetMachine &>(getTargetMachine());
@@ -7650,6 +7657,9 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
? AMDGPU::SRC_SHARED_BASE
: AMDGPU::SRC_PRIVATE_BASE;
+ assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
+ !Subtarget->hasGloballyAddressableScratch()) &&
+ "Cannot use src_private_base with globally addressable scratch!");
// Note: this feature (register) is broken. When used as a 32-bit operand,
// it returns a wrong value (all zeroes?). The real value is in the upper 32
// bits.
@@ -7760,6 +7770,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ // flat -> private with globally addressable scratch: subtract
+ // src_flat_scratch_base_lo.
+ SDValue FlatScratchBaseLo(
+ DAG.getMachineNode(
+ AMDGPU::S_MOV_B32, SL, MVT::i32,
+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
+ 0);
+ Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
+ }
+
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return Ptr;
@@ -7776,11 +7798,40 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
-
- SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
- SDValue CvtPtr =
- DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
- CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+ SDValue CvtPtr;
+ if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
+ // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
+ SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
+ SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
+ ThreadID = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+ DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
+ AllOnes, ThreadID);
+ if (Subtarget->isWave64())
+ ThreadID = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+ DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
+ AllOnes, ThreadID);
+ SDValue ShAmt = DAG.getShiftAmountConstant(
+ 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
+ SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
+ CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
+ CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+ // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
+ // 64-bit hi:lo value.
+ SDValue FlatScratchBase = {
+ DAG.getMachineNode(
+ AMDGPU::S_MOV_B64, SL, MVT::i64,
+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
+ 0};
+ CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
+ } else {
+ SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
+ CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+ CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+ }
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return CvtPtr;
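
The private-to-flat path builds the 64-bit address from the lane id, the 64-bit flat scratch base, and the 32-bit private offset, per the comment in the hunk. The same computation as plain arithmetic (shift amounts and field widths taken from that comment; otherwise unverified against hardware docs):

#include <cstdint>

// Wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
// Wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
uint64_t privateToFlat(uint32_t PrivateAddr, uint32_t ThreadId,
                       uint64_t FlatScratchBase, bool IsWave64) {
  unsigned Shift = IsWave64 ? 51 : 52;  // 57 - log2(wavefront size)
  uint64_t TidField =
      static_cast<uint64_t>(ThreadId & (IsWave64 ? 63u : 31u)) << Shift;
  return TidField + FlatScratchBase + PrivateAddr;
}
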
@@ -9424,15 +9475,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
SDLoc SL(Op);
- unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
- ? AMDGPUAS::LOCAL_ADDRESS
- : AMDGPUAS::PRIVATE_ADDRESS;
- SDValue Aperture = getSegmentAperture(AS, SL, DAG);
SDValue SrcVec =
DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
-
SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
DAG.getConstant(1, SL, MVT::i32));
+
+ unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
+ ? AMDGPUAS::LOCAL_ADDRESS
+ : AMDGPUAS::PRIVATE_ADDRESS;
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+ Subtarget->hasGloballyAddressableScratch()) {
+ SDValue FlatScratchBaseHi(
+ DAG.getMachineNode(
+ AMDGPU::S_MOV_B32, DL, MVT::i32,
+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
+ 0);
+ // Test bits 63..58 against the aperture address.
+ return DAG.getSetCC(
+ SL, MVT::i1,
+ DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
+ DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
+ }
+
+ SDValue Aperture = getSegmentAperture(AS, SL, DAG);
return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
}
case Intrinsic::amdgcn_perm:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index a3e20ba..c552f1a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -908,6 +908,32 @@ def SupportedRoundMode : TImmLeaf<i32, [{
Imm == (int)RoundingMode::TowardNegative;
}]>;
+def VOP3PModsNeg : SDNodeXForm<timm, [{
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ if (N->getZExtValue())
+ Mods ^= SISrcMods::NEG;
+ return CurDAG->getTargetConstant(Mods, SDLoc(N), MVT::i32);
+}]>;
+
+def VOP3PModsNegs : SDNodeXForm<timm, [{
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ if (N->getZExtValue())
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ return CurDAG->getTargetConstant(Mods, SDLoc(N), MVT::i32);
+}]>;
+
+def VOP3PModsNegAbs : SDNodeXForm<timm, [{
+ unsigned Val = N->getZExtValue();
+ unsigned Mods = SISrcMods::OP_SEL_1; // default: none
+ if (Val == 1) // neg
+ Mods ^= SISrcMods::NEG;
+ if (Val == 2) // abs
+ Mods ^= SISrcMods::ABS;
+ if (Val == 3) // neg and abs
+ Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
+ return CurDAG->getTargetConstant(Mods, SDLoc(N), MVT::i32);
+}]>;
+
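
These SDNodeXForms replace the ComplexPatterns removed later in this file: the modifier immediate is folded into SISrcMods bits by the pattern itself rather than by a C++ select routine. The NegAbs mapping in plain C++, with the SISrcMods bit values written from memory of SIDefines.h and therefore to be treated as assumptions:

#include <cstdint>

// Assumed SISrcMods bit assignments (see SIDefines.h).
enum SISrcMods : unsigned {
  NEG = 1u << 0,
  ABS = 1u << 1,
  NEG_HI = ABS,  // neg_hi shares the abs bit position on VOP3P
  OP_SEL_1 = 1u << 3,
};

// VOP3PModsNegAbs: 0 = none, 1 = neg, 2 = abs, 3 = neg|abs.
unsigned vop3pModsNegAbs(unsigned Val) {
  unsigned Mods = OP_SEL_1;  // default modifier word
  if (Val == 1)
    Mods ^= NEG;
  if (Val == 2)
    Mods ^= ABS;
  if (Val == 3)
    Mods ^= (NEG | ABS);
  return Mods;
}
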
class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{
uint64_t Imm = N->getZExtValue();
unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1;
@@ -1310,6 +1336,12 @@ def bitop3_0 : DefaultOperand<BitOp3, 0>;
def MatrixAFMT : CustomOperand<i32, 1, "MatrixAFMT">;
def MatrixBFMT : CustomOperand<i32, 1, "MatrixBFMT">;
+def MatrixAScale : CustomOperand<i32, 1, "MatrixAScale">;
+def MatrixBScale : CustomOperand<i32, 1, "MatrixBScale">;
+
+def MatrixAScaleFmt : CustomOperand<i32, 1, "MatrixAScaleFmt">;
+def MatrixBScaleFmt : CustomOperand<i32, 1, "MatrixBScaleFmt">;
+
def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">;
def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">;
@@ -1647,9 +1679,6 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
-def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">;
-def VOP3PModsNegs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegs">; // chfang: not use complex pattern?
-def VOP3PModsNegAbs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegAbs">;
def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;
def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">;
@@ -1774,6 +1803,7 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> {
!eq(VT.Size, 256) : VOPDstOperand<VReg_256>,
!eq(VT.Size, 192) : VOPDstOperand<VReg_192>,
!eq(VT.Size, 128) : VOPDstOperand<VReg_128>,
+ !eq(VT.Size, 96) : VOPDstOperand<VReg_96>,
!eq(VT.Size, 64) : VOPDstOperand<VReg_64>,
!eq(VT.Size, 32) : VOPDstOperand<VGPR_32>,
!eq(VT.Size, 16) : op16,
@@ -1924,6 +1954,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
!eq(VT, v2f16) : VCSrc_v2f16,
!eq(VT, v2bf16) : VCSrc_v2bf16,
!eq(VT, f32) : VCSrc_f32,
+ !eq(VT, v2i32) : VCSrc_v2b32,
1 : VCSrc_b32);
}
@@ -2678,6 +2709,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit HasNeg = HasModifiers;
field bit HasMatrixReuse = 0;
field bit HasMatrixFMT = 0;
+ field bit HasMatrixScale = 0;
field bit HasSrc0Mods = HasModifiers;
field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0);
@@ -2935,6 +2968,9 @@ def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>;
def VOP_V2F16_F32_F32_I32 : VOPProfile <[v2f16, f32, f32, i32]>;
def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
+def VOP_V3I32_V16F16_F32 : VOPProfile<[v3i32, v16f16, f32, untyped]>;
+def VOP_V3I32_V16BF16_F32 : VOPProfile<[v3i32, v16bf16, f32, untyped]>;
+def VOP_V3I32_V16F32_F32 : VOPProfile<[v3i32, v16f32, f32, untyped]>;
def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>;
def VOP_V2F16_I32_F32 : VOPProfile<[v2f16, i32, f32, untyped]>;
def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;
@@ -2948,6 +2984,8 @@ def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>;
def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>;
def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>;
+def VOP_V16F16_V3I32_I32 : VOPProfile<[v16f16, v3i32, i32, untyped]>;
+def VOP_V16BF16_V3I32_I32 : VOPProfile<[v16bf16, v3i32, i32, untyped]>;
def VOP_V8F16_V2I32_I32 : VOPProfile<[v8f16, v2i32, i32, untyped]>;
def VOP_V8BF16_V2I32_I32 : VOPProfile<[v8bf16, v2i32, i32, untyped]>;
def VOP_V8F16_I32_I32 : VOPProfile<[v8f16, i32, i32, untyped]>;
@@ -2955,11 +2993,26 @@ def VOP_V8BF16_I32_I32 : VOPProfile<[v8bf16, i32, i32, untyped]>;
def VOP_V16F32_V3I32_I32 : VOPProfile<[v16f32, v3i32, i32, untyped]>;
def VOP_V8F32_V2I32_I32 : VOPProfile<[v8f32, v2i32, i32, untyped]>;
def VOP_V8F32_I32_I32 : VOPProfile<[v8f32, i32, i32, untyped]>;
+def VOP_V2I32_V8BF16_F32 : VOPProfile<[v2i32, v8bf16, f32, untyped]>;
+def VOP_V2I32_V8F16_F32 : VOPProfile<[v2i32, v8f16, f32, untyped]>;
+def VOP_V2I32_V8F32_F32 : VOPProfile<[v2i32, v8f32, f32, untyped]>;
+def VOP_I32_V8F32_F32 : VOPProfile<[i32, v8f32, f32, untyped]>;
+def VOP_I32_V8F16_F32 : VOPProfile<[i32, v8f16, f32, untyped]>;
+def VOP_I32_V8BF16_F32 : VOPProfile<[i32, v8bf16, f32, untyped]>;
def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;
def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>;
def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>;
+def VOP_V3I32_V16F16_I32_F32 : VOPProfile<[v3i32, v16f16, i32, f32]>;
+def VOP_V3I32_V16BF16_I32_F32 : VOPProfile<[v3i32, v16bf16, i32, f32]>;
+def VOP_V3I32_V16F32_I32_F32 : VOPProfile<[v3i32, v16f32, i32, f32]>;
+def VOP_V2I32_V8BF16_I32_F32 : VOPProfile<[v2i32, v8bf16, i32, f32]>;
+def VOP_V2I32_V8F16_I32_F32 : VOPProfile<[v2i32, v8f16, i32, f32]>;
+def VOP_V2I32_V8F32_I32_F32 : VOPProfile<[v2i32, v8f32, i32, f32]>;
+def VOP_I32_V8F32_I32_F32 : VOPProfile<[i32, v8f32, i32, f32]>;
+def VOP_I32_V8F16_I32_F32 : VOPProfile<[i32, v8f16, i32, f32]>;
+def VOP_I32_V8BF16_I32_F32 : VOPProfile<[i32, v8bf16, i32, f32]>;
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 54fa192..bd5dfa9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3543,14 +3543,21 @@ def : GCNPat <
(vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i32 16))
>;
-}
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
(vecTy (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1))
>;
+} // End True16Predicate = ...
} // End foreach Ty = ...
-}
+} // End AddedComplexity = 1
+
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat <
+ (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))),
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16,
+ (i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)), hi16)
+>;
let SubtargetPredicate = HasVOP3PInsts in {
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
@@ -3599,7 +3606,11 @@ def : GCNPat <
>;
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$src0), (Ty undef))),
- (REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (Ty (IMPLICIT_DEF)), hi16)
+>;
+def : GCNPat <
+ (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_16:$src1))),
+ (REG_SEQUENCE VGPR_32, (Ty (IMPLICIT_DEF)), lo16, (Ty VGPR_16:$src1), hi16)
>;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index f3acc5c..ae0f304 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -598,6 +598,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);
// Reserve async counters pseudo registers
reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 36d1a3b..81655f5 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -246,6 +246,22 @@ defm SRC_SHARED_LIMIT : ApertureRegister<"src_shared_limit", 236>;
defm SRC_PRIVATE_BASE : ApertureRegister<"src_private_base", 237>;
defm SRC_PRIVATE_LIMIT : ApertureRegister<"src_private_limit", 238>;
+let isConstant = true in {
+ defm SRC_FLAT_SCRATCH_BASE_LO : SIRegLoHi16<"src_flat_scratch_base_lo", 230>;
+ defm SRC_FLAT_SCRATCH_BASE_HI : SIRegLoHi16<"src_flat_scratch_base_hi", 231>;
+
+ // Using src_flat_scratch_base_lo in a 64-bit context gets the full 64-bit
+ // hi:lo value.
+ def SRC_FLAT_SCRATCH_BASE :
+ RegisterWithSubRegs<"src_flat_scratch_base_lo",
+ [SRC_FLAT_SCRATCH_BASE_LO,
+ SRC_FLAT_SCRATCH_BASE_HI]> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = SRC_FLAT_SCRATCH_BASE_LO.HWEncoding;
+ }
+}
+
defm SRC_POPS_EXITING_WAVE_ID : SIRegLoHi16<"src_pops_exiting_wave_id", 239>;
// Not addressable
@@ -765,7 +781,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i
SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO,
SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI,
SRC_SHARED_LIMIT_HI, SRC_PRIVATE_BASE_HI, SRC_PRIVATE_LIMIT_HI, SRC_POPS_EXITING_WAVE_ID,
- SRC_VCCZ, SRC_EXECZ, SRC_SCC)> {
+ SRC_VCCZ, SRC_EXECZ, SRC_SCC, SRC_FLAT_SCRATCH_BASE_LO, SRC_FLAT_SCRATCH_BASE_HI)> {
let AllocationPriority = 0;
}
@@ -776,7 +792,8 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
SRC_SHARED_LIMIT_LO_LO16, SRC_PRIVATE_BASE_LO_LO16, SRC_PRIVATE_LIMIT_LO_LO16,
SRC_SHARED_BASE_HI_LO16, SRC_SHARED_LIMIT_HI_LO16, SRC_PRIVATE_BASE_HI_LO16,
SRC_PRIVATE_LIMIT_HI_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16,
- SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16)> {
+ SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16,
+ SRC_FLAT_SCRATCH_BASE_LO_LO16, SRC_FLAT_SCRATCH_BASE_HI_LO16)> {
let Size = 16;
let isAllocatable = 0;
let BaseClassOrder = 16;
@@ -849,7 +866,8 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16],
def SReg_64_XEXEC_XNULL : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SRC_SHARED_BASE,
- SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> {
+ SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA,
+ SRC_FLAT_SCRATCH_BASE)> {
let CopyCost = 1;
let AllocationPriority = 1;
let HasSGPR = 1;
@@ -1302,6 +1320,7 @@ def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">;
def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">;
def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">;
def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">;
+def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">;
// True 16 Operands
def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 65fa088..00dcb9b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2654,6 +2654,8 @@ bool isInlineValue(unsigned Reg) {
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT_LO:
case AMDGPU::SRC_PRIVATE_LIMIT:
+ case AMDGPU::SRC_FLAT_SCRATCH_BASE_LO:
+ case AMDGPU::SRC_FLAT_SCRATCH_BASE_HI:
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
return true;
case AMDGPU::SRC_VCCZ:
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index f621f85..b128207 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -107,18 +107,6 @@ class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
VOP_DPP_Pseudo <OpName, P, pattern> {
}
-class getVOP1Pat <SDPatternOperator node, VOPProfile P> : LetDummies {
- list<dag> ret =
- !if(P.HasModifiers,
- [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods P.Src0VT:$src0, i32:$src0_modifiers))))],
- !if(P.HasOMod,
- [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
- i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))]
- )
- );
-}
-
multiclass VOP1Inst <string opName, VOPProfile P,
SDPatternOperator node = null_frag, int VOPDOp = -1> {
// We only want to set this on the basic, non-SDWA or DPP forms.
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 19ce7f5..f4b6af6 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1726,6 +1726,12 @@ multiclass VOP3CvtScaleSelInst<string OpName, VOPProfile P, SDPatternOperator no
}
}
+let HasExtVOP3DPP = 0, HasModifiers = 0 in {
+def VOP3_V2I32_I32_I32_V2I32 : VOP3_Profile<VOPProfile<[v2i32, i32, i32, v2i32]>>;
+def VOP3_V3I32_I32_I64_V2I32 : VOP3_Profile<VOPProfile<[v3i32, i32, i64, v2i32]>>;
+def VOP3_V4I32_I64_I64_V2I32 : VOP3_Profile<VOPProfile<[v4i32, i64, i64, v2i32]>>;
+}
+
let Src0RC64 = VSrc_NoInline_v2f16 in {
def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>;
def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>;
@@ -1771,6 +1777,12 @@ let SubtargetPredicate = isGFX1250Plus in {
defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>;
defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>;
defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>;
+ defm V_CVT_SCALE_PK16_F16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_fp6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_fp6>;
+ defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_fp6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_fp6>;
+ defm V_CVT_SCALE_PK16_F16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_bf6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_bf6>;
+ defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_bf6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_bf6>;
+ defm V_CVT_SCALE_PK16_F32_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_fp6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_fp6>;
+ defm V_CVT_SCALE_PK16_F32_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_bf6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_bf6>;
} // End Constraints = "@earlyclobber $vdst"
defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>;
@@ -1778,6 +1790,44 @@ let SubtargetPredicate = isGFX1250Plus in {
defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>;
} // End ReadsModeReg = 0
+ let Constraints = "@earlyclobber $vdst" in {
+ let WaveSizePredicate = isWave32 in {
+ defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_bf16>;
+ defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_bf16>;
+ defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f16>;
+ defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f16>;
+ defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f32>;
+ defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f32>;
+ defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f32>;
+ defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f16>;
+ defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_bf16>;
+ } // End WaveSizePredicate = isWave32
+ defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f32>;
+ defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f32>;
+ defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f16>;
+ defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f16>;
+ defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_bf16>;
+ defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_bf16>;
+
+ let WaveSizePredicate = isWave32 in {
+ defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16>;
+ defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16>;
+ defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_f16>;
+ defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_f16>;
+ defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_f32>;
+ defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_f32>;
+ defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f32>;
+ defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16>;
+ defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16>;
+ } // End WaveSizePredicate = isWave32
+ defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16>;
+ defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32>;
+ } // End Constraints = "@earlyclobber $vdst"
+
let True16Predicate = UseRealTrue16Insts in {
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>;
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>;
@@ -1788,6 +1838,12 @@ let SubtargetPredicate = isGFX1250Plus in {
}
} // End SubtargetPredicate = isGFX1250Plus
+let SubtargetPredicate = HasTensorCvtLutInsts in {
+ defm V_PERM_PK16_B4_U4 : VOP3Inst<"v_perm_pk16_b4_u4", VOP3_V2I32_I32_I32_V2I32, int_amdgcn_perm_pk16_b4_u4>;
+ defm V_PERM_PK16_B6_U4 : VOP3Inst<"v_perm_pk16_b6_u4", VOP3_V3I32_I32_I64_V2I32, int_amdgcn_perm_pk16_b6_u4>;
+ defm V_PERM_PK16_B8_U4 : VOP3Inst<"v_perm_pk16_b8_u4", VOP3_V4I32_I64_I64_V2I32, int_amdgcn_perm_pk16_b8_u4>;
+} // End SubtargetPredicate = HasTensorCvtLutInsts
+
class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat<
(DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)),
(inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in)
@@ -2186,6 +2242,9 @@ let AssemblerPredicate = isGFX11Plus in {
}
// These instructions differ from GFX12 variant by supporting DPP:
+defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>;
+defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>;
+defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>;
defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>;
defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>;
defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>;
@@ -2198,6 +2257,42 @@ defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2aa>;
defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ab>;
defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ac>;
defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ad>;
+defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x2b0>;
+defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b3>;
+defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b4>;
+defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b5>;
+defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2b8>;
+defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x2c3>;
+defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2c4>;
+defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x2c5>;
+defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c6>;
+defm V_CVT_SCALE_PK16_F16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c7>;
+defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c8>;
+defm V_CVT_SCALE_PK16_F32_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c9>;
+defm V_CVT_SCALE_PK16_F16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2ca>;
+defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cb>;
+defm V_CVT_SCALE_PK16_F32_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cc>;
+defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2cd>;
+defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2ce>;
+defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2cf>;
+defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d0>;
+defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d1>;
+defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d2>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2d3>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2d4>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2d5>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d6>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d7>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d8>;
+defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>;
+defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>;
+defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>;
+defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b9>;
+defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2bc>;
+defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2bf>;
+defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c0>;
+defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c1>;
+defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c2>;
defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>;
defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>;
defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 95fcd4a..ce280d4 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -557,11 +557,11 @@ multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
null_frag, 1>;
// Dot-iu instructions consider input as signed if imod neg bits are set. Thus
// Dot-iu Intrinsics have extra operands and require separate codegen pattern.
- def : GCNPat < (intrinsic_node (VOP3PModsNeg i32:$src0_mods), i32:$src0,
- (VOP3PModsNeg i32:$src1_mods), i32:$src1,
+ def : GCNPat < (intrinsic_node timm:$src0_mods, i32:$src0,
+ timm:$src1_mods, i32:$src1,
i32:$src2, (i1 timm:$clamp)),
- (!cast<Instruction>(NAME) $src0_mods, i32:$src0,
- $src1_mods, i32:$src1,
+ (!cast<Instruction>(NAME) (VOP3PModsNeg $src0_mods), i32:$src0,
+ (VOP3PModsNeg $src1_mods), i32:$src1,
(i32 8), i32:$src2, i1:$clamp)
>;
}
@@ -1302,11 +1302,11 @@ class WMMAOpSelPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
class WMMAUIClampPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
GCNPat < (P.DstVT (node
- (VOP3PModsNeg i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0),
- (VOP3PModsNeg i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1),
+ timm:$src0_modifiers, (P.Src0VT P.Src0VT:$src0),
+ timm:$src1_modifiers, (P.Src1VT P.Src1VT:$src1),
(P.Src2VT P.Src2VT:$src2), (i1 timm:$clamp)
)),
- (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp))
+ (P.DstVT (Inst (VOP3PModsNeg $src0_modifiers), P.Src0VT:$src0, (VOP3PModsNeg $src1_modifiers), P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp))
>;
class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> {
@@ -1407,9 +1407,9 @@ let WaveSizePredicate = isWave64 in {
}
class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
- bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0,
- bit _HasMatrixFMT = 0, bit _HasMatrixReuse = 0,
- bit _IsF4 = 0>
+ bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0,
+ bit _HasMatrixFMT = 0, bit _HasMatrixScale = 0,
+ bit _Scale16 = 0, bit _HasMatrixReuse = 0, bit _IsF4 = 0>
: VOP3P_Profile<VOPProfile<ArgTy>> {
bit IsIU = _IsIU;
bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B
@@ -1417,6 +1417,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
int IndexType = _IndexType;
let HasMatrixFMT = _HasMatrixFMT;
+ let HasMatrixScale = _HasMatrixScale;
+ bit Scale16 = _Scale16;
let HasMatrixReuse = _HasMatrixReuse;
bit HasIModOp = _Has_ImodOp;
@@ -1455,6 +1457,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
IsC_F16: "_f16",
IsC_BF16: "_bf16",
1: "_b32")));
+ ValueType ScaleTy = !if(Scale16, i64, i32);
// For f16 and bf16 matrices A and B, each element can be modified by
// fneg(neg_lo,neg_hi = 1). For f32 and f64, neg_lo[0:1] is allowed, but
@@ -1516,6 +1519,13 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit));
dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt),
(ins));
+ dag MatrixScaleSrc = !if(HasMatrixScale,
+ !if(Scale16, (ins VCSrc_b64:$scale_src0, VCSrc_b64:$scale_src1),
+ (ins VCSrc_b32:$scale_src0, VCSrc_b32:$scale_src1)),
+ (ins));
+ dag MatrixScale = !if(HasMatrixScale, (ins MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale,
+ MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt),
+ (ins));
dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins));
dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins));
dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi),
@@ -1529,7 +1539,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
(ins VRegSrc_64:$src2),
(ins VRegSrc_32:$src2)),
IndexKey)),
- MatrixFMT, MatrixReuse, Clamp, Neg);
+ MatrixScaleSrc, MatrixFMT, MatrixScale, MatrixReuse, Clamp, Neg);
// asm
@@ -1538,57 +1548,59 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 16) : "$index_key_16bit",
!eq(IndexType, 32) : "$index_key_32bit");
string MatrxFMTAsm = !if(HasMatrixFMT, "$matrix_a_fmt$matrix_b_fmt", "");
+ string MatrixScaleSrcAsm = !if(HasMatrixScale, ", $scale_src0, $scale_src1", "");
+ string MatrixScaleAsm = !if(HasMatrixScale, "$matrix_a_scale$matrix_b_scale$matrix_a_scale_fmt$matrix_b_scale_fmt", "");
string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", "");
string ClampAsm = !if(HasClamp, "$clamp", "");
string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi",
!and(NegLoAny, !not(NegHiAny)) : "$neg_lo",
!and(!not(NegLoAny), !not(NegHiAny)) : "");
- let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrxFMTAsm#MatrixReuseAsm#NegAsm#ClampAsm;
+ let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixScaleSrcAsm#MatrxFMTAsm#MatrixScaleAsm#MatrixReuseAsm#NegAsm#ClampAsm;
// isel patterns
bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp));
bit IsAB_F16_IMod0 = !and(IsAB_F16, !not(HasIModOp));
bit IsAB_F32F64_IMod1 = !and(!or(IsAB_F64, IsAB_F32), HasIModOp);
bit IsAB_F16BF16_IMod1 = !and(!or(IsAB_F16, IsAB_BF16), HasIModOp);
- dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
- IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src0_modifiers), Src0VT:$src0),
+ dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins timm:$src0_modifiers, Src0VT:$src0),
+ IsAB_F16BF16_IMod1 : (ins timm:$src0_modifiers, Src0VT:$src0),
IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
IsAB_BF16_IMod0 : (ins Src0VT:$src0),
- IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
+ IsIU : (ins timm:$src0_modifiers, Src0VT:$src0),
HasMatrixFMT : (ins timm:$matrix_a_fmt, Src0VT:$src0),
NoABMods : (ins Src0VT:$src0));
- dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
- IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
+ dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg $src0_modifiers), Src0VT:$src0),
+ IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs $src0_modifiers), Src0VT:$src0),
IsAB_F16_IMod0 : (ins i32:$src0_modifiers, Src0VT:$src0),
IsAB_BF16_IMod0 : (ins (i32 8), Src0VT:$src0),
- IsIU : (ins i32:$src0_modifiers, Src0VT:$src0),
+ IsIU : (ins (VOP3PModsNeg $src0_modifiers), Src0VT:$src0),
NoABMods : (ins Src0VT:$src0));
- dag Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
- IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src1_modifiers), Src1VT:$src1),
+ dag Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins timm:$src1_modifiers, Src1VT:$src1),
+ IsAB_F16BF16_IMod1 : (ins timm:$src1_modifiers, Src1VT:$src1),
IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
IsAB_BF16_IMod0 : (ins Src1VT:$src1),
- IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
+ IsIU : (ins timm:$src1_modifiers, Src1VT:$src1),
HasMatrixFMT : (ins timm:$matrix_b_fmt, Src1VT:$src1),
NoABMods : (ins Src1VT:$src1));
- dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
- IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
+ dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg $src1_modifiers), Src1VT:$src1),
+ IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs $src1_modifiers), Src1VT:$src1),
IsAB_F16_IMod0 : (ins i32:$src1_modifiers, Src1VT:$src1),
IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1),
- IsIU : (ins i32:$src1_modifiers, Src1VT:$src1),
+ IsIU : (ins (VOP3PModsNeg $src1_modifiers), Src1VT:$src1),
NoABMods : (ins Src1VT:$src1));
bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU), !not(IsXF32));
bit IsC_F32_IMod0 = !and(IsC_F32, !not(HasIModOp));
bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp));
bit IsC_BF16_IMod0 = !and(IsC_BF16, !not(HasIModOp));
bit IsIUXF32 = !or(IsIU, IsXF32);
- dag Src2InPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs i32:$src2_modifiers), Src2VT:$src2),
+ dag Src2InPatWmma = !cond(IsC_IMod1 : (ins timm:$src2_modifiers, Src2VT:$src2),
IsC_F32_IMod0 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))),
IsC_F16_IMod0 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))),
IsC_BF16_IMod0 : (ins Src2VT:$src2),
IsIUXF32 : (ins Src2VT:$src2),
IsSWMMAC : (ins));
- dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins i32:$src2_modifiers, Src2VT:$src2),
+ dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs $src2_modifiers), Src2VT:$src2),
IsC_F32_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2),
IsC_F16_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2),
IsC_BF16_IMod0 : (ins (i32 8), Src2VT:$src2),
@@ -1604,22 +1616,29 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit),
!eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit));
dag MatrixFMTOutPat = !if(HasMatrixFMT, (ins i32:$matrix_a_fmt, i32:$matrix_b_fmt), (ins));
- dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2))));
- dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2));
+ dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins timm:$src2_modifiers), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2))));
+ dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins (VOP3PModsNegAbs $src2_modifiers)), (ins (i32 8)))), (ins Src2VT:$src2));
+ dag MatrixScaleInPat = !if(HasMatrixScale, (ins timm:$matrix_a_scale, timm:$matrix_a_scale_fmt, ScaleTy:$scale_src0,
+ timm:$matrix_b_scale, timm:$matrix_b_scale_fmt, ScaleTy:$scale_src1),
+ (ins));
dag MatrixReuseInPat = !if(HasMatrixReuse, (ins timm:$matrix_a_reuse, timm:$matrix_b_reuse), (ins));
+ dag MatrixScaleOutSrcPat = !if(HasMatrixScale, (ins ScaleTy:$scale_src0, ScaleTy:$scale_src1), (ins));
+ dag MatrixScaleOutModPat = !if(HasMatrixScale, (ins i32:$matrix_a_scale, i32:$matrix_b_scale, i32:$matrix_a_scale_fmt, i32:$matrix_b_scale_fmt), (ins));
dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins));
- dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat);
- dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat);
+ dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixScaleInPat, MatrixReuseInPat, ClampPat);
+ dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixScaleOutSrcPat, MatrixFMTOutPat,
+ MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat);
dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat);
dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat);
// A wmma pattern where src2 is an inline imm uses the _threeaddr pseudo; it
// can't use _twoaddr since that would violate the src2-tied-to-vdst
// constraint.
- dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat);
- dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat);
+ dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixScaleInPat, MatrixReuseInPat, ClampPat);
+ dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixScaleOutSrcPat,
+ MatrixFMTOutPat, MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat);
}
def WMMAInstInfoTable : GenericTable {
@@ -1645,11 +1664,15 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in
def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
let PseudoInstr = Instr#PseudoInstrSuffix;
+ let FixedSize = WMMAProfile.HasMatrixScale;
+ let Size = !if(WMMAProfile.HasMatrixScale, 16, 8);
}
let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in
def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
let PseudoInstr = Instr#PseudoInstrSuffix;
+ let FixedSize = WMMAProfile.HasMatrixScale;
+ let Size = !if(WMMAProfile.HasMatrixScale, 16, 8);
}
}
@@ -1728,39 +1751,55 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1,
// *** IU4X32_SWMMAC_w64: for matrix A, lanes 0-31 hold 8xi4 and the
// remaining lanes are ignored; the index is i16. Matrix B uses all lanes.
-def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 1>;
-def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
-def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
-def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 1>;
-def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 1>;
-def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
-def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 1>;
-def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 1>;
-def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 1>;
-def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 1>;
-def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 1>;
-def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 1>;
-def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 1>;
-def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 1>;
-def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 1>;
-def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 1>;
-def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 1>;
-def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 1>;
-def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 1>;
-
-multiclass WMMA_F8F6F4_Profiles<bit HasMatrixReuse> {
- def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
- def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
- def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
- def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
- def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
- def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
- def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
- def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
- def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
-}
-
-defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0>;
+def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>;
+def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>;
+def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>;
+def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 0, 0, 1>;
+def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 0, 0, 1>;
+def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>;
+def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
+def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
+def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
+def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
+def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>;
+def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>;
+def F32_32X16X128_F4_SCALE_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 0, 1>;
+def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 1, 1>;
+def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
+def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
+def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
+def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
+def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 0, 0, 1>;
+def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 0, 0, 1>;
+def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 0, 0, 1>;
+
+multiclass WMMA_F8F6F4_Profiles<bit HasMatrixScale, bit Scale16, bit HasMatrixReuse> {
+ def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
+ def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
+ def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
+ def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
+ def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
+ def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
+ def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
+ def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
+ def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>;
+}
+
+defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0, 0, 0>;
+defm F32_16X16X128_F8F6F4_SCALE : WMMA_F8F6F4_Profiles<1, 0, 1>;
+defm F32_16X16X128_F8F6F4_SCALE16 : WMMA_F8F6F4_Profiles<1, 1, 1>;
+
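Since the positional flag lists above are long, the parameter order from the VOP3PWMMA_Profile declaration earlier in this file is repeated here as a reading aid (annotation only, not part of the patch):

// VOP3PWMMA_Profile<ArgTy, IsSWMMAC, IndexType, IsIU, IsFP8BF8XF32,
//                   Has_ImodOp, HasMatrixFMT, HasMatrixScale, Scale16,
//                   HasMatrixReuse, IsF4>
// e.g. F32_32X16X128_F4_SCALE16_w32 above sets IsFP8BF8XF32, Has_ImodOp,
// HasMatrixScale, Scale16 and HasMatrixReuse.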
+class VOP_WMMA_LD_SCALE<ValueType vt, RegisterOperand RC> : VOP3P_Profile<VOPProfile<[untyped, vt, vt, untyped]>> {
+ let HasMatrixScale = 1;
+ let HasMatrixReuse = 1;
+ let HasNeg = 0;
+ let Src0RC64 = RC;
+ let Src1RC64 = RC;
+ let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale,
+ MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt,
+ MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse);
+ let AsmVOP3P = " $src0, $src1$matrix_a_scale$matrix_b_scale$matrix_a_scale_fmt$matrix_b_scale_fmt$matrix_a_reuse$matrix_b_reuse";
+}
multiclass WMMAInst_SrcFormats_mc<string OpName, string Profile> {
foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
@@ -1813,9 +1852,15 @@ defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64
defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">;
defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">;
+defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE">;
+defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale16_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE16">;
+defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16x128_f4", F32_32X16X128_F4_SCALE_w32, "_w32">;
+defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">;
} // End is_wmma_xdl = 1.
+defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32>>;
+defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64>>;
} // End SubtargetPredicate = isGFX125xOnly
} // End WaveSizePredicate = isWave32
@@ -1970,9 +2015,13 @@ let SubtargetPredicate = isGFX125xOnly in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_SCALE_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale_f32_32x16x128_f4, F32_32X16X128_F4_SCALE_w32>;
+ defm : WMMAPat<"V_WMMA_SCALE16_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale16_f32_32x16x128_f4, F32_32X16X128_F4_SCALE16_w32>;
foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>;
+ defm : WMMAPat<"V_WMMA_SCALE_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_SCALE_" # I # "_w32")>;
+ defm : WMMAPat<"V_WMMA_SCALE16_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_SCALE16_" # I # "_w32")>;
}
def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
@@ -2105,6 +2154,82 @@ multiclass VOP3P_Real_WMMA_gfx1250_SrcFormats<bits<8> op, string WMMAP> {
}
}
+class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VOP3Pe_Base {
+ bits<9> scale_src0;
+ bits<9> scale_src1;
+
+ // Inst{7-0} = unused
+ let Inst{10-8} = {0, matrix_b_scale_fmt{1-0}}; // neg_hi
+ let Inst{11} = matrix_a_scale{0}; // scale_op_sel(0)
+ let Inst{12} = 0; // scale_op_sel(1)
+ let Inst{13} = matrix_a_reuse; // scale_op_sel(2)
+ let Inst{14} = matrix_b_reuse; // scale_op_sel_hi(2)
+ let Inst{15} = 0; // scale_clamp
+ let Inst{31-24} = 0xcc; // Encoding
+ let Inst{23-16} = LdScaleOp;
+ let Inst{40-32} = scale_src0;
+ let Inst{49-41} = scale_src1;
+ let Inst{58-50} = 0; // scale src2
+ let Inst{59} = matrix_b_scale{0}; // scale_op_sel_hi(0)
+ let Inst{60} = 0; // scale_op_sel_hi(1)
+ let Inst{63-61} = {0, matrix_a_scale_fmt{1-0}}; // neg (lo)
+
+ // The high half of the encoding is the unscaled wmma op.
+ let Inst{71-64} = vdst;
+
+ let Inst{72} = !if(P.NegHi01, src0_modifiers{1}, 0); // neg_hi src0
+ let Inst{73} = !if(P.NegHi01, src1_modifiers{1}, 0); // neg_hi src1
+ let Inst{74} = !if(P.NegHi2, src2_modifiers{1}, 0); // neg_hi src2
+
+ let Inst{77-75} = !if(P.HasMatrixFMT, matrix_a_fmt{2-0}, 0); // op_sel
+
+ let Inst{78,124,123} = !if(P.HasMatrixFMT, matrix_b_fmt{2-0}, 7); // op_sel_hi
+ let Inst{79} = !if(P.HasClamp, clamp{0}, 0);
+
+ let Inst{87-80} = op;
+ let Inst{95-88} = 0xcc; // Encoding
+ let Inst{104-96} = !if(P.HasSrc0, src0, 0);
+ let Inst{113-105} = !if(P.HasSrc1, src1, 0);
+ let Inst{122-114} = !if(P.HasSrc2, src2, 0);
+
+ // neg_lo
+ let Inst{125} = !if(P.NegLo01, src0_modifiers{0}, 0);
+ let Inst{126} = !if(P.NegLo01, src1_modifiers{0}, 0);
+ let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0);
+}
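For readers decoding the field comments above, the sketch below packs the low 64 bits of the VOP3PX2 dual encoding exactly as the let-assignments specify. It is an illustration written for this review, with an invented helper name; unlisted bits stay zero.

#include <cstdint>

uint64_t packVOP3PX2Low(uint64_t LdScaleOp, uint64_t ScaleSrc0,
                        uint64_t ScaleSrc1, bool AScale, bool BScale,
                        uint64_t AScaleFmt, uint64_t BScaleFmt,
                        bool AReuse, bool BReuse) {
  uint64_t Inst = 0;
  Inst |= (BScaleFmt & 0x3) << 8;   // Inst{9-8}: reuses the neg_hi bits
  Inst |= uint64_t(AScale) << 11;   // scale_op_sel(0)
  Inst |= uint64_t(AReuse) << 13;   // scale_op_sel(2)
  Inst |= uint64_t(BReuse) << 14;   // scale_op_sel_hi(2)
  Inst |= (LdScaleOp & 0xff) << 16; // the paired v_wmma_ld_scale opcode
  Inst |= uint64_t(0xcc) << 24;     // VOP3P encoding
  Inst |= (ScaleSrc0 & 0x1ff) << 32;
  Inst |= (ScaleSrc1 & 0x1ff) << 41;
  Inst |= uint64_t(BScale) << 59;   // scale_op_sel_hi(0)
  Inst |= (AScaleFmt & 0x3) << 61;  // Inst{63-61}: reuses the neg_lo bits
  return Inst;
}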
+
+multiclass VOP3PX2_Real_ScaledWMMA_F4<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
+ defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
+ let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32,
+ DecoderNamespace = "GFX1250" in {
+ def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, PS.Mnemonic>,
+ VOP3PX2e <op, LdScaleOp, WMMAP>;
+ }
+}
+
+multiclass VOP3PX2_Real_ScaledWMMA<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
+ defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
+ defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
+ defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
+ let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32,
+ DecoderNamespace = "GFX1250" in {
+ def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, asmName>,
+ VOP3PX2e <op, LdScaleOp, WMMAP>,
+ MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_gfx1250"> {
+ let AsmString = asmName # PS.AsmOperands;
+ }
+ }
+}
+
+multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<bits<8> op, bits<8> LdScaleOp, string WMMAP> {
+ defm _f8_f8_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>;
+ foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
+ let isAsmParserOnly = true in { // Disable ambiguous disassembly.
+ defm _#I#_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>;
+ }
+ }
+}
+
defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
@@ -2180,6 +2305,11 @@ defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8B
defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>;
defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x033, "F32_16X16X128_F8F6F4">;
+defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">;
+defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">;
+
+defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x35, F32_32X16X128_F4_SCALE_w32>;
+defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>;
defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
@@ -2283,6 +2413,9 @@ defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>;
defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>;
defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>;
+defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>;
+defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>;
+
let AssemblerPredicate = isGFX1250Plus in
def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f027ab0..3cad5a1 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -475,17 +475,24 @@ class VOP3Pe_Base {
bits<1> index_key_32bit;
bits<3> matrix_a_fmt;
bits<3> matrix_b_fmt;
+ bits<1> matrix_a_scale;
+ bits<1> matrix_b_scale;
+ bits<2> matrix_a_scale_fmt;
+ bits<2> matrix_b_scale_fmt;
bits<1> matrix_a_reuse;
bits<1> matrix_b_reuse;
}
class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base {
let Inst{7-0} = !if(P.HasDst, vdst, 0);
- let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
- let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
+ let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1},
+ !if(P.HasMatrixScale, matrix_b_scale_fmt{0}, 0)); // neg_hi src0
+ let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1},
+ !if(P.HasMatrixScale, matrix_b_scale_fmt{1}, 0)); // neg_hi src1
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
- let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
+ let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2},
+ !if(P.HasMatrixScale, matrix_a_scale{0}, 0)); // op_sel(0)
let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2},
!if(P.HasMatrixReuse, matrix_a_reuse, 0)); // op_sel(2)
@@ -500,10 +507,17 @@ class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base {
let Inst{40-32} = !if(P.HasSrc0, src0, 0);
let Inst{49-41} = !if(P.HasSrc1, src1, 0);
let Inst{58-50} = !if(P.HasSrc2, src2, 0);
- let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(0)
- let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(1)
- let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
- let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
+ let Inst{59} = !cond(!and(P.HasSrc0, P.HasOpSel) : src0_modifiers{3},
+ P.IsDOT : 1,
+ P.HasMatrixScale : matrix_b_scale{0},
+ 1: ?); // op_sel_hi(0)
+ let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3},
+ !if(P.HasMatrixScale, 0,
+ !if(P.IsDOT, 1, ?))); // op_sel_hi(1)
+ let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0},
+ !if(P.HasMatrixScale, matrix_a_scale_fmt{0}, 0)); // neg (lo)
+ let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0},
+ !if(P.HasMatrixScale, matrix_a_scale_fmt{1}, 0)); // neg (lo)
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
}
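Summarizing the VOP3Pe changes above: when a profile sets HasMatrixScale, otherwise-unused modifier bits are repurposed to carry the scale metadata (reading aid only, taken from the assignments in this hunk):

//   Inst{8,9}   neg_hi src0/src1   -> matrix_b_scale_fmt{0,1}
//   Inst{11}    op_sel(0)          -> matrix_a_scale
//   Inst{59}    op_sel_hi(0)       -> matrix_b_scale
//   Inst{61,62} neg (lo) src0/src1 -> matrix_a_scale_fmt{0,1}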
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 9366256..74c7c97 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -669,13 +669,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
// Integer division functions
// RTABI chapter 4.3.1
- { RTLIB::SDIV_I8, RTLIB::__aeabi_idiv__i8 },
- { RTLIB::SDIV_I16, RTLIB::__aeabi_idiv__i16 },
- { RTLIB::SDIV_I32, RTLIB::__aeabi_idiv__i32},
+ { RTLIB::SDIV_I32, RTLIB::__aeabi_idiv },
{ RTLIB::SDIV_I64, RTLIB::__aeabi_ldivmod },
- { RTLIB::UDIV_I8, RTLIB::__aeabi_uidiv__i8 },
- { RTLIB::UDIV_I16, RTLIB::__aeabi_uidiv__i16 },
- { RTLIB::UDIV_I32, RTLIB::__aeabi_uidiv__i32 },
+ { RTLIB::UDIV_I32, RTLIB::__aeabi_uidiv },
{ RTLIB::UDIV_I64, RTLIB::__aeabi_uldivmod },
};
// clang-format on
@@ -741,7 +737,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
const RTLIB::LibcallImpl Impl;
} LibraryCalls[] = {
{RTLIB::FPROUND_F32_F16, RTLIB::__aeabi_f2h},
- {RTLIB::FPROUND_F64_F16, RTLIB::__aeabi_d2h},
{RTLIB::FPEXT_F16_F32, RTLIB::__aeabi_h2f},
};
@@ -21363,7 +21358,9 @@ bool ARMTargetLowering::useLoadStackGuardNode(const Module &M) const {
}
void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
- if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+ RTLIB::LibcallImpl SecurityCheckCookieLibcall =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+ if (SecurityCheckCookieLibcall == RTLIB::Unsupported)
return TargetLowering::insertSSPDeclarations(M);
// MSVC CRT has a global variable holding security cookie.
@@ -21372,23 +21369,32 @@ void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT has a function to validate security cookie.
FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
- "__security_check_cookie", Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(M.getContext()));
+ getLibcallImplName(SecurityCheckCookieLibcall),
+ Type::getVoidTy(M.getContext()), PointerType::getUnqual(M.getContext()));
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
F->addParamAttr(0, Attribute::AttrKind::InReg);
}
Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
- // MSVC CRT has a global variable holding security cookie.
- if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+ RTLIB::LibcallImpl SecurityCheckCookieLibcall =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported) {
+ // MSVC CRT has a global variable holding security cookie.
+ //
+ // FIXME: We have a libcall entry for the correlated check function, but not
+ // the global name.
return M.getGlobalVariable("__security_cookie");
+ }
+
return TargetLowering::getSDagStackGuard(M);
}
Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
- if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
- return M.getFunction("__security_check_cookie");
+ RTLIB::LibcallImpl SecurityCheckCookie =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+ if (SecurityCheckCookie != RTLIB::Unsupported)
+ return M.getFunction(getLibcallImplName(SecurityCheckCookie));
return TargetLowering::getSSPStackGuardCheck(M);
}
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index e8d0d35..fedf9e2 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -121,10 +121,10 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return std::make_unique<ARMElfTargetObjectFile>();
}
-static std::string computeDataLayout(const Triple &TT, StringRef CPU,
+static std::string computeDataLayout(const Triple &TT,
const TargetOptions &Options,
bool isLittle) {
- auto ABI = ARM::computeTargetABI(TT, CPU, Options.MCOptions.ABIName);
+ auto ABI = ARM::computeTargetABI(TT, Options.MCOptions.ABIName);
std::string Ret;
if (isLittle)
@@ -202,11 +202,10 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
std::optional<Reloc::Model> RM,
std::optional<CodeModel::Model> CM,
CodeGenOptLevel OL, bool isLittle)
- : CodeGenTargetMachineImpl(T, computeDataLayout(TT, CPU, Options, isLittle),
- TT, CPU, FS, Options,
- getEffectiveRelocModel(TT, RM),
+ : CodeGenTargetMachineImpl(T, computeDataLayout(TT, Options, isLittle), TT,
+ CPU, FS, Options, getEffectiveRelocModel(TT, RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
- TargetABI(ARM::computeTargetABI(TT, CPU, Options.MCOptions.ABIName)),
+ TargetABI(ARM::computeTargetABI(TT, Options.MCOptions.ABIName)),
TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) {
// Default to triple-appropriate float ABI
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index dfa3de3c..cc1c79b 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -296,9 +296,9 @@ static bool needsInterworking(const MCAssembler &Asm, const MCSymbol *Sym,
unsigned FixupKind) {
// Create relocations for unconditional branches to function symbols with a
// different execution mode in ELF binaries.
- if (!Sym || !Sym->isELF())
+ if (!Sym || !Asm.getContext().isELF())
return false;
- unsigned Type = cast<MCSymbolELF>(Sym)->getType();
+ unsigned Type = static_cast<const MCSymbolELF *>(Sym)->getType();
if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)) {
if (Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_uncondbranch))
return true;
@@ -1108,9 +1108,8 @@ std::optional<bool> ARMAsmBackend::evaluateFixup(const MCFragment &F,
}
void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (IsResolved && shouldForceRelocation(Fixup, Target))
IsResolved = false;
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
@@ -1124,14 +1123,15 @@ void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
return; // Doesn't change encoding.
const unsigned NumBytes = getFixupKindNumBytes(Kind);
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// Used to point to big endian bytes.
unsigned FullSizeBytes;
if (Endian == llvm::endianness::big) {
FullSizeBytes = getFixupKindContainerSizeBytes(Kind);
- assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!");
+ assert(Fixup.getOffset() + FullSizeBytes <= F.getSize() &&
+ "Invalid fixup size!");
assert(NumBytes <= FullSizeBytes && "Invalid fixup size!");
}
@@ -1141,7 +1141,7 @@ void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx =
Endian == llvm::endianness::little ? i : (FullSizeBytes - 1 - i);
- Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
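This applyFixup signature change repeats across the backends in this patch: the hook now receives a raw uint8_t * that already points at the fixup's offset inside the fragment, so the per-byte write loops index from zero instead of Fixup.getOffset(). A minimal sketch of the new contract for the common little-endian case (illustrative helper, not from the patch):

#include <cstdint>

// Data points at the fragment contents plus Fixup.getOffset() under the new
// contract, so byte i of Value lands at Data[i].
void orInLittleEndian(uint8_t *Data, uint64_t Value, unsigned NumBytes) {
  for (unsigned i = 0; i != NumBytes; ++i)
    Data[i] |= uint8_t((Value >> (i * 8)) & 0xff);
}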
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 07d2cf7..2844232 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -40,8 +40,7 @@ public:
std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &,
uint64_t &) override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
unsigned getRelaxedOpcode(unsigned Op, const MCSubtargetInfo &STI) const;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 50e9ca1..d914f6e 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -97,8 +97,8 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
case ARM::S_TLSLDM_FDPIC:
case ARM::S_TLSLDO:
case ARM::S_TPOFF:
- if (auto *SA = Target.getAddSym())
- cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
+ if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
break;
default:
break;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 6dfe846..0796746 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -614,7 +614,7 @@ public:
if (!IsThumb)
return Val;
- unsigned Type = cast<MCSymbolELF>(Symbol)->getType();
+ unsigned Type = static_cast<MCSymbolELF *>(Symbol)->getType();
if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC) &&
Symbol->isDefined())
getAssembler().setIsThumbFunc(Symbol);
@@ -679,7 +679,8 @@ private:
}
void EmitMappingSymbol(StringRef Name) {
- auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
+ auto *Symbol =
+ static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name));
emitLabel(Symbol);
Symbol->setType(ELF::STT_NOTYPE);
@@ -687,7 +688,8 @@ private:
}
void emitMappingSymbol(StringRef Name, MCFragment &F, uint64_t Offset) {
- auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
+ auto *Symbol =
+ static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name));
emitLabelAtPos(Symbol, SMLoc(), F, Offset);
Symbol->setType(ELF::STT_NOTYPE);
Symbol->setBinding(ELF::STB_LOCAL);
@@ -1088,7 +1090,7 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
return;
Streamer.getAssembler().registerSymbol(*Symbol);
- unsigned Type = cast<MCSymbolELF>(Symbol)->getType();
+ unsigned Type = static_cast<MCSymbolELF *>(Symbol)->getType();
if (Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)
emitThumbFunc(Symbol);
}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 354de8f..8ee3a2d 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -505,7 +505,7 @@ public:
// Remember that the function is a thumb function. Fixup and relocation
// values will need to be adjusted.
getStreamer().getAssembler().setIsThumbFunc(Symbol);
- cast<MCSymbolMachO>(Symbol)->setThumbFunc();
+ static_cast<MCSymbolMachO *>(Symbol)->setThumbFunc();
}
};
} // namespace
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index 38444f9..05a7d03 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -368,9 +368,8 @@ AVRAsmBackend::createObjectTargetWriter() const {
}
void AVRAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
// AVR sets the fixup value to bypass the assembly time overflow with a
// relocation.
if (IsResolved) {
@@ -397,14 +396,14 @@ void AVRAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
for (unsigned i = 0; i < NumBytes; ++i) {
uint8_t mask = (((Value >> (i * 8)) & 0xff));
- Data[Offset + i] |= mask;
+ Data[i] |= mask;
}
}
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
index 68c839e..9633669 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -38,8 +38,7 @@ public:
createObjectTargetWriter() const override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index dda8753..53933f9 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -27,8 +27,7 @@ public:
~BPFAsmBackend() override = default;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override;
@@ -66,35 +65,32 @@ bool BPFAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
}
void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
if (Fixup.getKind() == FK_SecRel_8) {
// The Value is 0 for global variables, and the in-section offset
// for static variables. Write to the immediate field of the inst.
assert(Value <= UINT32_MAX);
- support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4],
- static_cast<uint32_t>(Value),
+ support::endian::write<uint32_t>(Data + 4, static_cast<uint32_t>(Value),
Endian);
} else if (Fixup.getKind() == FK_Data_4 && !Fixup.isPCRel()) {
- support::endian::write<uint32_t>(&Data[Fixup.getOffset()], Value, Endian);
+ support::endian::write<uint32_t>(Data, Value, Endian);
} else if (Fixup.getKind() == FK_Data_8) {
- support::endian::write<uint64_t>(&Data[Fixup.getOffset()], Value, Endian);
+ support::endian::write<uint64_t>(Data, Value, Endian);
} else if (Fixup.getKind() == FK_Data_4 && Fixup.isPCRel()) {
Value = (uint32_t)((Value - 8) / 8);
if (Endian == llvm::endianness::little) {
- Data[Fixup.getOffset() + 1] = 0x10;
- support::endian::write32le(&Data[Fixup.getOffset() + 4], Value);
+ Data[1] = 0x10;
+ support::endian::write32le(Data + 4, Value);
} else {
- Data[Fixup.getOffset() + 1] = 0x1;
- support::endian::write32be(&Data[Fixup.getOffset() + 4], Value);
+ Data[1] = 0x1;
+ support::endian::write32be(Data + 4, Value);
}
} else if (Fixup.getKind() == BPF::FK_BPF_PCRel_4) {
// The input Value represents the number of bytes.
Value = (uint32_t)((Value - 8) / 8);
- support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4], Value,
- Endian);
+ support::endian::write<uint32_t>(Data + 4, Value, Endian);
} else {
assert(Fixup.getKind() == FK_Data_2 && Fixup.isPCRel());
@@ -103,8 +99,7 @@ void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
report_fatal_error("Branch target out of insn range");
Value = (uint16_t)((Value - 8) / 8);
- support::endian::write<uint16_t>(&Data[Fixup.getOffset() + 2], Value,
- Endian);
+ support::endian::write<uint16_t>(Data + 2, Value, Endian);
}
}
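Both PC-relative cases above scale a byte distance into BPF's instruction-count encoding: every BPF instruction is 8 bytes and the offset is measured from the instruction following the branch, hence (Value - 8) / 8. A worked check, written for this review:

// A branch whose target is 24 bytes past the branch instruction encodes as
// (24 - 8) / 8 == 2, i.e. "skip the next two 64-bit instructions".
static_assert((24 - 8) / 8 == 2, "BPF offsets count instructions, not bytes");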
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
index 1bd82fad..6964998 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
@@ -197,9 +197,8 @@ std::optional<bool> CSKYAsmBackend::evaluateFixup(const MCFragment &F,
}
void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (IsResolved && shouldForceRelocation(Fixup, Target))
IsResolved = false;
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
@@ -217,10 +216,10 @@ void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
- assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
@@ -228,14 +227,14 @@ void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
bool IsInstFixup = (Kind >= FirstTargetFixupKind);
if (IsLittleEndian && IsInstFixup && (NumBytes == 4)) {
- Data[Offset + 0] |= uint8_t((Value >> 16) & 0xff);
- Data[Offset + 1] |= uint8_t((Value >> 24) & 0xff);
- Data[Offset + 2] |= uint8_t(Value & 0xff);
- Data[Offset + 3] |= uint8_t((Value >> 8) & 0xff);
+ Data[0] |= uint8_t((Value >> 16) & 0xff);
+ Data[1] |= uint8_t((Value >> 24) & 0xff);
+ Data[2] |= uint8_t(Value & 0xff);
+ Data[3] |= uint8_t((Value >> 8) & 0xff);
} else {
for (unsigned I = 0; I != NumBytes; I++) {
unsigned Idx = IsLittleEndian ? I : (NumBytes - 1 - I);
- Data[Offset + Idx] |= uint8_t((Value >> (I * 8)) & 0xff);
+ Data[Idx] |= uint8_t((Value >> (I * 8)) & 0xff);
}
}
}
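The four hard-coded shifts in the little-endian instruction case encode CSKY's mixed layout: a 32-bit instruction is stored as two little-endian 16-bit halves, most-significant half first. A worked example, written for this review:

#include <cstdint>

// Value = 0xAABBCCDD lands in memory as BB AA DD CC.
void demoCSKYLayout(uint8_t Data[4]) {
  uint32_t Value = 0xAABBCCDD;
  Data[0] |= (Value >> 16) & 0xff; // 0xBB
  Data[1] |= (Value >> 24) & 0xff; // 0xAA
  Data[2] |= Value & 0xff;         // 0xDD
  Data[3] |= (Value >> 8) & 0xff;  // 0xCC
}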
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
index 1c8516f..5d8826a 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
@@ -25,8 +25,7 @@ public:
std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &,
uint64_t &) override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
index d042d26..4667975f 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
@@ -48,8 +48,8 @@ unsigned CSKYELFObjectWriter::getRelocType(const MCFixup &Fixup,
case CSKY::S_TLSGD:
case CSKY::S_TLSLDM:
case CSKY::S_TLSLDO:
- if (auto *SA = Target.getAddSym())
- cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
+ if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
break;
default:
break;
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
index 346b123..397cf16 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
@@ -169,7 +169,8 @@ void CSKYELFStreamer::EmitMappingSymbol(StringRef Name) {
State = (Name == "$t" ? EMS_Text : EMS_Data);
- auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
+ auto *Symbol =
+ static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name));
emitLabel(Symbol);
Symbol->setType(ELF::STT_NOTYPE);
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index b6e8ce7..26a113d 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -103,7 +103,7 @@ GlobalVariable *DXContainerGlobals::computeShaderHash(Module &M) {
dxbc::ShaderHash HashData = {0, {0}};
// The Hash's IncludesSource flag gets set whenever the hashed shader includes
// debug information.
- if (M.debug_compile_units_begin() != M.debug_compile_units_end())
+ if (!M.debug_compile_units().empty())
HashData.Flags = static_cast<uint32_t>(dxbc::HashFlags::IncludesSource);
memcpy(reinterpret_cast<void *>(&HashData.Digest), Result.data(), 16);
diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
index 5323be6..9a14c01 100644
--- a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
+++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
@@ -78,8 +78,7 @@ public:
~DXILAsmBackend() override = default;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override {}
+ uint8_t *Data, uint64_t Value, bool IsResolved) override {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 102f1c6..14b6bb3 100644
--- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -330,7 +330,7 @@ bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) {
if (!GepI->getType()->isPointerTy())
return false;
// No GEPs without any indices. (Is this possible?)
- if (GepI->idx_begin() == GepI->idx_end())
+ if (GepI->indices().empty())
return false;
return true;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 52fa678..613048b 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -1987,7 +1987,7 @@ SmallVector<uint32_t, 8> HvxSelector::getPerfectCompletions(ShuffleMask SM,
// times). In such cases it will be impossible to complete this to a
// perfect shuffle.
SmallVector<uint32_t, 8> Sorted(Worklist);
- llvm::sort(Sorted.begin(), Sorted.end());
+ llvm::sort(Sorted);
for (unsigned I = 0, E = Sorted.size(); I != E;) {
unsigned P = Sorted[I], Count = 1;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index d5b7a75..1a0f1ab 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -402,8 +402,7 @@ public:
}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &,
- MutableArrayRef<char> Data, uint64_t FixupValue,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t FixupValue, bool IsResolved) override;
bool isInstRelaxable(MCInst const &HMI) const {
const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(*MCII, HMI);
@@ -649,8 +648,7 @@ public:
} // namespace
void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data,
+ const MCValue &Target, uint8_t *InstAddr,
uint64_t FixupValue, bool IsResolved) {
if (IsResolved && shouldForceRelocation(Fixup))
IsResolved = false;
@@ -667,10 +665,9 @@ void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// LLVM gives us an encoded value, we have to convert it back
// to a real offset before we can use it.
- uint32_t Offset = Fixup.getOffset();
unsigned NumBytes = getFixupKindNumBytes(Kind);
- assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!");
- char *InstAddr = Data.data() + Offset;
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
Value = adjustFixupValue(Kind, FixupValue);
if (!Value)
@@ -757,8 +754,8 @@ void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
uint32_t OldData = 0; for (unsigned i = 0; i < NumBytes; i++) OldData |=
(InstAddr[i] << (i * 8)) & (0xff << (i * 8));
dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) << ": AValue=0x";
- dbgs().write_hex(FixupValue)
- << ": Offset=" << Offset << ": Size=" << Data.size() << ": OInst=0x";
+ dbgs().write_hex(FixupValue) << ": Offset=" << Fixup.getOffset()
+ << ": Size=" << F.getSize() << ": OInst=0x";
dbgs().write_hex(OldData) << ": Reloc=0x"; dbgs().write_hex(Reloc););
// For each byte of the fragment that the fixup touches, mask in the
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
index 9752f3a..af97ea2 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
@@ -50,8 +50,8 @@ unsigned HexagonELFObjectWriter::getRelocType(const MCFixup &Fixup,
case HexagonMCExpr::VK_IE:
case HexagonMCExpr::VK_IE_GOT:
case HexagonMCExpr::VK_TPREL:
- if (auto *SA = Target.getAddSym())
- cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
+ if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
break;
default:
break;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index 13ecc23..039ef4f 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -96,7 +96,7 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
getAssembler().registerSymbol(*Symbol);
StringRef sbss[4] = {".sbss.1", ".sbss.2", ".sbss.4", ".sbss.8"};
- auto ELFSymbol = cast<MCSymbolELF>(Symbol);
+ auto ELFSymbol = static_cast<MCSymbolELF *>(Symbol);
if (!ELFSymbol->isBindingSet())
ELFSymbol->setBinding(ELF::STB_GLOBAL);
@@ -143,7 +143,7 @@ void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol,
Align ByteAlignment,
unsigned AccessSize) {
getAssembler().registerSymbol(*Symbol);
- auto ELFSymbol = cast<MCSymbolELF>(Symbol);
+ auto ELFSymbol = static_cast<const MCSymbolELF *>(Symbol);
ELFSymbol->setBinding(ELF::STB_LOCAL);
ELFSymbol->setExternal(false);
HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment, AccessSize);
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
index 83d1697..3112dea 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -48,8 +48,7 @@ public:
: MCAsmBackend(llvm::endianness::big), OSType(OST) {}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override;
@@ -72,9 +71,8 @@ bool LanaiAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
}
void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (!IsResolved)
Asm->getWriter().recordRelocation(F, Fixup, Target, Value);
@@ -85,7 +83,6 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Where in the object the fixup applies and the number of bytes that need
// fixing up.
- unsigned Offset = Fixup.getOffset();
unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
unsigned FullSize = 4;
@@ -95,8 +92,7 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Load instruction and apply value
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = (FullSize - 1 - i);
- CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Offset + Idx]))
- << (i * 8);
+ CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Idx])) << (i * 8);
}
uint64_t Mask =
@@ -106,7 +102,7 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Write out the fixed up bytes back to the code/data bits.
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = (FullSize - 1 - i);
- Data[Offset + Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff);
+ Data[Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff);
}
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 5096a8f..d8bb16f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1651,20 +1651,19 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
(XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
(XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;
-def : Pat<(vector_insert v8f32:$xd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm),
- (XVINSGR2VR_W $xd, $rj, uimm3:$imm)>;
-def : Pat<(vector_insert v4f64:$xd, (f64 (bitconvert i64:$rj)), uimm2:$imm),
- (XVINSGR2VR_D $xd, $rj, uimm2:$imm)>;
-def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2),
- (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>;
-def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2),
- (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v8f32:$xd, (loongarch_movgr2fr_w_la64 GPR:$rj),
+ uimm3:$imm),
+ (XVINSGR2VR_W v8f32:$xd, GPR:$rj, uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$xd, (f64(bitconvert i64:$rj)), uimm2:$imm),
+ (XVINSGR2VR_D v4f64:$xd, GPR:$rj, uimm2:$imm)>;
// XVINSVE0_{W/D}
def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm),
- (XVINSVE0_W $xd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>;
+ (XVINSVE0_W v8f32:$xd, (SUBREG_TO_REG(i64 0), FPR32:$fj, sub_32),
+ uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm),
- (XVINSVE0_D $xd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>;
+ (XVINSVE0_D v4f64:$xd, (SUBREG_TO_REG(i64 0), FPR64:$fj, sub_64),
+ uimm2:$imm)>;
// scalar_to_vector
def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)),
@@ -1884,10 +1883,10 @@ def : Pat<(i64 (vector_extract v8i32:$xj, uimm3:$imm)),
(XVPICKVE2GR_W v8i32:$xj, uimm3:$imm)>;
def : Pat<(i64 (vector_extract v4i64:$xj, uimm2:$imm)),
(XVPICKVE2GR_D v4i64:$xj, uimm2:$imm)>;
-def : Pat<(f32 (vector_extract v8f32:$xj, uimm3:$imm)),
- (MOVGR2FR_W (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm))>;
-def : Pat<(f64 (vector_extract v4f64:$xj, uimm2:$imm)),
- (MOVGR2FR_D (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm))>;
+def : Pat<(f32(vector_extract v8f32:$xj, uimm3:$imm)),
+ (EXTRACT_SUBREG(XVPICKVE_W v8f32:$xj, uimm3:$imm), sub_32)>;
+def : Pat<(f64(vector_extract v4f64:$xj, uimm2:$imm)),
+ (EXTRACT_SUBREG(XVPICKVE_D v4f64:$xj, uimm2:$imm), sub_64)>;
// vselect
def : Pat<(v32i8 (vselect LASX256:$xd, (v32i8 (SplatPat_uimm8 uimm8:$imm)),
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index 858f3d0..fda9d97 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -131,19 +131,18 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
}
-static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup,
- MutableArrayRef<char> Data, uint64_t Value) {
+static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, uint8_t *Data,
+ uint64_t Value) {
unsigned I;
- for (I = 0; I != Data.size() && Value; ++I, Value >>= 7)
+ for (I = 0; Value; ++I, Value >>= 7)
Data[I] |= uint8_t(Value & 0x7f);
if (Value)
Ctx.reportError(Fixup.getLoc(), "Invalid uleb128 value!");
}
void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (IsResolved && shouldForceRelocation(Fixup, Target))
IsResolved = false;
IsResolved = addReloc(F, Fixup, Target, Value, IsResolved);
@@ -166,14 +165,14 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
- assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
for (unsigned I = 0; I != NumBytes; ++I) {
- Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff);
+ Data[I] |= uint8_t((Value >> (I * 8)) & 0xff);
}
}
@@ -274,15 +273,14 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F,
int64_t LineDelta = F.getDwarfLineDelta();
const MCExpr &AddrDelta = F.getDwarfAddrDelta();
- SmallVector<MCFixup, 1> Fixups;
size_t OldSize = F.getVarSize();
int64_t Value;
if (AddrDelta.evaluateAsAbsolute(Value, *Asm))
return false;
- bool IsAbsolute = AddrDelta.evaluateKnownAbsolute(Value, *Asm);
- assert(IsAbsolute && "CFA with invalid expression");
- (void)IsAbsolute;
+ [[maybe_unused]] bool IsAbsolute =
+ AddrDelta.evaluateKnownAbsolute(Value, *Asm);
+ assert(IsAbsolute);
SmallVector<char> Data;
raw_svector_ostream OS(Data);
@@ -293,33 +291,23 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F,
encodeSLEB128(LineDelta, OS);
}
- unsigned Offset;
- std::pair<MCFixupKind, MCFixupKind> FK;
-
// According to the DWARF specification, the `DW_LNS_fixed_advance_pc` opcode
// takes a single unsigned half (unencoded) operand. The maximum encodable
// value is therefore 65535. Set a conservative upper bound for relaxation.
+ unsigned PCBytes;
if (Value > 60000) {
unsigned PtrSize = C.getAsmInfo()->getCodePointerSize();
-
- OS << uint8_t(dwarf::DW_LNS_extended_op);
- encodeULEB128(PtrSize + 1, OS);
-
- OS << uint8_t(dwarf::DW_LNE_set_address);
- Offset = OS.tell();
assert((PtrSize == 4 || PtrSize == 8) && "Unexpected pointer size");
- FK = getRelocPairForSize(PtrSize == 4 ? 32 : 64);
+ PCBytes = PtrSize;
+ OS << uint8_t(dwarf::DW_LNS_extended_op) << uint8_t(PtrSize + 1)
+ << uint8_t(dwarf::DW_LNE_set_address);
OS.write_zeros(PtrSize);
} else {
+ PCBytes = 2;
OS << uint8_t(dwarf::DW_LNS_fixed_advance_pc);
- Offset = OS.tell();
- FK = getRelocPairForSize(16);
support::endian::write<uint16_t>(OS, 0, llvm::endianness::little);
}
-
- const MCBinaryExpr &MBE = cast<MCBinaryExpr>(AddrDelta);
- Fixups.push_back(MCFixup::create(Offset, MBE.getLHS(), std::get<0>(FK)));
- Fixups.push_back(MCFixup::create(Offset, MBE.getRHS(), std::get<1>(FK)));
+ auto Offset = OS.tell() - PCBytes;
if (LineDelta == INT64_MAX) {
OS << uint8_t(dwarf::DW_LNS_extended_op);
@@ -330,7 +318,8 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F,
}
F.setVarContents(Data);
- F.setVarFixups(Fixups);
+ F.setVarFixups({MCFixup::create(Offset, &AddrDelta,
+ MCFixup::getDataKindForSize(PCBytes))});
WasRelaxed = OldSize != Data.size();
return true;
}
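
Both this LoongArch rewrite and the matching RISC-V one further down replace the ADD/SUB relocation pair with a single data fixup over the address bytes, choosing the line-program opcode by the size of the address delta. A hedged sketch of that selection, assuming Value is the not-yet-final delta and PtrSize is 4 or 8:

#include <cstdint>

// Sketch: DW_LNS_fixed_advance_pc carries a raw uint16 operand, so 65535
// is the hard ceiling; 60000 leaves slack in case relaxation grows the
// delta. Above the bound, fall back to DW_LNE_set_address with a
// pointer-sized operand.
unsigned choosePCBytes(int64_t Value, unsigned PtrSize) {
  return Value > 60000 ? PtrSize : 2u;
}
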
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
index 3d929fc..1f13601 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
@@ -42,8 +42,7 @@ public:
uint64_t &FixedValue, bool IsResolved);
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target);
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
index fb741af..7e021e4 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
@@ -61,8 +61,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup,
case ELF::R_LARCH_TLS_LD_PCREL20_S2:
case ELF::R_LARCH_TLS_GD_PCREL20_S2:
case ELF::R_LARCH_TLS_DESC_PCREL20_S2:
- if (auto *SA = Target.getAddSym())
- cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
+ if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
break;
default:
break;
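
This TLS hunk repeats a pattern applied to every ELF object writer in the patch (Hexagon, M68k, Mips, PPC, RISC-V): cast<MCSymbolELF> becomes an unchecked static_cast, with the object format known from context rather than queried from the symbol, and the now-const MCValue::getAddSym() result is const_cast away before the mutation. The shape shared by these writers, quoted as a sketch:

// Sketch of the shared pattern: relocations with thread-local semantics
// force the addend symbol's ELF type to STT_TLS, even when the symbol
// was never declared thread-local in the source. getAddSym() is
// const-qualified, hence the const_cast before setType().
if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
  static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
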
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
index 7ef705d..fe83dc6 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
@@ -53,8 +53,7 @@ public:
.Default(false)) {}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands,
const MCSubtargetInfo &STI) const override;
@@ -78,9 +77,8 @@ public:
} // end anonymous namespace
void M68kAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (!IsResolved)
Asm->getWriter().recordRelocation(F, Fixup, Target, Value);
@@ -95,8 +93,7 @@ void M68kAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Write in Big Endian
for (unsigned i = 0; i != Size; ++i)
- Data[Fixup.getOffset() + i] =
- uint8_t(static_cast<int64_t>(Value) >> ((Size - i - 1) * 8));
+ Data[i] = uint8_t(static_cast<int64_t>(Value) >> ((Size - i - 1) * 8));
}
/// cc—Carry clear GE—Greater than or equal
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
index ca94a47..d070409 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
@@ -70,8 +70,8 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
case M68k::S_TLSLD:
case M68k::S_TLSLDM:
case M68k::S_TPOFF:
- if (auto *SA = Target.getAddSym())
- cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
+ if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
break;
default:
break;
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
index b513503..d892b3a 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
@@ -36,8 +36,7 @@ public:
~MSP430AsmBackend() override = default;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
@@ -105,9 +104,8 @@ uint64_t MSP430AsmBackend::adjustFixupValue(const MCFixup &Fixup,
}
void MSP430AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
Value = adjustFixupValue(Fixup, Value, getContext());
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
@@ -117,15 +115,14 @@ void MSP430AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
-
- assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
for (unsigned i = 0; i != NumBytes; ++i) {
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[i] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 259b71b..7b2ee83 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -2948,8 +2948,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
bool IsPtr64 = ABI.ArePtrs64bit();
bool IsLocalSym =
Res.getAddSym()->isInSection() || Res.getAddSym()->isTemporary() ||
- (Res.getAddSym()->isELF() &&
- cast<MCSymbolELF>(Res.getAddSym())->getBinding() == ELF::STB_LOCAL);
+ (getContext().isELF() &&
+ static_cast<const MCSymbolELF *>(Res.getAddSym())->getBinding() ==
+ ELF::STB_LOCAL);
// For O32, "$"-prefixed symbols are recognized as temporary while
// .L-prefixed symbols are not (PrivateGlobalPrefix is "$"). Recognize ".L"
// manually.
@@ -6653,7 +6654,7 @@ bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) {
llvm_unreachable("Should never fail");
}
}
- } else if (Sym->isUnset()) {
+ } else if (Sym->isUndefined()) {
// If the symbol is undefined, it might be created in the `parseSetAssignment`
// routine as an alias for a numeric register name.
// Lookup in the aliases list.
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index c2169be..33aab71 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -283,9 +283,8 @@ static bool shouldForceRelocation(const MCFixup &Fixup) {
/// data fragment, at the offset specified by the fixup and following the
/// fixup kind as appropriate.
void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (shouldForceRelocation(Fixup))
IsResolved = false;
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
@@ -297,7 +296,6 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
return; // Doesn't change encoding.
// Where do we start in the object
- unsigned Offset = Fixup.getOffset();
// Number of bytes we need to fixup
unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
// Used to point to big endian bytes
@@ -328,7 +326,7 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
unsigned Idx = Endian == llvm::endianness::little
? (microMipsLEByteOrder ? calculateMMLEIndex(i) : i)
: (FullSize - 1 - i);
- CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8);
+ CurVal |= (uint64_t)((uint8_t)Data[Idx]) << (i * 8);
}
uint64_t Mask = ((uint64_t)(-1) >>
@@ -340,7 +338,7 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
unsigned Idx = Endian == llvm::endianness::little
? (microMipsLEByteOrder ? calculateMMLEIndex(i) : i)
: (FullSize - 1 - i);
- Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff);
+ Data[Idx] = (uint8_t)((CurVal >> (i * 8)) & 0xff);
}
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 816626d..40b5853 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -40,8 +40,7 @@ public:
createObjectTargetWriter() const override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 7abe9c9..16247bd 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -166,8 +166,8 @@ unsigned MipsELFObjectWriter::getRelocType(const MCFixup &Fixup,
case Mips::S_GOTTPREL:
case Mips::S_TPREL_HI:
case Mips::S_TPREL_LO:
- if (auto *SA = Target.getAddSym())
- cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
+ if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
break;
default:
break;
@@ -450,6 +450,7 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCValue &V,
needsRelocateWithSymbol(V, (Type >> 8) & 0xff) ||
needsRelocateWithSymbol(V, (Type >> 16) & 0xff);
+ auto *Sym = static_cast<const MCSymbolELF *>(V.getAddSym());
switch (Type) {
default:
errs() << Type << "\n";
@@ -481,7 +482,7 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCValue &V,
// FIXME: It should be safe to return false for the STO_MIPS_MICROMIPS but
// we neglect to handle the adjustment to the LSB of the addend that
// it causes in applyFixup() and similar.
- if (cast<MCSymbolELF>(V.getAddSym())->getOther() & ELF::STO_MIPS_MICROMIPS)
+ if (Sym->getOther() & ELF::STO_MIPS_MICROMIPS)
return true;
return false;
@@ -492,7 +493,7 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCValue &V,
case ELF::R_MIPS_16:
case ELF::R_MIPS_32:
case ELF::R_MIPS_GPREL32:
- if (cast<MCSymbolELF>(V.getAddSym())->getOther() & ELF::STO_MIPS_MICROMIPS)
+ if (Sym->getOther() & ELF::STO_MIPS_MICROMIPS)
return true;
[[fallthrough]];
case ELF::R_MIPS_26:
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index e8b9746..feeadc5e 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -76,7 +76,7 @@ void MipsELFStreamer::createPendingLabelRelocs() {
// FIXME: Also mark labels when in MIPS16 mode.
if (ELFTargetStreamer->isMicroMipsEnabled()) {
for (auto *L : Labels) {
- auto *Label = cast<MCSymbolELF>(L);
+ auto *Label = static_cast<MCSymbolELF *>(L);
getAssembler().registerSymbol(*Label);
Label->setOther(ELF::STO_MIPS_MICROMIPS);
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index d9680c7..5df70c4 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -931,7 +931,7 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
}
void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
- auto *Symbol = cast<MCSymbolELF>(S);
+ auto *Symbol = static_cast<MCSymbolELF *>(S);
getStreamer().getAssembler().registerSymbol(*Symbol);
uint8_t Type = Symbol->getType();
if (Type != ELF::STT_FUNC)
@@ -1015,11 +1015,11 @@ void MipsTargetELFStreamer::finish() {
}
void MipsTargetELFStreamer::emitAssignment(MCSymbol *S, const MCExpr *Value) {
- auto *Symbol = cast<MCSymbolELF>(S);
+ auto *Symbol = static_cast<MCSymbolELF *>(S);
// If the RHS is a microMIPS symbol, mark Symbol as microMIPS too.
if (Value->getKind() != MCExpr::SymbolRef)
return;
- const auto &RhsSym = cast<MCSymbolELF>(
+ auto &RhsSym = static_cast<const MCSymbolELF &>(
static_cast<const MCSymbolRefExpr *>(Value)->getSymbol());
if (!(RhsSym.getOther() & ELF::STO_MIPS_MICROMIPS))
@@ -1034,12 +1034,14 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_GPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
// fixup_Mips_GPREL32 designates R_MIPS_GPREL32+R_MIPS_64 on MIPS64.
S.addFixup(Value, Mips::fixup_Mips_GPREL32);
S.appendContents(8, 0);
@@ -1047,24 +1049,28 @@ void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_DTPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
S.addFixup(Value, Mips::fixup_Mips_DTPREL64);
S.appendContents(8, 0);
}
void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_TPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
S.addFixup(Value, Mips::fixup_Mips_TPREL64);
S.appendContents(8, 0);
}
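
Each emit*Value helper above gains an ensureHeadroom(N) call before the fixup is recorded, guaranteeing the current fragment can absorb the N placeholder bytes that follow. A hedged sketch of the pattern, assuming the streamer methods named in the diff:

#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
using namespace llvm;

// Sketch: reserve space, attach the fixup at the current write offset,
// then append zero bytes for the relocation target to patch later.
static void emitFixupValue(MCELFStreamer &S, const MCExpr *Value,
                           MCFixupKind Kind, unsigned Size) {
  S.ensureHeadroom(Size);    // make room in the current fragment
  S.addFixup(Value, Kind);   // fixup lands at the current offset
  S.appendContents(Size, 0); // placeholder bytes, patched at layout time
}
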
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index a2e48ab..4530fc6 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -1052,8 +1052,7 @@ void MipsAsmPrinter::EmitFPCallStub(
// __call_stub_fp_xxxx:
//
std::string x = "__call_stub_fp_" + std::string(Symbol);
- MCSymbolELF *Stub =
- cast<MCSymbolELF>(OutContext.getOrCreateSymbol(StringRef(x)));
+ MCSymbol *Stub = OutContext.getOrCreateSymbol(StringRef(x));
TS.emitDirectiveEnt(*Stub);
MCSymbol *MType =
OutContext.getOrCreateSymbol("__call_stub_fp_" + Twine(Symbol));
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 2ae7520..aac611d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -151,6 +151,8 @@ class OneUse2<SDPatternOperator operator>
class fpimm_pos_inf<ValueType vt>
: FPImmLeaf<vt, [{ return Imm.isPosInfinity(); }]>;
+class zeroinitializer<ValueType vt> :
+ PatLeaf<(vt (bitconvert (!cast<ValueType>("i" # vt.Size) 0)))>;
// Operands which can hold a Register or an Immediate.
@@ -789,6 +791,23 @@ def UMAX16x2 : I16x2<"max.u", umax>;
def SMIN16x2 : I16x2<"min.s", smin>;
def UMIN16x2 : I16x2<"min.u", umin>;
+let Predicates = [hasPTX<80>, hasSM<90>] in {
+
+ def MIN_RELU_S32 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b),
+ "min.relu.s32",
+ [(set i32:$dst, (smax (smin i32:$a, i32:$b), 0))]>;
+ def MAX_RELU_S32 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b),
+ "max.relu.s32",
+ [(set i32:$dst, (smax (smax i32:$a, i32:$b), 0))]>;
+ def MIN_RELU_S16x2 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b),
+ "min.relu.s16x2",
+ [(set v2i16:$dst, (smax (smin v2i16:$a, v2i16:$b),
+ zeroinitializer<v2i16>))]>;
+ def MAX_RELU_S16x2 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b),
+ "max.relu.s16x2",
+ [(set v2i16:$dst, (smax (smax v2i16:$a, v2i16:$b),
+ zeroinitializer<v2i16>))]>;
+}
//
// Wide multiplication
@@ -1541,18 +1560,6 @@ def : Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel
(PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE),
(cond2cc $cc))>;
-// A 16-bit comparison of truncated byte extracts can be converted to a 32-bit
-// comparison because we know that the truncate is just truncating off zeros
-// and that the most-significant byte is also zero, so the meaning of signed and
-// unsigned comparisons will not be changed.
-def : Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
- (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
- cond:$cc),
- (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
- (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
- (cond2cc $cc))>;
-
-
def SDTDeclareArrayParam :
SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
def SDTDeclareScalarParam :
@@ -2379,9 +2386,6 @@ def fpimm_any_zero : FPImmLeaf<fAny, [{
return Imm.isZero();
}]>;
-def fpimm_positive_zero_v2f16 : PatFrag<(ops), (v2f16 (bitconvert (i32 0)))>;
-def fpimm_positive_zero_v2bf16 : PatFrag<(ops), (v2bf16 (bitconvert (i32 0)))>;
-
// Perform substitution if fma only has one use, and also if instruction has
// nnan instruction flag or if the TM has NoNaNsFPMath
def NVPTX_fma_oneuse_and_nnan : PatFrag<(ops node:$a, node:$b, node:$c),
@@ -2404,10 +2408,10 @@ class FMARELUInst<RegTyInfo t, bit allow_ftz, PatFrag zero_pat>
let Predicates = [useFP16Math, hasPTX<70>, hasSM<80>] in {
def FMARELU_F16 : FMARELUInst<F16RT, true, fpimm_any_zero>;
- def FMARELU_F16X2 : FMARELUInst<F16X2RT, true, fpimm_positive_zero_v2f16>;
+ def FMARELU_F16X2 : FMARELUInst<F16X2RT, true, zeroinitializer<v2f16>>;
}
let Predicates = [hasBF16Math, hasPTX<70>, hasSM<80>] in {
def FMARELU_BF16 : FMARELUInst<BF16RT, false, fpimm_any_zero>;
- def FMARELU_BF16X2 : FMARELUInst<BF16X2RT, false, fpimm_positive_zero_v2bf16>;
+ def FMARELU_BF16X2 : FMARELUInst<BF16X2RT, false, zeroinitializer<v2bf16>>;
}
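
The new min.relu/max.relu definitions select the clamp-at-zero idiom directly: min.relu is max(min(a, b), 0) and max.relu is max(max(a, b), 0), with the s16x2 forms matched against an all-zero vector via the new zeroinitializer PatLeaf. A scalar C++ model of the semantics the patterns capture, purely illustrative:

#include <algorithm>
#include <cstdint>

// Scalar model of the PTX relu-fused min/max the patterns select:
int32_t min_relu_s32(int32_t a, int32_t b) {
  return std::max(std::min(a, b), 0); // min.relu.s32
}
int32_t max_relu_s32(int32_t a, int32_t b) {
  return std::max(std::max(a, b), 0); // max.relu.s32
}
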
diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 58766b1..1fc475d 100644
--- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -1756,7 +1756,7 @@ bool PPCAsmParser::parseDirectiveLocalEntry(SMLoc L) {
if (getParser().parseIdentifier(Name))
return Error(L, "expected identifier in '.localentry' directive");
- MCSymbolELF *Sym = cast<MCSymbolELF>(getContext().getOrCreateSymbol(Name));
+ auto *Sym = static_cast<MCSymbolELF *>(getContext().getOrCreateSymbol(Name));
const MCExpr *Expr;
if (parseToken(AsmToken::Comma) ||
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 0e8828f..04b886a 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -13,6 +13,7 @@
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCObjectWriter.h"
@@ -93,8 +94,8 @@ public:
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
void applyFixup(const MCFragment &, const MCFixup &Fixup,
- const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsResolved) override;
+ const MCValue &Target, uint8_t *Data, uint64_t Value,
+ bool IsResolved) override;
bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target) {
// If there is a @ specifier, unless it is optimized out (e.g. constant @l),
@@ -112,14 +113,15 @@ public:
// to resolve the fixup directly. Emit a relocation and leave
// resolution of the final target address to the linker.
if (const auto *A = Target.getAddSym()) {
- if (const auto *S = dyn_cast<MCSymbolELF>(A)) {
+ if (getContext().isELF()) {
// The "other" values are stored in the last 6 bits of the second
// byte. The traditional defines for STO values assume the full byte
// and thus the shift to pack it.
- unsigned Other = S->getOther() << 2;
+ unsigned Other = static_cast<const MCSymbolELF *>(A)->getOther() << 2;
if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0)
return true;
- } else if (const auto *S = dyn_cast<MCSymbolXCOFF>(A)) {
+ } else if (getContext().isXCOFF()) {
+ auto *S = static_cast<const MCSymbolXCOFF *>(A);
return !Target.isAbsolute() && S->isExternal() &&
S->getStorageClass() == XCOFF::C_WEAKEXT;
}
@@ -185,9 +187,8 @@ MCFixupKindInfo PPCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
}
void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &TargetVal,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &TargetVal, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
// In PPC64 ELFv1, .quad .TOC.@tocbase in the .opd section is expected to
// reference the null symbol.
auto Target = TargetVal;
@@ -205,7 +206,6 @@ void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
if (!Value)
return; // Doesn't change encoding.
- unsigned Offset = Fixup.getOffset();
unsigned NumBytes = getFixupKindNumBytes(Kind);
// For each byte of the fragment that the fixup touches, mask in the bits
@@ -213,7 +213,7 @@ void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// bitfields above.
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = Endian == llvm::endianness::little ? i : (NumBytes - 1 - i);
- Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
+ Data[i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
}
}
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index a5d3be4..329ad6e 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -86,8 +86,8 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
case PPC::S_TPREL_HIGHEST:
case PPC::S_TPREL_HIGHESTA:
case PPC::S_TPREL_LO:
- if (auto *SA = Target.getAddSym())
- cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
+ if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
break;
default:
break;
@@ -499,7 +499,8 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCValue &V,
// The "other" values are stored in the last 6 bits of the second byte.
// The traditional defines for STO values assume the full byte and thus
// the shift to pack it.
- unsigned Other = cast<MCSymbolELF>(V.getAddSym())->getOther() << 2;
+ unsigned Other =
+ static_cast<const MCSymbolELF *>(V.getAddSym())->getOther() << 2;
return (Other & ELF::STO_PPC64_LOCAL_MASK) != 0;
}
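
Both PPC hunks that touch st_other rely on the same encoding detail spelled out in the comments: the PPC64 local-entry-point offset lives in the top bits of st_other, and MCSymbolELF::getOther() hands back only the top six bits, so the value is shifted left by two before masking against the full-byte STO_* constants. A hedged sketch of the check:

#include <cstdint>

// Sketch: reconstruct st_other from getOther()'s packed form and test the
// PPC64 local-entry bits (ELF::STO_PPC64_LOCAL_MASK, i.e. 0xe0).
static bool hasNonZeroLocalEntry(uint8_t PackedOther) {
  unsigned Other = PackedOther << 2; // undo the 2-bit packing
  return (Other & 0xe0) != 0;
}
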
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
index 2dbc31f..132d5a4 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
@@ -65,7 +65,7 @@ void PPCELFStreamer::emitPrefixedInstruction(const MCInst &Inst,
MCFragment *InstructionFragment = getCurrentFragment();
SMLoc InstLoc = Inst.getLoc();
// Check if there was a last label emitted.
- if (LastLabel && !LastLabel->isUnset() && LastLabelLoc.isValid() &&
+ if (LastLabel && LastLabel->isDefined() && LastLabelLoc.isValid() &&
InstLoc.isValid()) {
const SourceMgr *SourceManager = getContext().getSourceManager();
unsigned InstLine = SourceManager->FindLineNumber(InstLoc);
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 3dad0e8..d856c3f 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -211,7 +211,7 @@ public:
: PPCTargetStreamer(S), OS(OS) {}
void emitTCEntry(const MCSymbol &S, PPCMCExpr::Specifier Kind) override {
- if (const MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(&S)) {
+ if (getContext().isXCOFF()) {
MCSymbolXCOFF *TCSym =
static_cast<const MCSectionXCOFF *>(Streamer.getCurrentSectionOnly())
->getQualNameSymbol();
@@ -225,10 +225,10 @@ public:
if (Kind == PPC::S_AIX_TLSGD || Kind == PPC::S_AIX_TLSGDM ||
Kind == PPC::S_AIX_TLSIE || Kind == PPC::S_AIX_TLSLE ||
Kind == PPC::S_AIX_TLSLD || Kind == PPC::S_AIX_TLSML)
- OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << "@"
+ OS << "\t.tc " << TCSym->getName() << "," << S.getName() << "@"
<< getContext().getAsmInfo()->getSpecifierName(Kind) << '\n';
else
- OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << '\n';
+ OS << "\t.tc " << TCSym->getName() << "," << S.getName() << '\n';
if (TCSym->hasRename())
Streamer.emitXCOFFRenameDirective(TCSym, TCSym->getSymbolTableName());
@@ -308,7 +308,7 @@ public:
}
void emitAssignment(MCSymbol *S, const MCExpr *Value) override {
- auto *Symbol = cast<MCSymbolELF>(S);
+ auto *Symbol = static_cast<MCSymbolELF *>(S);
// When encoding an assignment to set symbol A to symbol B, also copy
// the st_other bits encoding the local entry point offset.
@@ -335,7 +335,7 @@ private:
auto *Ref = dyn_cast<const MCSymbolRefExpr>(S);
if (!Ref)
return false;
- const auto &RhsSym = cast<MCSymbolELF>(Ref->getSymbol());
+ auto &RhsSym = static_cast<const MCSymbolELF &>(Ref->getSymbol());
unsigned Other = D->getOther();
Other &= ~ELF::STO_PPC64_LOCAL_MASK;
Other |= RhsSym.getOther() & ELF::STO_PPC64_LOCAL_MASK;
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index ce1d51a..2ab2c14 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2155,7 +2155,8 @@ void PPCLinuxAsmPrinter::emitFunctionBodyStart() {
PPCTargetStreamer *TS =
static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
- TS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym), LocalOffsetExp);
+ TS->emitLocalEntry(static_cast<MCSymbolELF *>(CurrentFnSym),
+ LocalOffsetExp);
} else if (Subtarget->isUsingPCRelativeCalls()) {
// When generating the entry point for a function we have a few scenarios
// based on whether or not that function uses R2 and whether or not that
@@ -2182,7 +2183,7 @@ void PPCLinuxAsmPrinter::emitFunctionBodyStart() {
MF->hasInlineAsm() || (!PPCFI->usesTOCBasePtr() && UsesX2OrR2)) {
PPCTargetStreamer *TS =
static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
- TS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym),
+ TS->emitLocalEntry(static_cast<MCSymbolELF *>(CurrentFnSym),
MCConstantExpr::create(1, OutContext));
}
}
@@ -2766,7 +2767,7 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) {
if (GV->hasComdat())
report_fatal_error("COMDAT not yet supported by AIX.");
- MCSymbolXCOFF *GVSym = cast<MCSymbolXCOFF>(getSymbol(GV));
+ auto *GVSym = static_cast<MCSymbolXCOFF *>(getSymbol(GV));
if (GV->isDeclarationForLinker()) {
emitLinkage(GV, GVSym);
@@ -2859,7 +2860,7 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() {
MCSectionSubPair Current = OutStreamer->getCurrentSection();
// Emit function descriptor.
OutStreamer->switchSection(
- cast<MCSymbolXCOFF>(CurrentFnDescSym)->getRepresentedCsect());
+ static_cast<MCSymbolXCOFF *>(CurrentFnDescSym)->getRepresentedCsect());
// Emit aliasing label for function descriptor csect.
for (const GlobalAlias *Alias : GOAliasMap[&MF->getFunction()])
@@ -2994,7 +2995,8 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
SmallString<128> Name;
StringRef Prefix = ".";
Name += Prefix;
- Name += cast<MCSymbolXCOFF>(I.first.first)->getSymbolTableName();
+ Name += static_cast<const MCSymbolXCOFF *>(I.first.first)
+ ->getSymbolTableName();
MCSymbol *S = OutContext.getOrCreateSymbol(Name);
TCEntry = static_cast<MCSectionXCOFF *>(
getObjFileLowering().getSectionForTOCEntry(S, TM));
@@ -3112,7 +3114,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
setCsectAlignment(&G);
std::optional<CodeModel::Model> OptionalCodeModel = G.getCodeModel();
if (OptionalCodeModel)
- setOptionalCodeModel(cast<MCSymbolXCOFF>(getSymbol(&G)),
+ setOptionalCodeModel(static_cast<MCSymbolXCOFF *>(getSymbol(&G)),
*OptionalCodeModel);
}
@@ -3139,7 +3141,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
if (GVar) {
std::optional<CodeModel::Model> OptionalCodeModel = GVar->getCodeModel();
if (OptionalCodeModel)
- setOptionalCodeModel(cast<MCSymbolXCOFF>(getSymbol(&Alias)),
+ setOptionalCodeModel(static_cast<MCSymbolXCOFF *>(getSymbol(&Alias)),
*OptionalCodeModel);
}
@@ -3190,8 +3192,8 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) {
case PPC::BL_NOP: {
const MachineOperand &MO = MI->getOperand(0);
if (MO.isSymbol()) {
- MCSymbolXCOFF *S =
- cast<MCSymbolXCOFF>(OutContext.getOrCreateSymbol(MO.getSymbolName()));
+ auto *S = static_cast<MCSymbolXCOFF *>(
+ OutContext.getOrCreateSymbol(MO.getSymbolName()));
ExtSymSDNodeSymbols.insert(S);
}
} break;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index f179873..30b5fd6 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1433,7 +1433,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
if (Subtarget.useCRBits()) {
- setHasMultipleConditionRegisters();
setJumpIsExpensive();
}
@@ -5540,8 +5539,8 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *GV) {
const TargetMachine &TM = Subtarget.getTargetMachine();
const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering();
- MCSymbolXCOFF *S =
- cast<MCSymbolXCOFF>(TLOF->getFunctionEntryPointSymbol(GV, TM));
+ auto *S =
+ static_cast<MCSymbolXCOFF *>(TLOF->getFunctionEntryPointSymbol(GV, TM));
MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
return DAG.getMCSymbol(S, PtrVT);
@@ -19856,3 +19855,7 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
return Builder.CreateOr(
Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
}
+
+bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const {
+ return Subtarget.useCRBits();
+}
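
The PPC hunks above replace the old global setHasMultipleConditionRegisters() flag with the per-value-type virtual hook added in this change, so the CR-bits decision is now queried with the EVT in hand; on PPC the answer still depends only on Subtarget.useCRBits(). A hedged sketch of how a target overrides the hook, with the class and predicate names as placeholders:

// Sketch: the hook is consulted per query instead of being latched once
// in the constructor; a target can now vary the answer by value type.
bool MyTargetLowering::hasMultipleConditionRegisters(EVT VT) const {
  return VT.isInteger() && Subtarget.hasCondRegBank(); // illustrative
}
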
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 124c711..9755f0e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1207,6 +1207,8 @@ namespace llvm {
bool IsVarArg) const;
bool supportsTailCallFor(const CallBase *CB) const;
+ bool hasMultipleConditionRegisters(EVT VT) const override;
+
private:
struct ReuseLoadInfo {
SDValue Ptr;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 9538b20..95ec42f 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -327,19 +327,19 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F,
bool &WasRelaxed) const {
- MCContext &C = getContext();
-
int64_t LineDelta = F.getDwarfLineDelta();
const MCExpr &AddrDelta = F.getDwarfAddrDelta();
- SmallVector<MCFixup, 1> Fixups;
size_t OldSize = F.getVarSize();
int64_t Value;
+ // If the label difference can be resolved, use the default handling, which
+ // utilizes a shorter special opcode.
+ if (AddrDelta.evaluateAsAbsolute(Value, *Asm))
+ return false;
[[maybe_unused]] bool IsAbsolute =
AddrDelta.evaluateKnownAbsolute(Value, *Asm);
assert(IsAbsolute && "CFA with invalid expression");
- Fixups.clear();
SmallVector<char> Data;
raw_svector_ostream OS(Data);
@@ -349,33 +349,21 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F,
encodeSLEB128(LineDelta, OS);
}
- unsigned Offset;
- std::pair<MCFixupKind, MCFixupKind> Fixup;
-
// According to the DWARF specification, the `DW_LNS_fixed_advance_pc` opcode
// takes a single unsigned half (unencoded) operand. The maximum encodable
// value is therefore 65535. Set a conservative upper bound for relaxation.
+ unsigned PCBytes;
if (Value > 60000) {
- unsigned PtrSize = C.getAsmInfo()->getCodePointerSize();
-
- OS << uint8_t(dwarf::DW_LNS_extended_op);
- encodeULEB128(PtrSize + 1, OS);
-
- OS << uint8_t(dwarf::DW_LNE_set_address);
- Offset = OS.tell();
- assert((PtrSize == 4 || PtrSize == 8) && "Unexpected pointer size");
- Fixup = RISCV::getRelocPairForSize(PtrSize);
- OS.write_zeros(PtrSize);
+ PCBytes = getContext().getAsmInfo()->getCodePointerSize();
+ OS << uint8_t(dwarf::DW_LNS_extended_op) << uint8_t(PCBytes + 1)
+ << uint8_t(dwarf::DW_LNE_set_address);
+ OS.write_zeros(PCBytes);
} else {
+ PCBytes = 2;
OS << uint8_t(dwarf::DW_LNS_fixed_advance_pc);
- Offset = OS.tell();
- Fixup = RISCV::getRelocPairForSize(2);
support::endian::write<uint16_t>(OS, 0, llvm::endianness::little);
}
-
- const MCBinaryExpr &MBE = cast<MCBinaryExpr>(AddrDelta);
- Fixups.push_back(MCFixup::create(Offset, MBE.getLHS(), std::get<0>(Fixup)));
- Fixups.push_back(MCFixup::create(Offset, MBE.getRHS(), std::get<1>(Fixup)));
+ auto Offset = OS.tell() - PCBytes;
if (LineDelta == INT64_MAX) {
OS << uint8_t(dwarf::DW_LNS_extended_op);
@@ -386,7 +374,8 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F,
}
F.setVarContents(Data);
- F.setVarFixups(Fixups);
+ F.setVarFixups({MCFixup::create(Offset, &AddrDelta,
+ MCFixup::getDataKindForSize(PCBytes))});
WasRelaxed = OldSize != Data.size();
return true;
}
@@ -754,7 +743,7 @@ std::optional<bool> RISCVAsmBackend::evaluateFixup(const MCFragment &,
if (!AUIPCTarget.getAddSym())
return false;
- const MCSymbolELF &SA = cast<MCSymbolELF>(*AUIPCTarget.getAddSym());
+ auto &SA = static_cast<const MCSymbolELF &>(*AUIPCTarget.getAddSym());
if (SA.isUndefined())
return false;
@@ -881,9 +870,8 @@ bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
}
void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
IsResolved = addReloc(F, Fixup, Target, Value, IsResolved);
MCFixupKind Kind = Fixup.getKind();
if (mc::isRelocation(Kind))
@@ -898,15 +886,14 @@ void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
-
- assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
for (unsigned i = 0; i != NumBytes; ++i) {
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[i] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index d97d632..adec1ec 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -46,8 +46,7 @@ public:
void maybeAddVendorReloc(const MCFragment &, const MCFixup &);
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index 9bf7896..2885e3c 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -55,8 +55,8 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup,
case ELF::R_RISCV_TLS_GOT_HI20:
case ELF::R_RISCV_TLS_GD_HI20:
case ELF::R_RISCV_TLSDESC_HI20:
- if (auto *SA = Target.getAddSym())
- cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
+ if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
break;
case ELF::R_RISCV_PLT32:
case ELF::R_RISCV_GOT32_PCREL:
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index c654fd2b..543c4c5 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -117,7 +117,7 @@ void RISCVTargetELFStreamer::reset() {
void RISCVTargetELFStreamer::emitDirectiveVariantCC(MCSymbol &Symbol) {
getStreamer().getAssembler().registerSymbol(Symbol);
- cast<MCSymbolELF>(Symbol).setOther(ELF::STO_RISCV_VARIANT_CC);
+ static_cast<MCSymbolELF &>(Symbol).setOther(ELF::STO_RISCV_VARIANT_CC);
}
void RISCVELFStreamer::reset() {
@@ -142,7 +142,8 @@ void RISCVELFStreamer::emitInstructionsMappingSymbol() {
}
void RISCVELFStreamer::emitMappingSymbol(StringRef Name) {
- auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
+ auto *Symbol =
+ static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name));
emitLabel(Symbol);
Symbol->setType(ELF::STT_NOTYPE);
Symbol->setBinding(ELF::STB_LOCAL);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
index f816561c..98c8738 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
@@ -68,27 +68,6 @@ enum Fixups {
fixup_riscv_invalid,
NumTargetFixupKinds = fixup_riscv_invalid - FirstTargetFixupKind
};
-
-static inline std::pair<MCFixupKind, MCFixupKind>
-getRelocPairForSize(unsigned Size) {
- switch (Size) {
- default:
- llvm_unreachable("unsupported fixup size");
- case 1:
- return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD8,
- FirstLiteralRelocationKind + ELF::R_RISCV_SUB8);
- case 2:
- return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD16,
- FirstLiteralRelocationKind + ELF::R_RISCV_SUB16);
- case 4:
- return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD32,
- FirstLiteralRelocationKind + ELF::R_RISCV_SUB32);
- case 8:
- return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD64,
- FirstLiteralRelocationKind + ELF::R_RISCV_SUB64);
- }
-}
-
} // end namespace llvm::RISCV
#endif
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
index 3655861..f70837e 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
@@ -68,36 +68,30 @@ void RISCVTargetStreamer::emitNoteGnuPropertySection(
const Triple &Triple = Ctx.getTargetTriple();
Align NoteAlign;
+ uint64_t DescSize;
if (Triple.isArch64Bit()) {
NoteAlign = Align(8);
+ DescSize = 16;
} else {
assert(Triple.isArch32Bit());
NoteAlign = Align(4);
+ DescSize = 12;
}
assert(Ctx.getObjectFileType() == MCContext::Environment::IsELF);
MCSection *const NoteSection =
Ctx.getELFSection(".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC);
- NoteSection->setAlignment(NoteAlign);
OutStreamer.pushSection();
OutStreamer.switchSection(NoteSection);
// Emit the note header
- OutStreamer.emitIntValue(4, 4); // n_namsz
-
- MCSymbol *const NDescBeginSym = Ctx.createTempSymbol();
- MCSymbol *const NDescEndSym = Ctx.createTempSymbol();
- const MCExpr *const NDescSzExpr =
- MCBinaryExpr::createSub(MCSymbolRefExpr::create(NDescEndSym, Ctx),
- MCSymbolRefExpr::create(NDescBeginSym, Ctx), Ctx);
-
- OutStreamer.emitValue(NDescSzExpr, 4); // n_descsz
+ OutStreamer.emitValueToAlignment(NoteAlign);
+ OutStreamer.emitIntValue(4, 4); // n_namsz
+ OutStreamer.emitIntValue(DescSize, 4); // n_descsz
OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4); // n_type
OutStreamer.emitBytes(StringRef("GNU", 4)); // n_name
// Emit n_desc field
- OutStreamer.emitLabel(NDescBeginSym);
- OutStreamer.emitValueToAlignment(NoteAlign);
// Emit the feature_1_and property
OutStreamer.emitIntValue(ELF::GNU_PROPERTY_RISCV_FEATURE_1_AND, 4); // pr_type
@@ -105,7 +99,6 @@ void RISCVTargetStreamer::emitNoteGnuPropertySection(
OutStreamer.emitIntValue(Feature1And, 4); // pr_data
OutStreamer.emitValueToAlignment(NoteAlign); // pr_padding
- OutStreamer.emitLabel(NDescEndSym);
OutStreamer.popSection();
}
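
The rewritten note emitter drops the begin/end temp labels and the symbol-difference expression for n_descsz: with exactly one property in the descriptor, its size is a constant, 12 bytes on RV32 (4-byte note alignment) and 16 on RV64 (8-byte alignment, including 4 bytes of pr_padding). A hedged sketch of the resulting layout for the 64-bit case:

#include <cstdint>

// Illustrative layout of the fixed-size .note.gnu.property now emitted
// (RV64): 12-byte header, "GNU" name, one feature_1_and property.
struct RiscvGnuPropertyNote64 {
  uint32_t n_namsz = 4;              // "GNU" + NUL
  uint32_t n_descsz = 16;            // constant, no label pair needed
  uint32_t n_type;                   // NT_GNU_PROPERTY_TYPE_0
  char     n_name[4] = {'G', 'N', 'U', 0};
  uint32_t pr_type;                  // GNU_PROPERTY_RISCV_FEATURE_1_AND
  uint32_t pr_datasz = 4;
  uint32_t pr_data;                  // Feature1And bits
  uint32_t pr_padding = 0;           // pad desc to 8-byte alignment
};
static_assert(sizeof(RiscvGnuPropertyNote64) == 32,
              "12-byte header + 4-byte name + 16-byte desc");
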
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index 82c0d8d..80a48c5 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -167,9 +167,8 @@ static std::pair<Value *, Value *> matchStridedStart(Value *Start,
default:
llvm_unreachable("Unexpected opcode");
case Instruction::Or:
- // TODO: We'd be better off creating disjoint or here, but we don't yet
- // have an IRBuilder API for that.
- [[fallthrough]];
+ Start = Builder.CreateOr(Start, Splat, "", /*IsDisjoint=*/true);
+ break;
case Instruction::Add:
Start = Builder.CreateAdd(Start, Splat);
break;
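
The TODO above is resolved by the IRBuilder overload that sets the disjoint flag directly, letting the strided-start computation emit `or disjoint`: an or whose operands share no set bits and which later analyses may therefore treat as an add. A minimal sketch of the call, assuming an in-scope IRBuilder and the same operands as the hunk:

// Sketch: emit `or disjoint %start, %splat`; the flag asserts the
// operands have no common set bits, so the or behaves like an add.
llvm::Value *NewStart =
    Builder.CreateOr(Start, Splat, /*Name=*/"", /*IsDisjoint=*/true);
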
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index adbfbeb..0077ecf 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -927,6 +927,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
{ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER},
VT, Custom);
+ setOperationAction(ISD::VP_LOAD_FF, VT, Custom);
setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR,
ISD::EXTRACT_SUBVECTOR, ISD::SCALAR_TO_VECTOR},
@@ -1105,6 +1106,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
{ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER},
VT, Custom);
+ setOperationAction(ISD::VP_LOAD_FF, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -1181,6 +1183,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER,
ISD::VP_SCATTER},
VT, Custom);
+ setOperationAction(ISD::VP_LOAD_FF, VT, Custom);
setOperationAction(ISD::FNEG, VT, Expand);
setOperationAction(ISD::FABS, VT, Expand);
@@ -1352,6 +1355,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER,
ISD::VP_SCATTER},
VT, Custom);
+ setOperationAction(ISD::VP_LOAD_FF, VT, Custom);
setOperationAction({ISD::ADD, ISD::MUL, ISD::SUB, ISD::AND, ISD::OR,
ISD::XOR, ISD::SDIV, ISD::SREM, ISD::UDIV,
@@ -1442,6 +1446,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_SCATTER, ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
ISD::EXPERIMENTAL_VP_STRIDED_STORE},
VT, Custom);
+ setOperationAction(ISD::VP_LOAD_FF, VT, Custom);
setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT,
@@ -7012,6 +7017,7 @@ static unsigned getRISCVVLOp(SDValue Op) {
OP_CASE(FDIV)
OP_CASE(FNEG)
OP_CASE(FABS)
+ OP_CASE(FCOPYSIGN)
OP_CASE(FSQRT)
OP_CASE(SMIN)
OP_CASE(SMAX)
@@ -7079,6 +7085,15 @@ static unsigned getRISCVVLOp(SDValue Op) {
if (Op.getSimpleValueType().getVectorElementType() == MVT::i1)
return RISCVISD::VMXOR_VL;
return RISCVISD::XOR_VL;
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ return RISCVISD::VZEXT_VL;
+ case ISD::SIGN_EXTEND:
+ return RISCVISD::VSEXT_VL;
+ case ISD::SETCC:
+ return RISCVISD::SETCC_VL;
+ case ISD::VSELECT:
+ return RISCVISD::VMERGE_VL;
case ISD::VP_SELECT:
case ISD::VP_MERGE:
return RISCVISD::VMERGE_VL;
@@ -7419,12 +7434,16 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
if (Op.getOperand(0).getValueType().isVector() &&
Op.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
return lowerVectorMaskExt(Op, DAG, /*ExtVal*/ 1);
- return lowerFixedLengthVectorExtendToRVV(Op, DAG, RISCVISD::VZEXT_VL);
+ if (Op.getValueType().isScalableVector())
+ return Op;
+ return lowerToScalableOp(Op, DAG);
case ISD::SIGN_EXTEND:
if (Op.getOperand(0).getValueType().isVector() &&
Op.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
return lowerVectorMaskExt(Op, DAG, /*ExtVal*/ -1);
- return lowerFixedLengthVectorExtendToRVV(Op, DAG, RISCVISD::VSEXT_VL);
+ if (Op.getValueType().isScalableVector())
+ return Op;
+ return lowerToScalableOp(Op, DAG);
case ISD::SPLAT_VECTOR_PARTS:
return lowerSPLAT_VECTOR_PARTS(Op, DAG);
case ISD::INSERT_VECTOR_ELT:
@@ -8103,6 +8122,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::MLOAD:
case ISD::VP_LOAD:
return lowerMaskedLoad(Op, DAG);
+ case ISD::VP_LOAD_FF:
+ return lowerLoadFF(Op, DAG);
case ISD::MSTORE:
case ISD::VP_STORE:
return lowerMaskedStore(Op, DAG);
@@ -8166,7 +8187,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
if (isPromotedOpNeedingSplit(Op.getOperand(0), Subtarget))
return SplitVectorOp(Op, DAG);
- return lowerFixedLengthVectorSetccToRVV(Op, DAG);
+ return lowerToScalableOp(Op, DAG);
}
case ISD::ADD:
case ISD::SUB:
@@ -8182,6 +8203,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::UREM:
case ISD::BSWAP:
case ISD::CTPOP:
+ case ISD::VSELECT:
return lowerToScalableOp(Op, DAG);
case ISD::SHL:
case ISD::SRA:
@@ -8250,14 +8272,12 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerToScalableOp(Op, DAG);
assert(Op.getOpcode() != ISD::CTTZ);
return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG);
- case ISD::VSELECT:
- return lowerFixedLengthVectorSelectToRVV(Op, DAG);
case ISD::FCOPYSIGN:
if (Op.getValueType() == MVT::f16 || Op.getValueType() == MVT::bf16)
return lowerFCOPYSIGN(Op, DAG, Subtarget);
if (isPromotedOpNeedingSplit(Op, Subtarget))
return SplitVectorOp(Op, DAG);
- return lowerFixedLengthVectorFCOPYSIGNToRVV(Op, DAG);
+ return lowerToScalableOp(Op, DAG);
case ISD::STRICT_FADD:
case ISD::STRICT_FSUB:
case ISD::STRICT_FMUL:
@@ -9694,33 +9714,6 @@ SDValue RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG,
return convertFromScalableVector(VecVT, Select, DAG, Subtarget);
}
-SDValue RISCVTargetLowering::lowerFixedLengthVectorExtendToRVV(
- SDValue Op, SelectionDAG &DAG, unsigned ExtendOpc) const {
- MVT ExtVT = Op.getSimpleValueType();
- // Only custom-lower extensions from fixed-length vector types.
- if (!ExtVT.isFixedLengthVector())
- return Op;
- MVT VT = Op.getOperand(0).getSimpleValueType();
- // Grab the canonical container type for the extended type. Infer the smaller
- // type from that to ensure the same number of vector elements, as we know
- // the LMUL will be sufficient to hold the smaller type.
- MVT ContainerExtVT = getContainerForFixedLengthVector(ExtVT);
- // Get the extended container type manually to ensure the same number of
- // vector elements between source and dest.
- MVT ContainerVT = MVT::getVectorVT(VT.getVectorElementType(),
- ContainerExtVT.getVectorElementCount());
-
- SDValue Op1 =
- convertToScalableVector(ContainerVT, Op.getOperand(0), DAG, Subtarget);
-
- SDLoc DL(Op);
- auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
-
- SDValue Ext = DAG.getNode(ExtendOpc, DL, ContainerExtVT, Op1, Mask, VL);
-
- return convertFromScalableVector(ExtVT, Ext, DAG, Subtarget);
-}
-
// Custom-lower truncations from vectors to mask vectors by using a mask and a
// setcc operation:
// (vXi1 = trunc vXiN vec) -> (vXi1 = setcc (and vec, 1), 0, ne)
@@ -12739,6 +12732,51 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
return DAG.getMergeValues({Result, Chain}, DL);
}
+SDValue RISCVTargetLowering::lowerLoadFF(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VT = Op->getSimpleValueType(0);
+
+ const auto *VPLoadFF = cast<VPLoadFFSDNode>(Op);
+ EVT MemVT = VPLoadFF->getMemoryVT();
+ MachineMemOperand *MMO = VPLoadFF->getMemOperand();
+ SDValue Chain = VPLoadFF->getChain();
+ SDValue BasePtr = VPLoadFF->getBasePtr();
+
+ SDValue Mask = VPLoadFF->getMask();
+ SDValue VL = VPLoadFF->getVectorLength();
+
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ MVT ContainerVT = VT;
+ if (VT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VT);
+ MVT MaskVT = getMaskTypeFor(ContainerVT);
+ Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+ }
+
+ unsigned IntID = Intrinsic::riscv_vleff_mask;
+ SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(IntID, DL, XLenVT),
+ DAG.getUNDEF(ContainerVT),
+ BasePtr,
+ Mask,
+ VL,
+ DAG.getTargetConstant(RISCVVType::TAIL_AGNOSTIC, DL, XLenVT)};
+
+ SDVTList VTs = DAG.getVTList({ContainerVT, Op->getValueType(1), MVT::Other});
+
+ SDValue Result =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MemVT, MMO);
+ SDValue OutVL = Result.getValue(1);
+ Chain = Result.getValue(2);
+
+ if (VT.isFixedLengthVector())
+ Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
+
+ return DAG.getMergeValues({Result, OutVL, Chain}, DL);
+}
+
SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -12834,31 +12872,6 @@ SDValue RISCVTargetLowering::lowerVectorCompress(SDValue Op,
return Res;
}
-SDValue
-RISCVTargetLowering::lowerFixedLengthVectorSetccToRVV(SDValue Op,
- SelectionDAG &DAG) const {
- MVT InVT = Op.getOperand(0).getSimpleValueType();
- MVT ContainerVT = getContainerForFixedLengthVector(InVT);
-
- MVT VT = Op.getSimpleValueType();
-
- SDValue Op1 =
- convertToScalableVector(ContainerVT, Op.getOperand(0), DAG, Subtarget);
- SDValue Op2 =
- convertToScalableVector(ContainerVT, Op.getOperand(1), DAG, Subtarget);
-
- SDLoc DL(Op);
- auto [Mask, VL] = getDefaultVLOps(VT.getVectorNumElements(), ContainerVT, DL,
- DAG, Subtarget);
- MVT MaskVT = getMaskTypeFor(ContainerVT);
-
- SDValue Cmp =
- DAG.getNode(RISCVISD::SETCC_VL, DL, MaskVT,
- {Op1, Op2, Op.getOperand(2), DAG.getUNDEF(MaskVT), Mask, VL});
-
- return convertFromScalableVector(VT, Cmp, DAG, Subtarget);
-}
-
SDValue RISCVTargetLowering::lowerVectorStrictFSetcc(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
@@ -12985,51 +12998,6 @@ SDValue RISCVTargetLowering::lowerABS(SDValue Op, SelectionDAG &DAG) const {
return Max;
}
-SDValue RISCVTargetLowering::lowerFixedLengthVectorFCOPYSIGNToRVV(
- SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- MVT VT = Op.getSimpleValueType();
- SDValue Mag = Op.getOperand(0);
- SDValue Sign = Op.getOperand(1);
- assert(Mag.getValueType() == Sign.getValueType() &&
- "Can only handle COPYSIGN with matching types.");
-
- MVT ContainerVT = getContainerForFixedLengthVector(VT);
- Mag = convertToScalableVector(ContainerVT, Mag, DAG, Subtarget);
- Sign = convertToScalableVector(ContainerVT, Sign, DAG, Subtarget);
-
- auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
-
- SDValue CopySign = DAG.getNode(RISCVISD::FCOPYSIGN_VL, DL, ContainerVT, Mag,
- Sign, DAG.getUNDEF(ContainerVT), Mask, VL);
-
- return convertFromScalableVector(VT, CopySign, DAG, Subtarget);
-}
-
-SDValue RISCVTargetLowering::lowerFixedLengthVectorSelectToRVV(
- SDValue Op, SelectionDAG &DAG) const {
- MVT VT = Op.getSimpleValueType();
- MVT ContainerVT = getContainerForFixedLengthVector(VT);
-
- MVT I1ContainerVT =
- MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
-
- SDValue CC =
- convertToScalableVector(I1ContainerVT, Op.getOperand(0), DAG, Subtarget);
- SDValue Op1 =
- convertToScalableVector(ContainerVT, Op.getOperand(1), DAG, Subtarget);
- SDValue Op2 =
- convertToScalableVector(ContainerVT, Op.getOperand(2), DAG, Subtarget);
-
- SDLoc DL(Op);
- SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
-
- SDValue Select = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, CC, Op1,
- Op2, DAG.getUNDEF(ContainerVT), VL);
-
- return convertFromScalableVector(VT, Select, DAG, Subtarget);
-}
-
SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op,
SelectionDAG &DAG) const {
const auto &TSInfo =
@@ -13056,7 +13024,9 @@ SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op,
// "cast" fixed length vector to a scalable vector.
assert(useRVVForFixedLengthVectorVT(V.getSimpleValueType()) &&
"Only fixed length vectors are supported!");
- Ops.push_back(convertToScalableVector(ContainerVT, V, DAG, Subtarget));
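+ // The operand's element type can differ from the result's, so rebuild the
+ // container type with this operand's element type and the same element
+ // count before casting to a scalable vector.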
+ MVT VContainerVT = ContainerVT.changeVectorElementType(
+ V.getSimpleValueType().getVectorElementType());
+ Ops.push_back(convertToScalableVector(VContainerVT, V, DAG, Subtarget));
}
SDLoc DL(Op);
@@ -21478,11 +21448,10 @@ bool RISCVTargetLowering::canCreateUndefOrPoisonForTargetNode(
// TODO: Add more target nodes.
switch (Op.getOpcode()) {
case RISCVISD::SELECT_CC:
- // Integer select_cc cannot create poison.
- // TODO: What are the FP poison semantics?
- // TODO: This instruction blocks poison from the unselected operand, can
- // we do anything with that?
- return !Op.getValueType().isInteger();
+ // Integer comparisons cannot create poison.
+ assert(Op.getOperand(0).getValueType().isInteger() &&
+ "RISCVISD::SELECT_CC only compares integers");
+ return false;
}
return TargetLowering::canCreateUndefOrPoisonForTargetNode(
Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
@@ -22550,6 +22519,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
constexpr StringLiteral SupportedInterruptKinds[] = {
"machine",
"supervisor",
+ "rnmi",
"qci-nest",
"qci-nonest",
"SiFive-CLIC-preemptible",
@@ -22567,6 +22537,8 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
reportFatalUsageError(
"'SiFive-CLIC-*' interrupt kinds require XSfmclic extension");
+ if (Kind == "rnmi" && !Subtarget.hasStdExtSmrnmi())
+ reportFatalUsageError("'rnmi' interrupt kind requires Smrnmi extension");
const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
if (Kind.starts_with("SiFive-CLIC-preemptible") && TFI->hasFP(MF))
reportFatalUsageError("'SiFive-CLIC-preemptible' interrupt kinds cannot "
@@ -23212,7 +23184,11 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
if (Kind == "supervisor")
RetOpc = RISCVISD::SRET_GLUE;
- else if (Kind == "qci-nest" || Kind == "qci-nonest") {
+ else if (Kind == "rnmi") {
+ assert(STI.hasFeature(RISCV::FeatureStdExtSmrnmi) &&
+ "Need Smrnmi extension for rnmi");
+ RetOpc = RISCVISD::MNRET_GLUE;
+ } else if (Kind == "qci-nest" || Kind == "qci-nonest") {
assert(STI.hasFeature(RISCV::FeatureVendorXqciint) &&
"Need Xqciint for qci-(no)nest");
RetOpc = RISCVISD::QC_C_MILEAVERET_GLUE;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index ca70c46..433b8be 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -526,6 +526,7 @@ private:
SDValue lowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerMaskedLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerLoadFF(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerMaskedStore(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVectorCompress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op,
@@ -534,9 +535,6 @@ private:
SDValue lowerMaskedScatter(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerFixedLengthVectorSetccToRVV(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerFixedLengthVectorSelectToRVV(SDValue Op,
- SelectionDAG &DAG) const;
SDValue lowerToScalableOp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVPOp(SDValue Op, SelectionDAG &DAG) const;
@@ -551,8 +549,6 @@ private:
SDValue lowerVPStridedLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVPStridedStore(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVPCttzElements(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerFixedLengthVectorExtendToRVV(SDValue Op, SelectionDAG &DAG,
- unsigned ExtendOpc) const;
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 6536078..8bd3830 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -75,6 +75,8 @@ def riscv_sret_glue : RVSDNode<"SRET_GLUE", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
def riscv_mret_glue : RVSDNode<"MRET_GLUE", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
+def riscv_mnret_glue : RVSDNode<"MNRET_GLUE", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
def riscv_mileaveret_glue : RVSDNode<"QC_C_MILEAVERET_GLUE", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
@@ -935,7 +937,6 @@ def MRET : Priv<"mret", 0b0011000>, Sched<[]> {
let rs1 = 0;
let rs2 = 0b00010;
}
-} // isBarrier = 1, isReturn = 1, isTerminator = 1
let Predicates = [HasStdExtSmrnmi] in {
def MNRET : Priv<"mnret", 0b0111000>, Sched<[]> {
@@ -944,6 +945,8 @@ def MNRET : Priv<"mnret", 0b0111000>, Sched<[]> {
let rs2 = 0b00010;
}
}// Predicates = [HasStdExtSmrnmi]
+} // isBarrier = 1, isReturn = 1, isTerminator = 1
+
def WFI : Priv<"wfi", 0b0001000>, Sched<[]> {
let rd = 0;
@@ -1801,6 +1804,8 @@ def : Pat<(riscv_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
def : Pat<(riscv_sret_glue), (SRET)>;
def : Pat<(riscv_mret_glue), (MRET)>;
+let Predicates = [HasStdExtSmrnmi] in
+def : Pat<(riscv_mnret_glue), (MNRET)>;
let isCall = 1, Defs = [X1] in {
let Predicates = [NoStdExtZicfilp] in
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 5265613..2c64b0c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -14,6 +14,14 @@
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
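+// qc_setwmi models QC.SETWMI, a store of a word-replicated value to multiple
+// consecutive words in memory; it is emitted by the Xqcilsm memset lowering
+// in RISCVSelectionDAGInfo.cpp.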
+def SDT_SetMultiple : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 3>,
+ SDTCisPtrTy<2>,
+ SDTCisVT<3, XLenVT>]>;
+
+def qc_setwmi : RVSDNode<"QC_SETWMI", SDT_SetMultiple,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
def uimm5nonzero : RISCVOp<XLenVT>,
ImmLeaf<XLenVT, [{return (Imm != 0) && isUInt<5>(Imm);}]> {
let ParserMatchClass = UImmAsmOperand<5, "NonZero">;
@@ -27,6 +35,8 @@ def uimm5nonzero : RISCVOp<XLenVT>,
}];
}
+def tuimm5nonzero : TImmLeaf<XLenVT, [{return (Imm != 0) && isUInt<5>(Imm);}]>;
+
def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT,
[{return (Imm > 3) && isUInt<5>(Imm);}]> {
let ParserMatchClass = UImmAsmOperand<5, "GT3">;
@@ -92,6 +102,8 @@ def uimm5slist : RISCVOp<XLenVT>, ImmLeaf<XLenVT,
}];
}
+def tuimm7_lsb00 : TImmLeaf<XLenVT, [{return isShiftedUInt<5, 2>(Imm);}]>;
+
def uimm10 : RISCVUImmLeafOp<10>;
def uimm11 : RISCVUImmLeafOp<11>;
@@ -457,6 +469,13 @@ class QCIRVInstRR<bits<5> funct5, DAGOperand InTyRs1, string opcodestr>
: RVInstR<{0b00, funct5}, 0b011, OPC_CUSTOM_0, (outs GPRNoX0:$rd),
(ins InTyRs1:$rs1, GPRNoX0:$rs2), opcodestr, "$rd, $rs1, $rs2">;
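+// Tied variant of QCIRVInstRR for instructions that read-modify-write their
+// destination: $rd is also an input, tied to the written result $rd_wb.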
+class QCIRVInstRRTied<bits<5> funct5, DAGOperand InTyRs1, string opcodestr>
+ : RVInstR<{0b00, funct5}, 0b011, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb),
+ (ins GPRNoX0:$rd, InTyRs1:$rs1, GPRNoX0:$rs2), opcodestr,
+ "$rd, $rs1, $rs2"> {
+ let Constraints = "$rd = $rd_wb";
+}
+
class QCIBitManipRII<bits<3> funct3, bits<2> funct2,
DAGOperand InTyRs1, string opcodestr>
: RVInstIBase<funct3, OPC_CUSTOM_0, (outs GPRNoX0:$rd),
@@ -470,11 +489,26 @@ class QCIBitManipRII<bits<3> funct3, bits<2> funct2,
let Inst{24-20} = shamt;
}
+class QCIBitManipRIITied<bits<3> funct3, bits<2> funct2,
+ DAGOperand InTyRs1, string opcodestr>
+ : RVInstIBase<funct3, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd,
+ InTyRs1:$rs1, uimm5_plus1:$width, uimm5:$shamt),
+ opcodestr, "$rd, $rs1, $width, $shamt"> {
+ let Constraints = "$rd = $rd_wb";
+ bits<5> shamt;
+ bits<5> width;
+
+ let Inst{31-30} = funct2;
+ let Inst{29-25} = width;
+ let Inst{24-20} = shamt;
+}
+
class QCIRVInstRI<bits<1> funct1, DAGOperand InTyImm11,
string opcodestr>
- : RVInstIBase<0b000, OPC_CUSTOM_0, (outs GPRNoX0:$rd),
- (ins GPRNoX0:$rs1, InTyImm11:$imm11), opcodestr,
+ : RVInstIBase<0b000, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb),
+ (ins GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm11:$imm11), opcodestr,
"$rd, $rs1, $imm11"> {
+ let Constraints = "$rd = $rd_wb";
bits<11> imm11;
let Inst{31-31} = funct1;
@@ -858,12 +892,12 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
let Inst{29-25} = width;
let Inst{24-20} = shamt;
}
- def QC_INSB : QCIBitManipRII<0b001, 0b01, GPR, "qc.insb">;
- def QC_INSBH : QCIBitManipRII<0b001, 0b10, GPR, "qc.insbh">;
- def QC_INSBR : QCIRVInstRR<0b00000, GPR, "qc.insbr">;
- def QC_INSBHR : QCIRVInstRR<0b00001, GPR, "qc.insbhr">;
- def QC_INSBPR : QCIRVInstRR<0b00010, GPR, "qc.insbpr">;
- def QC_INSBPRH : QCIRVInstRR<0b00011, GPR, "qc.insbprh">;
+ def QC_INSB : QCIBitManipRIITied<0b001, 0b01, GPR, "qc.insb">;
+ def QC_INSBH : QCIBitManipRIITied<0b001, 0b10, GPR, "qc.insbh">;
+ def QC_INSBR : QCIRVInstRRTied<0b00000, GPR, "qc.insbr">;
+ def QC_INSBHR : QCIRVInstRRTied<0b00001, GPR, "qc.insbhr">;
+ def QC_INSBPR : QCIRVInstRRTied<0b00010, GPR, "qc.insbpr">;
+ def QC_INSBPRH : QCIRVInstRRTied<0b00011, GPR, "qc.insbprh">;
def QC_EXTU : QCIBitManipRII<0b010, 0b00, GPRNoX0, "qc.extu">;
def QC_EXTDU : QCIBitManipRII<0b010, 0b10, GPRNoX31, "qc.extdu">;
def QC_EXTDUR : QCIRVInstRR<0b00100, GPRNoX31, "qc.extdur">;
@@ -1566,6 +1600,11 @@ def : QCISELECTIICCPat <SETEQ, QC_SELECTIIEQ>;
def : QCISELECTIICCPat <SETNE, QC_SELECTIINE>;
} // Predicates = [HasVendorXqcics, IsRV32]
+let Predicates = [HasVendorXqcilsm, IsRV32] in {
+def : Pat<(qc_setwmi GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7),
+ (QC_SETWMI GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7)>;
+} // Predicates = [HasVendorXqcilsm, IsRV32]
+
//===----------------------------------------------------------------------===//
// Compress Instruction tablegen backend.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index d2a6514..04ffb05 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -641,13 +641,15 @@ def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)),
let Predicates = [HasStdExtZbkb, IsRV32] in {
def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))),
(PACK GPR:$rs1, GPR:$rs2)>;
-def : Pat<(or (or
- (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)),
+
+// Match a pattern of 2 bytes being inserted into bits [31:16], with bits
+// [15:0] coming from a zero-extended value. We can use packh plus pack for
+// bits [31:16]. If bits [15:0] can also be formed by a packh, that is
+// matched separately.
+def : Pat<(or (or (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)),
(shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
- (or
- (shl (zexti8 (XLenVT GPR:$op0rs2)), (XLenVT 8)),
- (zexti8 (XLenVT GPR:$op0rs1)))),
- (PACK (XLenVT (PACKH GPR:$op0rs1, GPR:$op0rs2)),
+ (zexti16 (XLenVT GPR:$rs1))),
+ (PACK (XLenVT GPR:$rs1),
(XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
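+// e.g. (b << 24) | (a << 16) | zext16(c) becomes (PACK c, (PACKH a, b)):
+// PACKH forms the upper halfword and PACK concatenates the two halves.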
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index a250ac8..5a5a9ed 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -206,8 +206,6 @@ let Predicates = [HasStdExtZvksh], RVVConstraint = VS2Constraint in {
//===----------------------------------------------------------------------===//
defvar I32IntegerVectors = !filter(vti, AllIntegerVectors, !eq(vti.SEW, 32));
-defvar I32I64IntegerVectors = !filter(vti, AllIntegerVectors,
- !or(!eq(vti.SEW, 32), !eq(vti.SEW, 64)));
class ZvkI32IntegerVectors<string vd_lmul> {
list<VTypeInfo> vs2_types = !cond(!eq(vd_lmul, "M8") : !filter(vti, I32IntegerVectors, !le(vti.LMul.octuple, 32)),
@@ -1126,16 +1124,16 @@ let Predicates = [HasStdExtZvkned] in {
defm : VPatUnaryV_S_NoMaskVectorCrypto<"int_riscv_vaesz", "PseudoVAESZ", I32IntegerVectors>;
} // Predicates = [HasStdExtZvkned]
-let Predicates = [HasStdExtZvknha] in {
+let Predicates = [HasStdExtZvknhaOrZvknhb] in {
defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32IntegerVectors>;
- defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32IntegerVectors>;
+ defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I32IntegerVectors>;
defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32IntegerVectors, isSEWAware=true>;
-} // Predicates = [HasStdExtZvknha]
+} // Predicates = [HasStdExtZvknhaOrZvknhb]
let Predicates = [HasStdExtZvknhb] in {
- defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32I64IntegerVectors>;
- defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32I64IntegerVectors>;
- defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32I64IntegerVectors, isSEWAware=true>;
+ defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I64IntegerVectors>;
+ defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I64IntegerVectors>;
+ defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I64IntegerVectors, isSEWAware=true>;
} // Predicates = [HasStdExtZvknhb]
let Predicates = [HasStdExtZvksed] in {
diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp
index 6ecddad..041dd07 100644
--- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp
@@ -7,6 +7,8 @@
//===----------------------------------------------------------------------===//
#include "RISCVSelectionDAGInfo.h"
+#include "RISCVSubtarget.h"
+#include "llvm/CodeGen/SelectionDAG.h"
#define GET_SDNODE_DESC
#include "RISCVGenSDNodeInfo.inc"
@@ -62,3 +64,94 @@ void RISCVSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
}
#endif
}
+
+SDValue RISCVSelectionDAGInfo::EmitTargetCodeForMemset(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo) const {
+ const auto &Subtarget = DAG.getSubtarget<RISCVSubtarget>();
+ // We currently do this only for targets with the Xqcilsm vendor extension.
+ if (!Subtarget.hasVendorXqcilsm())
+ return SDValue();
+
+ // Do this only if we know the size at compile time.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return SDValue();
+
+ uint64_t NumberOfBytesToWrite = ConstantSize->getZExtValue();
+
+ // Do this only if the destination is word-aligned and we write a multiple
+ // of 4 bytes.
+ if (Alignment < 4 || (NumberOfBytesToWrite & 3) != 0)
+ return SDValue();
+
+ SmallVector<SDValue, 8> OutChains;
+ SDValue SrcValueReplicated = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
+ uint64_t NumberOfWords = NumberOfBytesToWrite / 4;
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto Volatile =
+ isVolatile ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
+
+ // Helper for constructing a QC_SETWMI node that stores SizeWords words at
+ // byte offset OffsetSetwmi from Dst.
+ auto getSetwmiNode = [&](uint8_t SizeWords, uint8_t OffsetSetwmi) -> SDValue {
+ SDValue Ops[] = {Chain, SrcValueReplicated, Dst,
+ DAG.getTargetConstant(SizeWords, dl, MVT::i32),
+ DAG.getTargetConstant(OffsetSetwmi, dl, MVT::i32)};
+ MachineMemOperand *BaseMemOperand = MF.getMachineMemOperand(
+ DstPtrInfo.getWithOffset(OffsetSetwmi),
+ MachineMemOperand::MOStore | Volatile, SizeWords * 4, Align(4));
+ return DAG.getMemIntrinsicNode(RISCVISD::QC_SETWMI, dl,
+ DAG.getVTList(MVT::Other), Ops, MVT::i32,
+ BaseMemOperand);
+ };
+
+ // If the value is an i8 and not the constant zero (zero needs no
+ // replication).
+ if ((Src.getValueType() == MVT::i8) && !isNullConstant(Src))
+ // Replicate byte to word by multiplication with 0x01010101.
+ SrcValueReplicated =
+ DAG.getNode(ISD::MUL, dl, MVT::i32, SrcValueReplicated,
+ DAG.getConstant(0x01010101ul, dl, MVT::i32));
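+ // e.g. Src == 0x2A replicates to 0x2A * 0x01010101 == 0x2A2A2A2A.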
+
+ // We limit a QC_SETWMI to 16 words or fewer to improve interruptibility.
+ // So for 1-16 words we use a single QC_SETWMI:
+ //
+ //   QC_SETWMI reg1, N, 0(reg2)
+ //
+ // For 17-32 words we use two QC_SETWMIs, the first covering 16 words and
+ // the second covering the remainder:
+ //
+ //   QC_SETWMI reg1, 16, 0(reg2)
+ //   QC_SETWMI reg1, N, 64(reg2)
+ //
+ // For 33-48 words we would like to use (16, 16, n), but the last QC_SETWMI
+ // would then need an offset of 128, which the instruction cannot encode.
+ // So in this case we use a length of 15 for the second instruction and do
+ // the rest with a third instruction, capping the maximum inlined size at
+ // 47 words (for now):
+ //
+ //   QC_SETWMI R2, R0, 16, 0
+ //   QC_SETWMI R2, R0, 15, 64
+ //   QC_SETWMI R2, R0, N, 124
+ //
+ // For 48 words or more, fall back to the target-independent memset.
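+ // For example, a 45-word (180-byte) memset emits three nodes:
+ // 16 words @ offset 0, 15 words @ 64, and 14 words @ 124.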
+ if (NumberOfWords >= 48)
+ return SDValue();
+
+ if (NumberOfWords <= 16) {
+ // 1 - 16 words
+ return getSetwmiNode(NumberOfWords, 0);
+ }
+
+ if (NumberOfWords <= 32) {
+ // 17 - 32 words
+ OutChains.push_back(getSetwmiNode(NumberOfWords - 16, 64));
+ OutChains.push_back(getSetwmiNode(16, 0));
+ } else {
+ // 33 - 47 words
+ OutChains.push_back(getSetwmiNode(NumberOfWords - 31, 124));
+ OutChains.push_back(getSetwmiNode(15, 64));
+ OutChains.push_back(getSetwmiNode(16, 0));
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h
index 641189f..08c8d11 100644
--- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h
@@ -34,6 +34,12 @@ public:
void verifyTargetNode(const SelectionDAG &DAG,
const SDNode *N) const override;
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, Align Alignment,
+ bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo) const override;
+
bool hasPassthruOp(unsigned Opcode) const {
return GenNodeInfo.getDesc(Opcode).TSFlags & RISCVISD::HasPassthruOpMask;
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index da6ac2f..3f2a83f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -642,12 +642,6 @@ void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
OptimizationLevel Level) {
LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated));
});
-
- PB.registerVectorizerEndEPCallback(
- [](FunctionPassManager &FPM, OptimizationLevel Level) {
- if (Level.isOptimizingForSpeed())
- FPM.addPass(createFunctionToLoopPassAdaptor(EVLIndVarSimplifyPass()));
- });
}
yaml::MachineFunctionInfo *
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 0d5eb86..67f924a 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -979,11 +979,11 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) const {
- // The interleaved memory access pass will lower interleaved memory ops (i.e
- // a load and store followed by a specific shuffle) to vlseg/vsseg
- // intrinsics.
- if (!UseMaskForCond && !UseMaskForGaps &&
- Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+ // The interleaved memory access pass will lower (de)interleave ops combined
+ // with an adjacent memory operation to vlseg/vsseg intrinsics. vlseg/vsseg
+ // only support masking per-iteration (i.e. condition), not per-segment (i.e.
+ // gap).
+ if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
auto *VTy = cast<VectorType>(VecTy);
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
// Need to make sure the type hasn't been scalarized
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index d62d99c..05d504c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -398,6 +398,10 @@ public:
bool enableInterleavedAccessVectorization() const override { return true; }
+ bool enableMaskedInterleavedAccessVectorization() const override {
+ return ST->hasVInstructions();
+ }
+
unsigned getMinTripCountTailFoldingThreshold() const override;
enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index c946451..37a71e8 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -69,6 +69,7 @@ struct OperandInfo {
// Represent as 1,2,4,8, ... and fractional indicator. This is because
// EMUL can take on values that don't map to RISCVVType::VLMUL values exactly.
// For example, a mask operand can have an EMUL less than MF8.
+ // If nullopt, then EMUL isn't used (i.e. only a single scalar is read).
std::optional<std::pair<unsigned, bool>> EMUL;
unsigned Log2EEW;
@@ -83,12 +84,14 @@ struct OperandInfo {
OperandInfo() = delete;
- static bool EMULAndEEWAreEqual(const OperandInfo &A, const OperandInfo &B) {
- return A.Log2EEW == B.Log2EEW && A.EMUL == B.EMUL;
- }
-
- static bool EEWAreEqual(const OperandInfo &A, const OperandInfo &B) {
- return A.Log2EEW == B.Log2EEW;
+ /// Return true if the EMUL and EEW produced by \p Def are compatible with
+ /// the EMUL and EEW used by \p User.
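+ /// A \p User EMUL of nullopt means only a single element is read (e.g. a
+ /// vector operand used as a scalar), in which case any producer EMUL is
+ /// compatible provided the EEW matches.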
+ static bool areCompatible(const OperandInfo &Def, const OperandInfo &User) {
+ if (Def.Log2EEW != User.Log2EEW)
+ return false;
+ if (User.EMUL && Def.EMUL != User.EMUL)
+ return false;
+ return true;
}
void print(raw_ostream &OS) const {
@@ -98,7 +101,7 @@ struct OperandInfo {
OS << "f";
OS << EMUL->first;
} else
- OS << "EMUL: unknown\n";
+ OS << "EMUL: none\n";
OS << ", EEW: " << (1 << Log2EEW);
}
};
@@ -1399,13 +1402,7 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const {
return std::nullopt;
}
- // If the operand is used as a scalar operand, then the EEW must be
- // compatible. Otherwise, the EMUL *and* EEW must be compatible.
- bool IsVectorOpUsedAsScalarOp = isVectorOpUsedAsScalarOp(UserOp);
- if ((IsVectorOpUsedAsScalarOp &&
- !OperandInfo::EEWAreEqual(*ConsumerInfo, *ProducerInfo)) ||
- (!IsVectorOpUsedAsScalarOp &&
- !OperandInfo::EMULAndEEWAreEqual(*ConsumerInfo, *ProducerInfo))) {
+ if (!OperandInfo::areCompatible(*ProducerInfo, *ConsumerInfo)) {
LLVM_DEBUG(
dbgs()
<< " Abort due to incompatible information for EMUL or EEW.\n");
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp
index ef84d43..5710cf2 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp
@@ -21,8 +21,7 @@ public:
SPIRVAsmBackend(llvm::endianness Endian) : MCAsmBackend(Endian) {}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override {}
+ uint8_t *Data, uint64_t Value, bool IsResolved) override {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index a7f6fbc..64d301e 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -375,7 +375,7 @@ void SPIRVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg())
O << '%' << (getIDFromRegister(Op.getReg().id()) + 1);
else if (Op.isImm())
- O << formatImm((int64_t)Op.getImm());
+ O << formatImm(Op.getImm());
else if (Op.isDFPImm())
O << formatImm((double)Op.getDFPImm());
else if (Op.isExpr())
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 947b574..2c3e087 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -21,7 +21,9 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsSPIRV.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/TypedPointerType.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <queue>
#include <unordered_set>
@@ -187,6 +189,8 @@ class SPIRVEmitIntrinsics
void applyDemangledPtrArgTypes(IRBuilder<> &B);
+ GetElementPtrInst *simplifyZeroLengthArrayGepInst(GetElementPtrInst *GEP);
+
bool runOnFunction(Function &F);
bool postprocessTypes(Module &M);
bool processFunctionPointers(Module &M);
@@ -1458,6 +1462,24 @@ static void createSaturatedConversionDecoration(Instruction *I,
createDecorationIntrinsic(I, SaturatedConversionNode, B);
}
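+
+// Decorate @llvm.fptosi.sat / @llvm.fptoui.sat calls with the SPIR-V
+// SaturatedConversion decoration so the conversion they lower to keeps its
+// saturating semantics.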
+static void addSaturatedDecorationToIntrinsic(Instruction *I, IRBuilder<> &B) {
+ if (auto *CI = dyn_cast<CallInst>(I)) {
+ if (Function *Fu = CI->getCalledFunction()) {
+ if (Fu->isIntrinsic()) {
+ const unsigned IntrinsicId = Fu->getIntrinsicID();
+ switch (IntrinsicId) {
+ case Intrinsic::fptosi_sat:
+ case Intrinsic::fptoui_sat:
+ createSaturatedConversionDecoration(I, B);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ }
+}
+
Instruction *SPIRVEmitIntrinsics::visitCallInst(CallInst &Call) {
if (!Call.isInlineAsm())
return &Call;
@@ -2543,6 +2565,30 @@ void SPIRVEmitIntrinsics::applyDemangledPtrArgTypes(IRBuilder<> &B) {
}
}
+GetElementPtrInst *
+SPIRVEmitIntrinsics::simplifyZeroLengthArrayGepInst(GetElementPtrInst *GEP) {
+ // getelementptr [0 x T], P, 0 (zero), I -> getelementptr T, P, I.
+ // If the source type is a 0-length array and the first index is 0, drop
+ // both the 0-length array type and the first index. This is a common
+ // pattern in the IR, e.g. when a zero-length array is used as a
+ // placeholder for a flexible (unbounded) array member.
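+ // e.g. getelementptr inbounds [0 x i32], ptr %p, i64 0, i64 %idx
+ // --> getelementptr inbounds i32, ptr %p, i64 %idx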
+ assert(GEP && "GEP is null");
+ Type *SrcTy = GEP->getSourceElementType();
+ SmallVector<Value *, 8> Indices(GEP->indices());
+ ArrayType *ArrTy = dyn_cast<ArrayType>(SrcTy);
+ if (ArrTy && ArrTy->getNumElements() == 0 &&
+ PatternMatch::match(Indices[0], PatternMatch::m_Zero())) {
+ IRBuilder<> Builder(GEP);
+ Indices.erase(Indices.begin());
+ SrcTy = ArrTy->getElementType();
+ Value *NewGEP = Builder.CreateGEP(SrcTy, GEP->getPointerOperand(), Indices,
+ "", GEP->getNoWrapFlags());
+ assert(llvm::isa<GetElementPtrInst>(NewGEP) && "NewGEP should be a GEP");
+ return cast<GetElementPtrInst>(NewGEP);
+ }
+ return nullptr;
+}
+
bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
if (Func.isDeclaration())
return false;
@@ -2560,14 +2606,30 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
AggrConstTypes.clear();
AggrStores.clear();
- // fix GEP result types ahead of inference
+ // Fix GEP result types ahead of inference, simplifying where possible.
+ // Instructions made dead by the simplification are collected here and
+ // erased afterwards.
+ SmallPtrSet<Instruction *, 4> DeadInsts;
for (auto &I : instructions(Func)) {
auto *Ref = dyn_cast<GetElementPtrInst>(&I);
if (!Ref || GR->findDeducedElementType(Ref))
continue;
+
+ GetElementPtrInst *NewGEP = simplifyZeroLengthArrayGepInst(Ref);
+ if (NewGEP) {
+ Ref->replaceAllUsesWith(NewGEP);
+ if (isInstructionTriviallyDead(Ref))
+ DeadInsts.insert(Ref);
+
+ Ref = NewGEP;
+ }
if (Type *GepTy = getGEPType(Ref))
GR->addDeducedElementType(Ref, normalizeType(GepTy));
}
+ // Remove dead instructions that were simplified and replaced.
+ for (auto *I : DeadInsts) {
+ assert(I->use_empty() && "Dead instruction should not have any uses left");
+ I->eraseFromParent();
+ }
processParamTypesByFunHeader(CurrF, B);
@@ -2640,6 +2702,7 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
if (isConvergenceIntrinsic(I))
continue;
+ addSaturatedDecorationToIntrinsic(I, B);
processInstrAfterVisit(I, B);
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 83fccdc..f1436d5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -828,6 +828,8 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems,
"Invalid array element type");
SPIRVType *SpvTypeInt32 = getOrCreateSPIRVIntegerType(32, MIRBuilder);
SPIRVType *ArrayType = nullptr;
+ const SPIRVSubtarget &ST =
+ cast<SPIRVSubtarget>(MIRBuilder.getMF().getSubtarget());
if (NumElems != 0) {
Register NumElementsVReg =
buildConstantInt(NumElems, MIRBuilder, SpvTypeInt32, EmitIR);
@@ -838,6 +840,10 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems,
.addUse(NumElementsVReg);
});
} else {
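+ // NumElems == 0 encodes an OpTypeRuntimeArray, which is only valid in
+ // shader SPIR-V; in release builds, where the assert compiles away, bail
+ // out instead of emitting an invalid type.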
+ assert(ST.isShader() && "Runtime arrays are not allowed in non-shader "
+ "SPIR-V modules.");
+ if (!ST.isShader())
+ return nullptr;
ArrayType = createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
return MIRBuilder.buildInstr(SPIRV::OpTypeRuntimeArray)
.addDef(createTypeVReg(MIRBuilder))
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index d4fa62a..e9f5ffa 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -665,6 +665,11 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg,
case TargetOpcode::G_FPTOUI:
return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertFToU);
+ case TargetOpcode::G_FPTOSI_SAT:
+ return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertFToS);
+ case TargetOpcode::G_FPTOUI_SAT:
+ return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertFToU);
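+ // The saturating behaviour is conveyed by the SaturatedConversion
+ // decoration added in SPIRVEmitIntrinsics; the conversion opcode itself is
+ // the same as for the non-saturating forms.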
+
case TargetOpcode::G_SITOFP:
return selectIToF(ResVReg, ResType, I, true, SPIRV::OpConvertSToF);
case TargetOpcode::G_UITOFP:
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 1995e0f..170bddd 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -203,6 +203,10 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
.legalForCartesianProduct(allIntScalarsAndVectors,
allFloatScalarsAndVectors);
+ getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
+ .legalForCartesianProduct(allIntScalarsAndVectors,
+ allFloatScalarsAndVectors);
+
getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
.legalForCartesianProduct(allFloatScalarsAndVectors,
allScalarsAndVectors);
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 0cd9d78..ab06fc0 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -744,8 +744,14 @@ void SPIRV::RequirementHandler::checkSatisfiable(
IsSatisfiable = false;
}
+ AvoidCapabilitiesSet AvoidCaps;
+ if (!ST.isShader())
+ AvoidCaps.S.insert(SPIRV::Capability::Shader);
+ else
+ AvoidCaps.S.insert(SPIRV::Capability::Kernel);
+
for (auto Cap : MinimalCaps) {
- if (AvailableCaps.contains(Cap))
+ if (AvailableCaps.contains(Cap) && !AvoidCaps.S.contains(Cap))
continue;
LLVM_DEBUG(dbgs() << "Capability not supported: "
<< getSymbolicOperandMnemonic(
@@ -1865,6 +1871,11 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(SPIRV::Capability::TernaryBitwiseFunctionINTEL);
break;
}
+ case SPIRV::OpCopyMemorySized: {
+ Reqs.addCapability(SPIRV::Capability::Addresses);
+ // TODO: Add UntypedPointersKHR when implemented.
+ break;
+ }
default:
break;
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index ba023af..bc60842 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -127,8 +127,7 @@ public:
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool writeNopData(raw_ostream &OS, uint64_t Count,
const MCSubtargetInfo *STI) const override {
@@ -253,21 +252,19 @@ MCFixupKindInfo SparcAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
}
void SparcAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
if (!IsResolved)
return;
Value = adjustFixupValue(Fixup.getKind(), Value);
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- unsigned Offset = Fixup.getOffset();
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
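+ // Data already points at the fixup's offset within the fragment, so the
+ // loop indexes from zero.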
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = Endian == llvm::endianness::little ? i : (NumBytes - 1) - i;
- Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
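The same applyFixup signature migration repeats for SystemZ, VE, and
WebAssembly below: Data changes from a MutableArrayRef<char> covering the
whole fragment, indexed via Fixup.getOffset(), to a uint8_t * already
advanced to the fixup's location. A sketch of the presumed caller-side
contract (the MC-layer call site is not part of this diff, so the names are
illustrative):

  // Hypothetical call site inside the MC layer.
  uint8_t *FixupPtr = FragmentContents.data() + Fixup.getOffset();
  Backend.applyFixup(F, Fixup, Target, FixupPtr, Value, IsResolved);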
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index a95c4ff..d2071c3 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -58,8 +58,8 @@ unsigned SparcELFObjectWriter::getRelocType(const MCFixup &Fixup,
case ELF::R_SPARC_TLS_IE_ADD:
case ELF::R_SPARC_TLS_LE_HIX22:
case ELF::R_SPARC_TLS_LE_LOX10:
- if (auto *SA = Target.getAddSym())
- cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
+ if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
break;
default:
break;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp
index 8b5587a..1bca5c7 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp
@@ -111,8 +111,8 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
case SystemZ::S_TLSLD:
case SystemZ::S_TLSLDM:
case SystemZ::S_DTPOFF:
- if (auto *SA = Target.getAddSym())
- cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
+ if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
break;
default:
break;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index b2cfd04..d692cbe 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -113,8 +113,7 @@ public:
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool writeNopData(raw_ostream &OS, uint64_t Count,
const MCSubtargetInfo *STI) const override;
};
@@ -152,20 +151,18 @@ MCFixupKindInfo SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
}
void SystemZMCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
if (Target.getSpecifier())
IsResolved = false;
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
MCFixupKind Kind = Fixup.getKind();
if (mc::isRelocation(Kind))
return;
- unsigned Offset = Fixup.getOffset();
unsigned BitSize = getFixupKindInfo(Kind).TargetSize;
unsigned Size = (BitSize + 7) / 8;
- assert(Offset + Size <= F.getSize() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + Size <= F.getSize() && "Invalid fixup offset!");
// Big-endian insertion of Size bytes.
Value = extractBitsForFixup(Kind, Value, Fixup, getContext());
@@ -173,7 +170,7 @@ void SystemZMCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
Value &= ((uint64_t)1 << BitSize) - 1;
unsigned ShiftValue = (Size * 8) - 8;
for (unsigned I = 0; I != Size; ++I) {
- Data[Offset + I] |= uint8_t(Value >> ShiftValue);
+ Data[I] |= uint8_t(Value >> ShiftValue);
ShiftValue -= 8;
}
}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index ae6ca55a36..783f86a 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -1286,7 +1286,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
if ((Opcode == SystemZ::ALFI && OpNum == 0 &&
isInt<8>((int32_t)MI.getOperand(2).getImm())) ||
(Opcode == SystemZ::ALGFI && OpNum == 0 &&
- isInt<8>((int64_t)MI.getOperand(2).getImm()))) {
+ isInt<8>(MI.getOperand(2).getImm()))) {
// AL(G)FI %reg, CONST -> AL(G)SI %mem, CONST
Opcode = (Opcode == SystemZ::ALFI ? SystemZ::ALSI : SystemZ::ALGSI);
MachineInstr *BuiltMI =
@@ -1301,7 +1301,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
if ((Opcode == SystemZ::SLFI && OpNum == 0 &&
isInt<8>((int32_t)-MI.getOperand(2).getImm())) ||
(Opcode == SystemZ::SLGFI && OpNum == 0 &&
- isInt<8>((int64_t)-MI.getOperand(2).getImm()))) {
+ isInt<8>(-MI.getOperand(2).getImm()))) {
// SL(G)FI %reg, CONST -> AL(G)SI %mem, -CONST
Opcode = (Opcode == SystemZ::SLFI ? SystemZ::ALSI : SystemZ::ALGSI);
MachineInstr *BuiltMI =
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
index b02b6af..c1b9d9f 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
@@ -112,8 +112,7 @@ public:
}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &,
- MutableArrayRef<char>, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *, uint64_t Value, bool IsResolved) override;
bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands,
const MCSubtargetInfo &STI) const override {
@@ -152,7 +151,7 @@ public:
} // end anonymous namespace
void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target, MutableArrayRef<char> Data,
+ const MCValue &Target, uint8_t *Data,
uint64_t Value, bool IsResolved) {
switch (Fixup.getKind()) {
case VE::fixup_ve_tls_gd_hi32:
@@ -173,14 +172,14 @@ void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
Value <<= Info.TargetOffset;
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the bits
// from the fixup value. The Value has been "split up" into the
// appropriate bitfields above.
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = Endian == llvm::endianness::little ? i : (NumBytes - 1) - i;
- Data[Offset + Idx] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
+ Data[Idx] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
}
}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
index 41f31eb..c702064 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
@@ -44,8 +44,8 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup,
case VE::S_TLS_GD_LO32:
case VE::S_TPOFF_HI32:
case VE::S_TPOFF_LO32:
- if (auto *SA = Target.getAddSym())
- cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
+ if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS);
break;
default:
break;
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 6ae69a4..80df4ed 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -212,12 +212,12 @@ static wasm::WasmLimits defaultLimits() {
static MCSymbolWasm *getOrCreateFunctionTableSymbol(MCContext &Ctx,
const StringRef &Name,
bool Is64) {
- MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name));
+ auto *Sym = static_cast<MCSymbolWasm *>(Ctx.lookupSymbol(Name));
if (Sym) {
if (!Sym->isFunctionTable())
Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table");
} else {
- Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name));
+ Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name));
Sym->setFunctionTable(Is64);
// The default function table is synthesized by the linker.
Sym->setUndefined();
@@ -703,7 +703,7 @@ public:
ExpectBlockType = false;
// The "true" here will cause this to be a nameless symbol.
MCSymbol *Sym = Ctx.createTempSymbol("typeindex", true);
- auto *WasmSym = cast<MCSymbolWasm>(Sym);
+ auto *WasmSym = static_cast<MCSymbolWasm *>(Sym);
WasmSym->setSignature(Signature);
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
const MCExpr *Expr =
@@ -949,7 +949,8 @@ public:
return error("Unknown type in .globaltype modifier: ", TypeTok);
}
// Now set this symbol with the correct type.
- auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
+ auto *WasmSym =
+ static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName));
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
WasmSym->setGlobalType(wasm::WasmGlobalType{uint8_t(*Type), Mutable});
// And emit the directive again.
@@ -980,7 +981,8 @@ public:
// Now that we have the name and table type, we can actually create the
// symbol
- auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
+ auto *WasmSym =
+ static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName));
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE);
if (Is64) {
Limits.Flags |= wasm::WASM_LIMITS_FLAG_IS_64;
@@ -1000,7 +1002,8 @@ public:
auto SymName = expectIdent();
if (SymName.empty())
return ParseStatus::Failure;
- auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
+ auto *WasmSym =
+ static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName));
if (WasmSym->isDefined()) {
// We push 'Function' either when a label is parsed or a .functype
// directive is parsed. The reason it is not easy to do this uniformly
@@ -1042,7 +1045,8 @@ public:
auto ExportName = expectIdent();
if (ExportName.empty())
return ParseStatus::Failure;
- auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
+ auto *WasmSym =
+ static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName));
WasmSym->setExportName(Ctx.allocateString(ExportName));
TOut.emitExportName(WasmSym, ExportName);
return expect(AsmToken::EndOfStatement, "EOL");
@@ -1057,7 +1061,8 @@ public:
auto ImportModule = expectIdent();
if (ImportModule.empty())
return ParseStatus::Failure;
- auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
+ auto *WasmSym =
+ static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName));
WasmSym->setImportModule(Ctx.allocateString(ImportModule));
TOut.emitImportModule(WasmSym, ImportModule);
return expect(AsmToken::EndOfStatement, "EOL");
@@ -1072,7 +1077,8 @@ public:
auto ImportName = expectIdent();
if (ImportName.empty())
return ParseStatus::Failure;
- auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
+ auto *WasmSym =
+ static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName));
WasmSym->setImportName(Ctx.allocateString(ImportName));
TOut.emitImportName(WasmSym, ImportName);
return expect(AsmToken::EndOfStatement, "EOL");
@@ -1082,7 +1088,8 @@ public:
auto SymName = expectIdent();
if (SymName.empty())
return ParseStatus::Failure;
- auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
+ auto *WasmSym =
+ static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName));
auto *Signature = Ctx.createWasmSignature();
if (parseRegTypeList(Signature->Params))
return ParseStatus::Failure;
@@ -1224,7 +1231,7 @@ public:
if (!CWS->isText())
return;
- auto *WasmSym = cast<MCSymbolWasm>(Symbol);
+ auto *WasmSym = static_cast<MCSymbolWasm *>(Symbol);
// Unlike other targets, we don't allow data in text sections (labels
// declared with .type @object).
if (WasmSym->getType() == wasm::WASM_SYMBOL_TYPE_DATA) {
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
index 4a305ab..6943888 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
@@ -258,7 +258,7 @@ bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc,
const MCSymbolRefExpr *SymRef;
if (getSymRef(ErrorLoc, GlobalOp, SymRef))
return true;
- const auto *WasmSym = cast<MCSymbolWasm>(&SymRef->getSymbol());
+ auto *WasmSym = static_cast<const MCSymbolWasm *>(&SymRef->getSymbol());
switch (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA)) {
case wasm::WASM_SYMBOL_TYPE_GLOBAL:
Type = static_cast<wasm::ValType>(WasmSym->getGlobalType().Type);
@@ -286,7 +286,7 @@ bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCOperand &TableOp,
const MCSymbolRefExpr *SymRef;
if (getSymRef(ErrorLoc, TableOp, SymRef))
return true;
- const auto *WasmSym = cast<MCSymbolWasm>(&SymRef->getSymbol());
+ auto *WasmSym = static_cast<const MCSymbolWasm *>(&SymRef->getSymbol());
if (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA) !=
wasm::WASM_SYMBOL_TYPE_TABLE)
return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() +
@@ -302,7 +302,7 @@ bool WebAssemblyAsmTypeCheck::getSignature(SMLoc ErrorLoc,
const MCSymbolRefExpr *SymRef = nullptr;
if (getSymRef(ErrorLoc, SigOp, SymRef))
return true;
- const auto *WasmSym = cast<MCSymbolWasm>(&SymRef->getSymbol());
+ auto *WasmSym = static_cast<const MCSymbolWasm *>(&SymRef->getSymbol());
Sig = WasmSym->getSignature();
if (!Sig || WasmSym->getType() != Type) {
diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 0f7b27b..2a398d4 100644
--- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -237,7 +237,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
} else {
// We don't have access to the signature, so create a symbol without one
MCSymbol *Sym = getContext().createTempSymbol("typeindex", true);
- auto *WasmSym = cast<MCSymbolWasm>(Sym);
+ auto *WasmSym = static_cast<MCSymbolWasm *>(Sym);
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
const MCExpr *Expr = MCSymbolRefExpr::create(
WasmSym, WebAssembly::S_TYPEINDEX, getContext());
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index 84eb15f..eecef31 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -39,7 +39,7 @@ public:
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value, bool) override;
+ uint8_t *Data, uint64_t Value, bool) override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override;
@@ -80,8 +80,7 @@ bool WebAssemblyAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
void WebAssemblyAsmBackend::applyFixup(const MCFragment &F,
const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data,
+ const MCValue &Target, uint8_t *Data,
uint64_t Value, bool IsResolved) {
if (!IsResolved)
Asm->getWriter().recordRelocation(F, Fixup, Target, Value);
@@ -96,13 +95,13 @@ void WebAssemblyAsmBackend::applyFixup(const MCFragment &F,
// Shift the value into position.
Value <<= Info.TargetOffset;
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!");
+ assert(Fixup.getOffset() + NumBytes <= F.getSize() &&
+ "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
for (unsigned I = 0; I != NumBytes; ++I)
- Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff);
+ Data[I] |= uint8_t((Value >> (I * 8)) & 0xff);
}
std::unique_ptr<MCObjectTargetWriter>
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
index 2e97215..d8bfed9 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
@@ -380,7 +380,7 @@ void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI,
O << WebAssembly::anyTypeToString(Imm);
} else {
auto Expr = cast<MCSymbolRefExpr>(Op.getExpr());
- auto *Sym = cast<MCSymbolWasm>(&Expr->getSymbol());
+ auto *Sym = static_cast<const MCSymbolWasm *>(&Expr->getSymbol());
if (Sym->getSignature()) {
O << WebAssembly::signatureToString(Sym->getSignature());
} else {
@@ -398,10 +398,10 @@ void WebAssemblyInstPrinter::printCatchList(const MCInst *MI, unsigned OpNo,
auto PrintTagOp = [&](const MCOperand &Op) {
const MCSymbolRefExpr *TagExpr = nullptr;
- const MCSymbolWasm *TagSym = nullptr;
+ const MCSymbol *TagSym = nullptr;
if (Op.isExpr()) {
TagExpr = cast<MCSymbolRefExpr>(Op.getExpr());
- TagSym = cast<MCSymbolWasm>(&TagExpr->getSymbol());
+ TagSym = &TagExpr->getSymbol();
O << TagSym->getName() << " ";
} else {
// When instructions are parsed from the disassembler, we have an
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index cbaf10f..7096104 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -107,7 +107,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
encodeULEB128(uint32_t(MO.getImm()), OS);
break;
case WebAssembly::OPERAND_I64IMM:
- encodeSLEB128(int64_t(MO.getImm()), OS);
+ encodeSLEB128(MO.getImm(), OS);
break;
case WebAssembly::OPERAND_SIGNATURE:
case WebAssembly::OPERAND_VEC_I8IMM:
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index 2cf4bec..ffbc7e1 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -66,7 +66,7 @@ static const MCSection *getTargetSection(const MCExpr *Expr) {
unsigned WebAssemblyWasmObjectWriter::getRelocType(
const MCValue &Target, const MCFixup &Fixup,
const MCSectionWasm &FixupSection, bool IsLocRel) const {
- auto &SymA = cast<MCSymbolWasm>(*Target.getAddSym());
+ auto &SymA = static_cast<const MCSymbolWasm &>(*Target.getAddSym());
auto Spec = WebAssembly::Specifier(Target.getSpecifier());
switch (Spec) {
case WebAssembly::S_GOT:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 1bf070e..db832bc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -171,10 +171,10 @@ MCSymbolWasm *WebAssemblyAsmPrinter::getMCSymbolForFunction(
WebAssembly::signatureToString(Sig);
report_fatal_error(Twine(Msg));
}
- WasmSym = cast<MCSymbolWasm>(
+ WasmSym = static_cast<MCSymbolWasm *>(
GetExternalSymbolSymbol(getEmscriptenInvokeSymbolName(Sig)));
} else {
- WasmSym = cast<MCSymbolWasm>(getSymbol(F));
+ WasmSym = static_cast<MCSymbolWasm *>(getSymbol(F));
}
return WasmSym;
}
@@ -186,9 +186,7 @@ void WebAssemblyAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
}
assert(!GV->isThreadLocal());
-
- MCSymbolWasm *Sym = cast<MCSymbolWasm>(getSymbol(GV));
-
+ auto *Sym = static_cast<MCSymbolWasm *>(getSymbol(GV));
if (!Sym->getType()) {
SmallVector<MVT, 1> VTs;
Type *GlobalVT = GV->getValueType();
@@ -218,8 +216,7 @@ void WebAssemblyAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
}
MCSymbol *WebAssemblyAsmPrinter::getOrCreateWasmSymbol(StringRef Name) {
- auto *WasmSym = cast<MCSymbolWasm>(GetExternalSymbolSymbol(Name));
-
+ auto *WasmSym = static_cast<MCSymbolWasm *>(GetExternalSymbolSymbol(Name));
// May be called multiple times, so early out.
if (WasmSym->getType())
return WasmSym;
@@ -312,7 +309,7 @@ void WebAssemblyAsmPrinter::emitDecls(const Module &M) {
// not be found here.
MachineModuleInfoWasm &MMIW = MMI->getObjFileInfo<MachineModuleInfoWasm>();
for (StringRef Name : MMIW.MachineSymbolsUsed) {
- auto *WasmSym = cast<MCSymbolWasm>(getOrCreateWasmSymbol(Name));
+ auto *WasmSym = static_cast<MCSymbolWasm *>(getOrCreateWasmSymbol(Name));
if (WasmSym->isFunction()) {
// TODO(wvo): is there any case where this overlaps with the call to
// emitFunctionType in the loop below?
@@ -324,7 +321,7 @@ void WebAssemblyAsmPrinter::emitDecls(const Module &M) {
// Emit .globaltype, .tagtype, or .tabletype declarations for extern
// declarations, i.e. those that have only been declared (but not defined)
// in the current module
- auto Sym = cast_or_null<MCSymbolWasm>(It.getValue().Symbol);
+ auto *Sym = static_cast<MCSymbolWasm *>(It.getValue().Symbol);
if (Sym && !Sym->isDefined())
emitSymbolType(Sym);
}
@@ -381,7 +378,7 @@ void WebAssemblyAsmPrinter::emitDecls(const Module &M) {
}
if (F.hasFnAttribute("wasm-export-name")) {
- auto *Sym = cast<MCSymbolWasm>(getSymbol(&F));
+ auto *Sym = static_cast<MCSymbolWasm *>(getSymbol(&F));
StringRef Name = F.getFnAttribute("wasm-export-name").getValueAsString();
Sym->setExportName(OutContext.allocateString(Name));
getTargetStreamer()->emitExportName(Sym, Name);
@@ -581,7 +578,7 @@ void WebAssemblyAsmPrinter::EmitFunctionAttributes(Module &M) {
auto *GV = cast<GlobalVariable>(CS->getOperand(1)->stripPointerCasts());
StringRef AnnotationString;
getConstantStringInfo(GV, AnnotationString);
- auto *Sym = cast<MCSymbolWasm>(getSymbol(F));
+ auto *Sym = static_cast<MCSymbolWasm *>(getSymbol(F));
CustomSections[AnnotationString].push_back(Sym);
}
@@ -618,7 +615,7 @@ void WebAssemblyAsmPrinter::emitFunctionBodyStart() {
computeSignatureVTs(F.getFunctionType(), &F, F, TM, ParamVTs, ResultVTs);
auto Signature = signatureFromMVTs(OutContext, ResultVTs, ParamVTs);
- auto *WasmSym = cast<MCSymbolWasm>(CurrentFnSym);
+ auto *WasmSym = static_cast<MCSymbolWasm *>(CurrentFnSym);
WasmSym->setSignature(Signature);
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 4613fcb..e48283a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -52,7 +52,7 @@ MCSymbol *
WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
const GlobalValue *Global = MO.getGlobal();
if (!isa<Function>(Global)) {
- auto *WasmSym = cast<MCSymbolWasm>(Printer.getSymbol(Global));
+ auto *WasmSym = static_cast<MCSymbolWasm *>(Printer.getSymbol(Global));
// If the symbol doesn't have an explicit WasmSymbolType yet and the
// GlobalValue is actually a WebAssembly global, then ensure the symbol is a
// WASM_SYMBOL_TYPE_GLOBAL.
@@ -123,7 +123,7 @@ MCOperand WebAssemblyMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Spec, Ctx);
if (MO.getOffset() != 0) {
- const auto *WasmSym = cast<MCSymbolWasm>(Sym);
+ const auto *WasmSym = static_cast<const MCSymbolWasm *>(Sym);
if (TargetFlags == WebAssemblyII::MO_GOT)
report_fatal_error("GOT symbol references do not support offsets");
if (WasmSym->isFunction())
@@ -148,12 +148,12 @@ MCOperand WebAssemblyMCInstLower::lowerTypeIndexOperand(
auto Signature = Ctx.createWasmSignature();
Signature->Returns = std::move(Returns);
Signature->Params = std::move(Params);
- MCSymbol *Sym = Printer.createTempSymbol("typeindex");
- auto *WasmSym = cast<MCSymbolWasm>(Sym);
- WasmSym->setSignature(Signature);
- WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+ auto *Sym =
+ static_cast<MCSymbolWasm *>(Printer.createTempSymbol("typeindex"));
+ Sym->setSignature(Signature);
+ Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
const MCExpr *Expr =
- MCSymbolRefExpr::create(WasmSym, WebAssembly::S_TYPEINDEX, Ctx);
+ MCSymbolRefExpr::create(Sym, WebAssembly::S_TYPEINDEX, Ctx);
return MCOperand::createExpr(Expr);
}
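The refactor above also collapses the temp symbol and its wasm downcast into one variable. A sketch of the resulting flow, assuming a signature built elsewhere (the helper name is hypothetical):

#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbolWasm.h"

using namespace llvm;

// Create an anonymous "typeindex" symbol and stamp it as a function with the
// given signature, mirroring lowerTypeIndexOperand after the cleanup.
static MCSymbolWasm *makeTypeIndexSymbol(MCContext &Ctx,
                                         wasm::WasmSignature *Sig) {
  auto *Sym = static_cast<MCSymbolWasm *>(Ctx.createTempSymbol("typeindex"));
  Sym->setSignature(Sig);
  Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
  return Sym;
}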
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index 747ef18..42d1271 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -104,13 +104,13 @@ const MachineOperand &WebAssembly::getCalleeOp(const MachineInstr &MI) {
MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol(
MCContext &Ctx, const WebAssemblySubtarget *Subtarget) {
StringRef Name = "__indirect_function_table";
- MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name));
+ auto *Sym = static_cast<MCSymbolWasm *>(Ctx.lookupSymbol(Name));
if (Sym) {
if (!Sym->isFunctionTable())
Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table");
} else {
bool is64 = Subtarget && Subtarget->getTargetTriple().isArch64Bit();
- Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name));
+ Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name));
Sym->setFunctionTable(is64);
// The default function table is synthesized by the linker.
Sym->setUndefined();
@@ -124,12 +124,12 @@ MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol(
MCSymbolWasm *WebAssembly::getOrCreateFuncrefCallTableSymbol(
MCContext &Ctx, const WebAssemblySubtarget *Subtarget) {
StringRef Name = "__funcref_call_table";
- MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name));
+ auto *Sym = static_cast<MCSymbolWasm *>(Ctx.lookupSymbol(Name));
if (Sym) {
if (!Sym->isFunctionTable())
Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table");
} else {
- Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name));
+ Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name));
// Setting Weak ensures only one table is left after linking when multiple
// modules define the table.
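Both table helpers follow the same lookup-or-create idiom; a condensed sketch, assuming the wasm-only-symbols invariant noted earlier (the helper name is hypothetical):

#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbolWasm.h"

using namespace llvm;

// lookupSymbol() returns null until someone references the table; only on
// first creation do we stamp the wasm attributes, so later lookups see a
// fully configured symbol. (Error checking from the real code is omitted.)
static MCSymbolWasm *getOrCreateTable(MCContext &Ctx, const Twine &Name,
                                      bool Is64) {
  if (auto *Sym = static_cast<MCSymbolWasm *>(Ctx.lookupSymbol(Name)))
    return Sym;
  auto *Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name));
  Sym->setFunctionTable(Is64);
  Sym->setUndefined(); // the linker synthesizes the actual table
  return Sym;
}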
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 1efef83..56a4cc3 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -174,8 +174,7 @@ public:
std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &,
uint64_t &) override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands,
const MCSubtargetInfo &STI) const override;
@@ -512,9 +511,8 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
isFirstMacroFusibleInst(Inst, *MCII))) {
// If we meet an unfused branch or the first instruction in a fusible pair,
// insert a BoundaryAlign fragment.
- PendingBA = OS.getContext().allocFragment<MCBoundaryAlignFragment>(
- AlignBoundary, STI);
- OS.insert(PendingBA);
+ PendingBA =
+ OS.newSpecialFragment<MCBoundaryAlignFragment>(AlignBoundary, STI);
}
}
@@ -676,9 +674,8 @@ std::optional<bool> X86AsmBackend::evaluateFixup(const MCFragment &,
}
void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
// Force relocation when there is a specifier. This might be too conservative
// - GAS doesn't emit a relocation for call local@plt; local:.
if (Target.getSpecifier())
@@ -710,7 +707,7 @@ void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
}
for (unsigned i = 0; i != Size; ++i)
- Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
+ Data[i] = uint8_t(Value >> (i * 8));
}
bool X86AsmBackend::mayNeedRelaxation(unsigned Opcode,
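The applyFixup signature change is mechanical but worth spelling out: the old MutableArrayRef<char> covered the whole fragment, so every write was offset by Fixup.getOffset(); the new uint8_t * arrives already advanced to the fixup location, so writes start at Data[0]. A standalone sketch of the little-endian patch loop:

#include <cstdint>

// Write the low 'Size' bytes of Value at Data, least significant byte first.
// 'Data' is assumed to point directly at the fixup, not at the fragment start.
static void patchLE(uint8_t *Data, uint64_t Value, unsigned Size) {
  for (unsigned I = 0; I != Size; ++I)
    Data[I] = uint8_t(Value >> (I * 8));
}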
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 3323b38..ea0abdd 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -349,8 +349,8 @@ unsigned X86ELFObjectWriter::getRelocType(const MCFixup &Fixup,
case X86::S_TLSLDM:
case X86::S_TPOFF:
case X86::S_DTPOFF:
- if (auto *S = Target.getAddSym())
- cast<MCSymbolELF>(S)->setType(ELF::STT_TLS);
+ if (auto *S = const_cast<MCSymbol *>(Target.getAddSym()))
+ static_cast<MCSymbolELF *>(S)->setType(ELF::STT_TLS);
break;
default:
break;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index b8e117b..ff27005 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -369,7 +369,7 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
printRegName(O, Op.getReg());
} else if (Op.isImm()) {
- markup(O, Markup::Immediate) << formatImm((int64_t)Op.getImm());
+ markup(O, Markup::Immediate) << formatImm(Op.getImm());
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
O << "offset ";
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bbbb1d9..f366094 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8279,8 +8279,8 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
SDValue &Opnd0, SDValue &Opnd1,
- unsigned &NumExtracts,
- bool &IsSubAdd) {
+ unsigned &NumExtracts, bool &IsSubAdd,
+ bool &HasAllowContract) {
using namespace SDPatternMatch;
MVT VT = BV->getSimpleValueType(0);
@@ -8292,6 +8292,7 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
SDValue InVec1 = DAG.getUNDEF(VT);
NumExtracts = 0;
+ HasAllowContract = NumElts != 0;
// Odd-numbered elements in the input build vector are obtained from
// adding/subtracting two integer/float elements.
@@ -8350,6 +8351,7 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
// Increment the number of extractions done.
++NumExtracts;
+ HasAllowContract &= Op->getFlags().hasAllowContract();
}
// Ensure we have found an opcode for both parities and that they are
@@ -8393,9 +8395,10 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
- SelectionDAG &DAG,
- SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
- unsigned ExpectedUses) {
+ SelectionDAG &DAG, SDValue &Opnd0,
+ SDValue &Opnd1, SDValue &Opnd2,
+ unsigned ExpectedUses,
+ bool AllowSubAddOrAddSubContract) {
if (Opnd0.getOpcode() != ISD::FMUL ||
!Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
return false;
@@ -8406,7 +8409,8 @@ static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
// or MUL + ADDSUB to FMADDSUB.
const TargetOptions &Options = DAG.getTarget().Options;
bool AllowFusion =
- (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+ Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
if (!AllowFusion)
return false;
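The AllowFusion rewrite narrows a global switch into a per-pattern one: fusing MUL plus ADDSUB into FMADDSUB no longer requires globally fast FP options; it is also permitted when every node in the matched pattern carries the contract fast-math flag (as produced, for instance, by per-TU -ffp-contract settings rather than whole-program unsafe math). A sketch of the per-node query involved, assuming SDValue nodes taken from the matched pattern:

#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Per-node gate: the FMUL itself must allow contraction, and so must every
// FADD/FSUB feeding the add/sub pattern (threaded in as HasAllowContract).
static bool contractionAllowed(SDValue Mul, bool HasAllowContract) {
  return HasAllowContract && Mul->getFlags().hasAllowContract();
}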
@@ -8427,15 +8431,17 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
SDValue Opnd0, Opnd1;
unsigned NumExtracts;
bool IsSubAdd;
- if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
- IsSubAdd))
+ bool HasAllowContract;
+ if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
+ HasAllowContract))
return SDValue();
MVT VT = BV->getSimpleValueType(0);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
- if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
+ HasAllowContract)) {
unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
}
@@ -9132,11 +9138,17 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue SrcVec, IndicesVec;
+
+ auto PeekThroughFreeze = [](SDValue N) {
+ if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
+ return N->getOperand(0);
+ return N;
+ };
// Check for a match of the permute source vector and permute index elements.
// This is done by checking that the i-th build_vector operand is of the form:
// (extract_elt SrcVec, (extract_elt IndicesVec, i)).
for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
- SDValue Op = V.getOperand(Idx);
+ SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
@@ -23486,7 +23498,6 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC,
}
// Try to shrink i64 compares if the input has enough zero bits.
- // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
@@ -23496,6 +23507,16 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC,
Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
}
+ // Try to shrink all i64 compares if the inputs are representable as signed
+ // i32.
+ if (CmpVT == MVT::i64 &&
+ Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
+ DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
+ CmpVT = MVT::i32;
+ Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
+ Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
+ }
+
// 0-x == y --> x+y == 0
// 0-x != y --> x+y != 0
if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
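The new sign-bits shrink complements the existing zero-bits one. If both i64 operands have more than 32 sign bits, they are sign extensions of i32 values, and sign extension preserves every x86 condition, signed and unsigned alike, which is why no isX86CCSigned exclusion is needed here. A self-contained spot check of that invariant (plain C++, not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  // A and B each fit in a signed 32-bit integer, i.e. their 64-bit forms
  // have more than 32 sign bits.
  int64_t A = -5, B = 7;
  assert((A < B) == (int32_t(A) < int32_t(B)));   // signed order preserved
  assert((uint64_t(A) < uint64_t(B)) ==
         (uint32_t(A) < uint32_t(B)));            // unsigned order preserved
  assert((A == B) == (int32_t(A) == int32_t(B))); // equality preserved
  return 0;
}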
@@ -43165,7 +43186,7 @@ static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
/// the fact that they're unused.
static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
- bool &IsSubAdd) {
+ bool &IsSubAdd, bool &HasAllowContract) {
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -43216,6 +43237,8 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
// It's a subadd if the vector in the even parity is an FADD.
IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
: V2->getOpcode() == ISD::FADD;
+ HasAllowContract =
+ V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
Opnd0 = LHS;
Opnd1 = RHS;
@@ -43273,14 +43296,17 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
SDValue Opnd0, Opnd1;
bool IsSubAdd;
- if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
+ bool HasAllowContract;
+ if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
+ HasAllowContract))
return SDValue();
MVT VT = N->getSimpleValueType(0);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
- if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
+ HasAllowContract)) {
unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
}
@@ -54220,7 +54246,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
}
// Try to form a MULHU or MULHS node by looking for
-// (trunc (srl (mul ext, ext), 16))
+// (trunc (srl (mul ext, ext), >= 16))
// TODO: This is X86 specific because we want to be able to handle wide types
// before type legalization. But we can only do it if the vector will be
// legalized via widening/splitting. Type legalization can't handle promotion
@@ -54245,10 +54271,16 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
// First instruction should be a right shift by 16 of a multiply.
SDValue LHS, RHS;
+ APInt ShiftAmt;
if (!sd_match(Src,
- m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_SpecificInt(16))))
+ m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
return SDValue();
+ if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
+ return SDValue();
+
+ uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
+
// Count leading sign/zero bits on both inputs - if there are enough then
// truncation back to vXi16 will be cheap - either as a pack/shuffle
// sequence or using AVX512 truncations. If the inputs are sext/zext then the
@@ -54286,7 +54318,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
InVT.getSizeInBits() / 16);
SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
DAG.getBitcast(BCVT, RHS));
- return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
+ Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
+ return DAG.getNode(ISD::SRL, DL, VT, Res,
+ DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
}
// Truncate back to source type.
@@ -54294,7 +54328,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
- return DAG.getNode(Opc, DL, VT, LHS, RHS);
+ SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
+ return DAG.getNode(ISD::SRL, DL, VT, Res,
+ DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
}
// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
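The combinePMULH generalization accepts any shift amount in [16, source bit width) instead of exactly 16: the first 16 bits of shifting are folded into MULHU/MULHS, and the remainder (AdditionalShift) becomes a cheap narrow SRL on the result. A quick arithmetic check of the equivalence (plain C++, not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  // (trunc i16 (srl (mul (zext X), (zext Y)), 20)) over an i32 intermediate
  // equals (srl (mulhu X, Y), 4): 16 bits come from the high-half multiply,
  // the remaining 20 - 16 = 4 from the extra narrow shift.
  uint16_t X = 0xBEEF, Y = 0x1234;
  uint16_t Old = uint16_t((uint32_t(X) * uint32_t(Y)) >> 20);
  uint16_t Mulhu = uint16_t((uint32_t(X) * uint32_t(Y)) >> 16); // PMULHUW
  assert(Old == uint16_t(Mulhu >> 4));
  return 0;
}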
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 5862c7e..7c594d0 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2781,6 +2781,38 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
return Bytes == MFI.getObjectSize(FI);
}
+static bool
+mayBeSRetTailCallCompatible(const TargetLowering::CallLoweringInfo &CLI,
+ Register CallerSRetReg) {
+ const auto &Outs = CLI.Outs;
+ const auto &OutVals = CLI.OutVals;
+
+ // We know the caller has an sret pointer argument (CallerSRetReg). Locate
+ // the operand index within the callee that may have an sret pointer too.
+ unsigned Pos = 0;
+ for (unsigned E = Outs.size(); Pos != E; ++Pos)
+ if (Outs[Pos].Flags.isSRet())
+ break;
+ // Bail out if the callee does not have an sret argument.
+ if (Pos == Outs.size())
+ return false;
+
+ // At this point, either the caller is forwarding its sret argument to the
+ // callee, or the callee is being passed a different sret pointer. We now look
+ // for a CopyToReg, where the callee sret argument is written into a new vreg
+ // (which should later be %rax/%eax, if this is returned).
+ SDValue SRetArgVal = OutVals[Pos];
+ for (SDNode *User : SRetArgVal->users()) {
+ if (User->getOpcode() != ISD::CopyToReg)
+ continue;
+ Register Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (Reg == CallerSRetReg && User->getOperand(2) == SRetArgVal)
+ return true;
+ }
+
+ return false;
+}
+
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
/// Note that the x86 backend does not check musttail calls for eligibility! The
@@ -2802,6 +2834,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// If -tailcallopt is specified, make fastcc functions tail-callable.
MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Function &CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
@@ -2838,14 +2871,15 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
if (RegInfo->hasStackRealignment(MF))
return false;
- // Also avoid sibcall optimization if we're an sret return fn and the callee
- // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
- // insufficient.
- if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
+ // Avoid sibcall optimization if we are an sret return function and the callee
+ // is incompatible, unless we can prove the callee returns our own sret
+ // pointer. See comment in LowerReturn about why hasStructRetAttr is
+ // insufficient.
+ if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
// For a compatible tail call the callee must return our sret pointer. So it
// needs to be (a) an sret function itself and (b) we pass our sret as its
// sret. Condition #b is harder to determine.
- return false;
+ if (!mayBeSRetTailCallCompatible(CLI, SRetReg))
+ return false;
} else if (IsCalleePopSRet)
// The callee pops an sret, so we cannot tail-call, as our caller doesn't
// expect that.
@@ -2967,8 +3001,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt);
- if (unsigned BytesToPop =
- MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
+ if (unsigned BytesToPop = FuncInfo->getBytesToPopOnReturn()) {
// If we have bytes to pop, the callee must pop them.
bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
if (!CalleePopMatches)
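At source level, the shape the new mayBeSRetTailCallCompatible check is meant to admit is a caller that forwards its own hidden sret pointer straight to the callee. A hedged C++ illustration (whether a given build actually sibcalls it still depends on the remaining checks):

// Both functions return Big via a hidden sret pointer on x86. In
// 'return Callee();' the caller passes its own incoming sret slot to the
// callee, so the callee writes the result exactly where the caller's ABI
// contract requires, and the caller's frame can be reused for a tail call.
struct Big { long A[8]; };

Big Callee();

Big Caller() {
  return Callee(); // caller's sret pointer is forwarded to the callee
}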
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp
index 9167794..08936ad 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp
@@ -37,8 +37,7 @@ public:
std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &,
uint64_t &) override;
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) override;
+ uint8_t *Data, uint64_t Value, bool IsResolved) override;
bool writeNopData(raw_ostream &OS, uint64_t Count,
const MCSubtargetInfo *STI) const override;
@@ -153,9 +152,8 @@ std::optional<bool> XtensaAsmBackend::evaluateFixup(const MCFragment &F,
}
void XtensaAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
- const MCValue &Target,
- MutableArrayRef<char> Data, uint64_t Value,
- bool IsResolved) {
+ const MCValue &Target, uint8_t *Data,
+ uint64_t Value, bool IsResolved) {
maybeAddReloc(F, Fixup, Target, Value, IsResolved);
MCContext &Ctx = getContext();
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
@@ -168,11 +166,10 @@ void XtensaAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
if (!Value)
return; // Doesn't change encoding.
- unsigned Offset = Fixup.getOffset();
unsigned FullSize = getSize(Fixup.getKind());
for (unsigned i = 0; i != FullSize; ++i) {
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ Data[i] |= uint8_t((Value >> (i * 8)) & 0xff);
}
}
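Xtensa's loop differs from the X86 one above in a small but important way: it ORs the fixup bytes into the buffer instead of overwriting them, because the fixup value lands in a bitfield inside an already-encoded instruction. A standalone sketch:

#include <cstdint>

// Merge the low 'FullSize' bytes of Value into Data, least significant byte
// first, preserving the instruction bits already encoded around the field.
static void orPatchLE(uint8_t *Data, uint64_t Value, unsigned FullSize) {
  for (unsigned I = 0; I != FullSize; ++I)
    Data[I] |= uint8_t((Value >> (I * 8)) & 0xff);
}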