author    Vitaly Buka <vitalybuka@google.com>  2024-04-04 17:49:07 -0700
committer Vitaly Buka <vitalybuka@google.com>  2024-04-04 17:49:07 -0700
commit    a724510541fc3272c9d4415c89b4549d8d149675 (patch)
tree      5090317c71cf2ae73fb91a32f8dd6f8e037e4603 /llvm/lib/Target
parent    2fe88fc8b7a3c27d473b6a172f0dc8aae7be3310 (diff)
parent    b76eb1ddfbacda273b8e6a9940f1da6812fdc2e0 (diff)
[spr] changes introduced through rebase (users/vitalybuka/spr/main.rename-remove-traps-to-lower-builtin-hot)
Created using spr 1.3.4 [skip ci]
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64.td | 53
-rw-r--r--  llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 12
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 29
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | 6
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 13
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp | 33
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/DSDIRInstructions.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 45
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SOPInstructions.td | 78
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPCInstructions.td | 50
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 1
-rw-r--r--  llvm/lib/Target/Mips/Mips32r6InstrInfo.td | 8
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 184
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h | 2
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp | 61
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 30
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrGISel.td | 25
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td | 8
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td | 35
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.h | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 13
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 16
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 8
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h | 6
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 19
-rw-r--r--  llvm/lib/Target/Sparc/SparcAsmPrinter.cpp | 44
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86InstrCompiler.td | 5
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.cpp | 32
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td | 116
-rw-r--r--  llvm/lib/Target/X86/X86SchedBroadwell.td | 12
-rw-r--r--  llvm/lib/Target/X86/X86SchedHaswell.td | 15
-rw-r--r--  llvm/lib/Target/X86/X86SchedSapphireRapids.td | 14
-rw-r--r--  llvm/lib/Target/X86/X86SchedSkylakeClient.td | 6
-rw-r--r--  llvm/lib/Target/X86/X86SchedSkylakeServer.td | 6
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleZnver3.td | 86
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.cpp | 7
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 4
49 files changed, 741 insertions, 413 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 6425aa9..3af427d 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -391,9 +391,18 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
"equivalent when the immediate does "
"not fit in the encoding.">;
-def FeatureAddrLSLFast : SubtargetFeature<
- "addr-lsl-fast", "HasAddrLSLFast", "true",
- "Address operands with logical shift of up to 3 places are cheap">;
+// Address operands with shift amount 2 or 3 are fast on all Arm chips except
+// some old Apple cores (A7-A10?) which handle all shifts slowly. Cortex-A57
+// and derived designs through Cortex-X1 take an extra micro-op for shifts
+// of 1 or 4. Other Arm chips handle all shifted operands at the same speed
+// as unshifted operands.
+//
+// We don't try to model the behavior of the old Apple cores because new code
+// targeting A7 is very unlikely to actually run on an A7. The Cortex cores
+// are modeled by FeatureAddrLSLSlow14.
+def FeatureAddrLSLSlow14 : SubtargetFeature<
+ "addr-lsl-slow-14", "HasAddrLSLSlow14", "true",
+ "Address operands with shift amount of 1 or 4 are slow">;
def FeatureALULSLFast : SubtargetFeature<
"alu-lsl-fast", "HasALULSLFast", "true",
@@ -885,6 +894,7 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeatureBalanceFPOps,
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
+ FeatureAddrLSLSlow14,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -903,6 +913,7 @@ def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
+ FeatureAddrLSLSlow14,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -910,6 +921,7 @@ def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
"Cortex-A73 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
+ FeatureAddrLSLSlow14,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -917,6 +929,7 @@ def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
"Cortex-A75 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
+ FeatureAddrLSLSlow14,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -924,7 +937,7 @@ def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
"Cortex-A76 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -934,7 +947,7 @@ def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -944,7 +957,7 @@ def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -956,7 +969,7 @@ def TuneA78AE : SubtargetFeature<"a78ae", "ARMProcFamily",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -968,7 +981,7 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -979,7 +992,6 @@ def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -990,7 +1002,6 @@ def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureCmpBccFusion,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureEnableSelectOptimize,
@@ -1001,7 +1012,6 @@ def TuneA720 : SubtargetFeature<"a720", "ARMProcFamily", "CortexA720",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureCmpBccFusion,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureEnableSelectOptimize,
@@ -1012,7 +1022,6 @@ def TuneA720AE : SubtargetFeature<"a720ae", "ARMProcFamily", "CortexA720",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureCmpBccFusion,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureEnableSelectOptimize,
@@ -1028,7 +1037,7 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1039,7 +1048,6 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1047,7 +1055,6 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
"Cortex-X3 ARM processors", [
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureFuseAES,
@@ -1057,7 +1064,6 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
"Cortex-X4 ARM processors", [
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureFuseAES,
@@ -1215,7 +1221,6 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeatureStorePairSuppress,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive]>;
@@ -1234,7 +1239,6 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeatureStorePairSuppress,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureZCZeroing]>;
@@ -1244,7 +1248,6 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureStorePairSuppress]>;
@@ -1254,7 +1257,6 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
FeatureStorePairSuppress,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureSlowSTRQro]>;
@@ -1268,7 +1270,7 @@ def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1
"Neoverse N1 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1278,7 +1280,6 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2
"Neoverse N2 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1288,7 +1289,6 @@ def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Ne
"Neoverse 512-TVB ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1298,7 +1298,7 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1
"Neoverse V1 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1309,7 +1309,6 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
"Neoverse V2 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1321,7 +1320,6 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
FeatureStorePairSuppress,
- FeatureAddrLSLFast,
FeatureALULSLFast]>;
def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99",
@@ -1381,7 +1379,6 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
FeaturePostRAScheduler,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
@@ -1397,7 +1394,6 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
FeaturePostRAScheduler,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
@@ -1414,7 +1410,6 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
FeaturePostRAScheduler,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
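
Note on the AArch64.td hunks above: the patch inverts the feature's polarity, replacing the opt-in "fast" flag with an opt-out "slow" flag, so cores with no flag at all now default to cheap address shifts. A minimal sketch of the resulting cost rule (an illustrative helper, not LLVM API):

    bool isAddrShiftCheap(unsigned ShiftAmt, bool HasAddrLSLSlow14) {
      // Shifts of 2 or 3 are cheap on every modeled core.
      if (ShiftAmt == 2 || ShiftAmt == 3)
        return true;
      // Shifts of 1 or 4 cost an extra micro-op only on tagged cores
      // (Cortex-A57 through Cortex-X1, per the comment in the hunk).
      return !HasAddrLSLSlow14;
    }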
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 4fa719a..f6ccd0e 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -268,13 +268,19 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) {
if (Sign->getZExtValue())
Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
- if (Flags == 0)
- return;
+ uint64_t PAuthABIPlatform = -1;
+ if (const auto *PAP = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("aarch64-elf-pauthabi-platform")))
+ PAuthABIPlatform = PAP->getZExtValue();
+ uint64_t PAuthABIVersion = -1;
+ if (const auto *PAV = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("aarch64-elf-pauthabi-version")))
+ PAuthABIVersion = PAV->getZExtValue();
// Emit a .note.gnu.property section with the flags.
auto *TS =
static_cast<AArch64TargetStreamer *>(OutStreamer->getTargetStreamer());
- TS->emitNoteSection(Flags);
+ TS->emitNoteSection(Flags, PAuthABIPlatform, PAuthABIVersion);
}
void AArch64AsmPrinter::emitFunctionHeaderComment() {
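
The AsmPrinter change above keys the new note payload off two module flags. A hedged sketch of producing a module that would trigger it (the flag names come from the patch; the platform and version values here are purely illustrative):

    #include "llvm/IR/Module.h"

    void tagPAuthABI(llvm::Module &M) {
      // Module::addModuleFlag(Behavior, Key, uint32_t) stores the values as
      // ConstantInt metadata, which mdconst::extract_or_null reads back.
      M.addModuleFlag(llvm::Module::Error, "aarch64-elf-pauthabi-platform", 2);
      M.addModuleFlag(llvm::Module::Error, "aarch64-elf-pauthabi-version", 5);
    }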
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 163ed52..51bec36 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -462,7 +462,7 @@ private:
SDValue &Offset, SDValue &SignExtend,
SDValue &DoShift);
bool isWorthFoldingALU(SDValue V, bool LSL = false) const;
- bool isWorthFoldingAddr(SDValue V) const;
+ bool isWorthFoldingAddr(SDValue V, unsigned Size) const;
bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
SDValue &Offset, SDValue &SignExtend);
@@ -674,17 +674,22 @@ static bool isWorthFoldingSHL(SDValue V) {
/// Determine whether it is worth to fold V into an extended register addressing
/// mode.
-bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V) const {
+bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V, unsigned Size) const {
// Trivial if we are optimizing for code size or if there is only
// one use of the value.
if (CurDAG->shouldOptForSize() || V.hasOneUse())
return true;
- // If a subtarget has a fastpath LSL we can fold a logical shift into
- // the addressing mode and save a cycle.
- if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::SHL &&
- isWorthFoldingSHL(V))
+
+ // If a subtarget has a slow shift, folding a shift into multiple loads
+ // costs additional micro-ops.
+ if (Subtarget->hasAddrLSLSlow14() && (Size == 2 || Size == 16))
+ return false;
+
+ // Check whether we're going to emit the address arithmetic anyway because
+ // it's used by a non-address operation.
+ if (V.getOpcode() == ISD::SHL && isWorthFoldingSHL(V))
return true;
- if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::ADD) {
+ if (V.getOpcode() == ISD::ADD) {
const SDValue LHS = V.getOperand(0);
const SDValue RHS = V.getOperand(1);
if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
@@ -1203,7 +1208,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
return false;
- return isWorthFoldingAddr(N);
+ return isWorthFoldingAddr(N, Size);
}
bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
@@ -1231,7 +1236,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
}
// Remember if it is worth folding N when it produces extended register.
- bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
+ bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N, Size);
// Try to match a shifted extend on the RHS.
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
@@ -1261,7 +1266,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
MVT::i32);
- if (isWorthFoldingAddr(LHS))
+ if (isWorthFoldingAddr(LHS, Size))
return true;
}
@@ -1273,7 +1278,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
MVT::i32);
- if (isWorthFoldingAddr(RHS))
+ if (isWorthFoldingAddr(RHS, Size))
return true;
}
@@ -1343,7 +1348,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
}
// Remember if it is worth folding N when it produces extended register.
- bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
+ bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N, Size);
// Try to match a shifted extend on the RHS.
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
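
The Size values singled out above map directly to shift amounts: a scaled-index addressing mode shifts the index by log2 of the access size, so 2-byte accesses fold an LSL #1 and 16-byte accesses fold an LSL #4, exactly the two amounts addr-lsl-slow-14 marks as slow. A small illustration of that mapping:

    // Illustrative only: the shift folded into a scaled-index address is
    // log2 of the access size in bytes.
    unsigned foldedShiftFor(unsigned AccessSizeBytes) {
      unsigned Shift = 0;
      while ((1u << Shift) < AccessSizeBytes)
        ++Shift;
      return Shift; // 2 -> 1, 4 -> 2, 8 -> 3, 16 -> 4
    }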
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index d0c5e6b..22687b0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2993,7 +2993,7 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
return false;
Shift = AArch64_AM::getShiftValue(Shift);
if (!OptSize) {
- if ((Shift != 2 && Shift != 3) || !Subtarget.hasAddrLSLFast())
+ if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
return false;
if (avoidSlowSTRQ(MemI))
return false;
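
The rewritten condition flips with the feature polarity: the old code bailed out unless the core was explicitly fast and the shift was 2 or 3, while the new code bails out only when a slow-14 core would fold a shift other than 2 or 3. Side by side (illustrative predicates, not the LLVM API):

    bool rejectOld(unsigned Shift, bool Fast)   { return (Shift != 2 && Shift != 3) || !Fast; }
    bool rejectNew(unsigned Shift, bool Slow14) { return Shift != 2 && Shift != 3 && Slow14; }
    // On a core with no flag set, rejectOld refused every fold;
    // rejectNew accepts them all.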
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index a8f2c45..d4daf17 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -6907,10 +6907,8 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
MI.getParent()->getParent()->getFunction().hasOptSize())
return true;
- // It's better to avoid folding and recomputing shifts when we don't have a
- // fastpath.
- if (!STI.hasAddrLSLFast())
- return false;
+ // FIXME: Consider checking HasAddrLSLSlow14 and HasALULSLFast as
+ // appropriate.
// We have a fastpath, so folding a shift in and potentially computing it
// many times may be beneficial. Check if this is only used in memory ops.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 33dba6a5..043f142 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1141,9 +1141,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.scalarize(1)
.lower();
- getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
- .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });
-
getActionDefinitionsBuilder({G_FSHL, G_FSHR})
.customFor({{s32, s32}, {s32, s64}, {s64, s64}})
.lower();
@@ -1191,8 +1188,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.minScalarEltSameAsIf(always, 1, 0)
.maxScalarEltSameAsIf(always, 1, 0);
- // TODO: Vector types.
- getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));
+ getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
+ .legalFor({v2s64, v2s32, v4s32, v4s16, v8s16, v8s8, v16s8})
+ .clampNumElements(0, v8s8, v16s8)
+ .clampNumElements(0, v4s16, v8s16)
+ .clampNumElements(0, v2s32, v4s32)
+ .clampMaxNumElements(0, s64, 2)
+ .moreElementsToNextPow2(0)
+ .lower();
// TODO: Libcall support for s128.
// TODO: s16 should be legal with full FP16 support.
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index e1d6dd7..dc5383c 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -58,8 +58,17 @@ void AArch64TargetStreamer::finish() {
emitNoteSection(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI);
}
-void AArch64TargetStreamer::emitNoteSection(unsigned Flags) {
- if (Flags == 0)
+void AArch64TargetStreamer::emitNoteSection(unsigned Flags,
+ uint64_t PAuthABIPlatform,
+ uint64_t PAuthABIVersion) {
+ assert((PAuthABIPlatform == uint64_t(-1)) ==
+ (PAuthABIVersion == uint64_t(-1)));
+ uint64_t DescSz = 0;
+ if (Flags != 0)
+ DescSz += 4 * 4;
+ if (PAuthABIPlatform != uint64_t(-1))
+ DescSz += 4 + 4 + 8 * 2;
+ if (DescSz == 0)
return;
MCStreamer &OutStreamer = getStreamer();
@@ -80,15 +89,25 @@ void AArch64TargetStreamer::emitNoteSection(unsigned Flags) {
// Emit the note header.
OutStreamer.emitValueToAlignment(Align(8));
OutStreamer.emitIntValue(4, 4); // data size for "GNU\0"
- OutStreamer.emitIntValue(4 * 4, 4); // Elf_Prop size
+ OutStreamer.emitIntValue(DescSz, 4); // Elf_Prop array size
OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4);
OutStreamer.emitBytes(StringRef("GNU", 4)); // note name
// Emit the PAC/BTI properties.
- OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4);
- OutStreamer.emitIntValue(4, 4); // data size
- OutStreamer.emitIntValue(Flags, 4); // data
- OutStreamer.emitIntValue(0, 4); // pad
+ if (Flags != 0) {
+ OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4);
+ OutStreamer.emitIntValue(4, 4); // data size
+ OutStreamer.emitIntValue(Flags, 4); // data
+ OutStreamer.emitIntValue(0, 4); // pad
+ }
+
+ // Emit the PAuth ABI compatibility info
+ if (PAuthABIPlatform != uint64_t(-1)) {
+ OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_PAUTH, 4);
+ OutStreamer.emitIntValue(8 * 2, 4); // data size
+ OutStreamer.emitIntValue(PAuthABIPlatform, 8);
+ OutStreamer.emitIntValue(PAuthABIVersion, 8);
+ }
OutStreamer.endSection(Nt);
OutStreamer.switchSection(Cur);
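
For reference, the descriptor size computed above works out as follows when both properties are present (a sketch of the same arithmetic; GNU property entries are padded to 8 bytes in ELF64, hence the pad word):

    unsigned noteDescSize(bool HasFeatureFlags, bool HasPAuthInfo) {
      unsigned Sz = 0;
      if (HasFeatureFlags)
        Sz += 4 * 4;         // type + size + 4-byte flags + 4-byte pad = 16
      if (HasPAuthInfo)
        Sz += 4 + 4 + 8 * 2; // type + size + 8-byte platform + 8-byte version = 24
      return Sz;             // both present: 40
    }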
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index 7676d88..e8a9dc4 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -35,7 +35,8 @@ public:
void emitCurrentConstantPool();
/// Callback used to implement the .note.gnu.property section.
- void emitNoteSection(unsigned Flags);
+ void emitNoteSection(unsigned Flags, uint64_t PAuthABIPlatform = -1,
+ uint64_t PAuthABIVersion = -1);
/// Callback used to implement the .inst directive.
virtual void emitInst(uint32_t Inst);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 9083150..1114a8c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1086,7 +1086,7 @@ void SplitPtrStructs::processConditionals() {
if (MaybeRsrc)
for (Value *V : Seen)
FoundRsrcs[cast<Instruction>(V)] = NewRsrc;
- } else if (auto *SI = dyn_cast<SelectInst>(I)) {
+ } else if (isa<SelectInst>(I)) {
if (MaybeRsrc) {
ConditionalTemps.push_back(cast<Instruction>(Rsrc));
Rsrc->replaceAllUsesWith(*MaybeRsrc);
@@ -1777,8 +1777,8 @@ void SplitPtrStructs::processFunction(Function &F) {
Originals.push_back(&I);
for (Instruction *I : Originals) {
auto [Rsrc, Off] = visit(I);
- assert((Rsrc && Off) ||
- (!Rsrc && !Off) && "Can't have a resource but no offset");
+ assert(((Rsrc && Off) || (!Rsrc && !Off)) &&
+ "Can't have a resource but no offset");
if (Rsrc)
RsrcParts[I] = Rsrc;
if (Off)
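
The assert fix above is purely about grouping: && binds tighter than ||, so the message string previously attached to only the second clause. Because a non-empty string literal is always true, the predicate's value was unchanged, but the grouping was misleading and draws parentheses warnings from both GCC and Clang. A compilable illustration:

    #include <cassert>

    void checkRsrcOff(bool Rsrc, bool Off) {
      // Pre-fix parse:  (Rsrc && Off) || ((!Rsrc && !Off) && "msg")
      // Post-fix parse: ((Rsrc && Off) || (!Rsrc && !Off)) && "msg"
      assert(((Rsrc && Off) || (!Rsrc && !Off)) &&
             "Can't have a resource but no offset");
    }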
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 294fc68..3866723 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4627,10 +4627,15 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
if (Src1Idx >= 0) {
const MCOperand &Src1 = Inst.getOperand(Src1Idx);
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- if (Src1.isImm() ||
- (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI))) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[Src1Idx]);
- Error(Op.getStartLoc(), "invalid operand for instruction");
+ if (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI)) {
+ auto Reg = mc2PseudoReg(Inst.getOperand(Src1Idx).getReg());
+ SMLoc S = getRegLoc(Reg, Operands);
+ Error(S, "invalid operand for instruction");
+ return false;
+ }
+ if (Src1.isImm()) {
+ Error(getInstLoc(Operands),
+ "src1 immediate operand invalid for instruction");
return false;
}
}
diff --git a/llvm/lib/Target/AMDGPU/DSDIRInstructions.td b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td
index f4f02d2..0541f0f 100644
--- a/llvm/lib/Target/AMDGPU/DSDIRInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td
@@ -112,7 +112,7 @@ class DSDIR_Real<DSDIR_Pseudo lds, dag ins, string asm, int subtarget> :
lds.Mnemonic # asm,
ins,
lds.is_direct>,
- SIMCInstr <lds.Mnemonic, subtarget> {
+ SIMCInstr <lds.PseudoInstr, subtarget> {
let isPseudo = 0;
let isCodeGenOnly = 0;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index e944dde..0773ef7 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1192,7 +1192,7 @@ def : GCNPat <
class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
string opName = ps.Mnemonic,
bit hasGDS = true>
- : DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> {
+ : DS_Real<ps, opName>, SIMCInstr <ps.PseudoInstr, ef> {
let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
let Inst{15-8} = !if(ps.has_offset1, offset1, 0);
@@ -1557,7 +1557,7 @@ defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3>;
class DS_Real_vi <bits<8> op, DS_Pseudo ps> :
DS_Real <ps>,
- SIMCInstr <ps.Mnemonic, SIEncodingFamily.VI> {
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
let AssemblerPredicate = isGFX8GFX9;
let DecoderNamespace = "GFX8";
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index d017ec4..27d5616 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -2558,7 +2558,7 @@ multiclass VFLAT_Real_Base_gfx12<bits<8> op,
multiclass VFLAT_Real_Atomics_gfx12<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic,
- string alias = ""> :
+ string alias = name> :
VFLAT_Real_Base_gfx12<op, name, alias> {
defm _RTN : VFLAT_Real_gfx12<op, name>;
}
@@ -2581,7 +2581,7 @@ multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op,
multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic,
- string alias = ""> :
+ string alias = name> :
VGLOBAL_Real_AllAddr_gfx12<op, name, alias> {
defm _RTN : VFLAT_Real_gfx12<op, name>;
defm _SADDR_RTN : VFLAT_Real_gfx12<op, name>;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 2762190..bb499c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -708,9 +708,6 @@ public:
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
bool FlushVmCnt);
- bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr);
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
@@ -1902,31 +1899,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
OldWaitcntInstr);
}
-// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
-// end of the given block if needed.
-bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr) {
- AMDGPU::Waitcnt Wait;
-
- unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
- unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
- unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
-
- if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
- return false;
-
- if (LoadCntPending != 0)
- Wait.LoadCnt = 0;
- if (SampleCntPending != 0)
- Wait.SampleCnt = 0;
- if (BvhCntPending != 0)
- Wait.BvhCnt = 0;
-
- return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
- OldWaitcntInstr);
-}
-
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block,
@@ -2355,9 +2327,22 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
++Iter;
}
+ // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
+ // needed.
+ AMDGPU::Waitcnt Wait;
if (Block.getFirstTerminator() == Block.end() &&
- isPreheaderToFlush(Block, ScoreBrackets))
- Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
+ isPreheaderToFlush(Block, ScoreBrackets)) {
+ if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
+ Wait.LoadCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
+ Wait.SampleCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(BVH_CNT))
+ Wait.BvhCnt = 0;
+ }
+
+ // Combine or remove any redundant waitcnts at the end of the block.
+ Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
+ OldWaitcntInstr);
return Modified;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 1694436..f1afbcc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2268,7 +2268,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field Operand Src1ModDPP = getSrcModDPP<Src1VT>.ret;
field Operand Src2ModDPP = getSrcModDPP<Src2VT>.ret;
field Operand Src0ModVOP3DPP = getSrcModDPP<Src0VT>.ret;
- field Operand Src1ModVOP3DPP = getSrcModDPP<Src1VT>.ret;
+ field Operand Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT>.ret;
field Operand Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT>.ret;
field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret;
field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index d34ee34..0b7d45e 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1972,7 +1972,7 @@ class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> {
multiclass SOP1_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOP1_Pseudo>(NAME);
def _gfx11 : SOP1_Real<op, ps, name>,
- Select_gfx11<ps.Mnemonic>;
+ Select_gfx11<ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
}
@@ -1980,14 +1980,14 @@ multiclass SOP1_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
multiclass SOP1_Real_gfx12<bits<8> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOP1_Pseudo>(NAME);
def _gfx12 : SOP1_Real<op, ps, name>,
- Select_gfx12<ps.Mnemonic>;
+ Select_gfx12<ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
}
multiclass SOP1_M0_Real_gfx12<bits<8> op> {
def _gfx12 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
- Select_gfx12<!cast<SOP1_Pseudo>(NAME).Mnemonic> {
+ Select_gfx12<!cast<SOP1_Pseudo>(NAME).PseudoInstr> {
let Inst{7-0} = M0_gfx11plus.HWEncoding{7-0}; // Set Src0 encoding to M0
}
}
@@ -1995,7 +1995,7 @@ multiclass SOP1_M0_Real_gfx12<bits<8> op> {
multiclass SOP1_IMM_Real_gfx12<bits<8> op> {
defvar ps = !cast<SOP1_Pseudo>(NAME);
def _gfx12 : SOP1_Real<op, ps>,
- Select_gfx12<ps.Mnemonic>;
+ Select_gfx12<ps.PseudoInstr>;
}
multiclass SOP1_Real_gfx11_gfx12<bits<8> op, string name = !tolower(NAME)> :
@@ -2106,7 +2106,7 @@ defm S_RNDNE_F16 : SOP1_Real_gfx11_gfx12<0x06e>;
multiclass SOP1_Real_gfx10<bits<8> op> {
defvar ps = !cast<SOP1_Pseudo>(NAME);
def _gfx10 : SOP1_Real<op, ps>,
- Select_gfx10<ps.Mnemonic>;
+ Select_gfx10<ps.PseudoInstr>;
}
multiclass SOP1_Real_gfx10_gfx11_gfx12<bits<8> op> :
@@ -2139,7 +2139,7 @@ defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>;
multiclass SOP1_Real_gfx6_gfx7<bits<8> op> {
defvar ps = !cast<SOP1_Pseudo>(NAME);
def _gfx6_gfx7 : SOP1_Real<op, ps>,
- Select_gfx6_gfx7<ps.Mnemonic>;
+ Select_gfx6_gfx7<ps.PseudoInstr>;
}
multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> :
@@ -2205,7 +2205,7 @@ defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>;
multiclass SOP2_Real_gfx12<bits<7> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOP2_Pseudo>(NAME);
def _gfx12 : SOP2_Real32<op, ps, name>,
- Select_gfx12<ps.Mnemonic>;
+ Select_gfx12<ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
}
@@ -2222,7 +2222,7 @@ defm S_MAXIMUM_F16 : SOP2_Real_gfx12<0x052>;
multiclass SOP2_Real_gfx11<bits<7> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOP2_Pseudo>(NAME);
def _gfx11 : SOP2_Real32<op, ps, name>,
- Select_gfx11<ps.Mnemonic>;
+ Select_gfx11<ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
}
@@ -2283,12 +2283,12 @@ defm S_MUL_U64 : SOP2_Real_gfx12<0x055>;
multiclass SOP2_Real_FMAK_gfx12<bits<7> op> {
def _gfx12 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>,
- Select_gfx12<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+ Select_gfx12<!cast<SOP2_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOP2_Real_FMAK_gfx11<bits<7> op> {
def _gfx11 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>,
- Select_gfx11<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+ Select_gfx11<!cast<SOP2_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOP2_Real_FMAK_gfx11_gfx12<bits<7> op> :
@@ -2325,7 +2325,7 @@ defm S_MAX_F16 : SOP2_Real_gfx11_Renamed_gfx12<0x04c, "s_max_num_f16">;
multiclass SOP2_Real_gfx10<bits<7> op> {
defvar ps = !cast<SOP2_Pseudo>(NAME);
def _gfx10 : SOP2_Real32<op, ps>,
- Select_gfx10<ps.Mnemonic>;
+ Select_gfx10<ps.PseudoInstr>;
}
multiclass SOP2_Real_gfx10_gfx11_gfx12<bits<7> op> :
@@ -2348,7 +2348,7 @@ defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>;
multiclass SOP2_Real_gfx6_gfx7<bits<7> op> {
defvar ps = !cast<SOP2_Pseudo>(NAME);
def _gfx6_gfx7 : SOP2_Real32<op, ps>,
- Select_gfx6_gfx7<ps.Mnemonic>;
+ Select_gfx6_gfx7<ps.PseudoInstr>;
}
multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> :
@@ -2410,24 +2410,24 @@ defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>;
multiclass SOPK_Real32_gfx12<bits<5> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOPK_Pseudo>(NAME);
def _gfx12 : SOPK_Real32<op, ps, name>,
- Select_gfx12<ps.Mnemonic>;
+ Select_gfx12<ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
}
multiclass SOPK_Real32_gfx11<bits<5> op> {
def _gfx11 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ Select_gfx11<!cast<SOPK_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOPK_Real64_gfx12<bits<5> op> {
def _gfx12 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx12<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ Select_gfx12<!cast<SOPK_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOPK_Real64_gfx11<bits<5> op> {
def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ Select_gfx11<!cast<SOPK_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOPK_Real32_gfx11_gfx12<bits<5> op> :
@@ -2454,13 +2454,13 @@ defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>;
multiclass SOPK_Real32_gfx10<bits<5> op> {
defvar ps = !cast<SOPK_Pseudo>(NAME);
def _gfx10 : SOPK_Real32<op, ps>,
- Select_gfx10<ps.Mnemonic>;
+ Select_gfx10<ps.PseudoInstr>;
}
multiclass SOPK_Real64_gfx10<bits<5> op> {
defvar ps = !cast<SOPK_Pseudo>(NAME);
def _gfx10 : SOPK_Real64<op, ps>,
- Select_gfx10<ps.Mnemonic>;
+ Select_gfx10<ps.PseudoInstr>;
}
multiclass SOPK_Real32_gfx10_gfx11<bits<5> op> :
@@ -2485,13 +2485,13 @@ defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>;
multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> {
defvar ps = !cast<SOPK_Pseudo>(NAME);
def _gfx6_gfx7 : SOPK_Real32<op, ps>,
- Select_gfx6_gfx7<ps.Mnemonic>;
+ Select_gfx6_gfx7<ps.PseudoInstr>;
}
multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> {
defvar ps = !cast<SOPK_Pseudo>(NAME);
def _gfx6_gfx7 : SOPK_Real64<op, ps>,
- Select_gfx6_gfx7<ps.Mnemonic>;
+ Select_gfx6_gfx7<ps.PseudoInstr>;
}
multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> :
@@ -2539,7 +2539,7 @@ defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>;
multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _gfx12 : SOPP_Real_32<op, ps, name>,
- Select_gfx12<ps.Mnemonic>;
+ Select_gfx12<ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
}
@@ -2564,7 +2564,7 @@ defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12<0x049>;
multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _gfx11 : SOPP_Real_32<op, ps, name>,
- Select_gfx11<ps.Mnemonic>,
+ Select_gfx11<ps.PseudoInstr>,
SOPPRelaxTable<0, ps.KeyName, "_gfx11">;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
@@ -2572,13 +2572,13 @@ multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> {
multiclass SOPP_Real_64_gfx12<bits<7> op> {
def _gfx12 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>,
- Select_gfx12<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ Select_gfx12<!cast<SOPP_Pseudo>(NAME).PseudoInstr>,
SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">;
}
multiclass SOPP_Real_64_gfx11<bits<7> op> {
def _gfx11 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>,
- Select_gfx11<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ Select_gfx11<!cast<SOPP_Pseudo>(NAME).PseudoInstr>,
SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx11">;
}
@@ -2654,21 +2654,21 @@ defm S_SINGLEUSE_VDST : SOPP_Real_32_gfx11_gfx12<0x013>;
multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _gfx6_gfx7 : SOPP_Real_32<op, ps, !cast<SOPP_Pseudo>(NAME).Mnemonic>,
- Select_gfx6_gfx7<ps.Mnemonic>,
+ Select_gfx6_gfx7<ps.PseudoInstr>,
SOPPRelaxTable<0, ps.KeyName, "_gfx6_gfx7">;
}
multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _vi : SOPP_Real_32<op, ps>,
- Select_vi<ps.Mnemonic>,
+ Select_vi<ps.PseudoInstr>,
SOPPRelaxTable<0, ps.KeyName, "_vi">;
}
multiclass SOPP_Real_32_gfx10<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _gfx10 : SOPP_Real_32<op, ps>,
- Select_gfx10<ps.Mnemonic>,
+ Select_gfx10<ps.PseudoInstr>,
SOPPRelaxTable<0, ps.KeyName, "_gfx10">;
}
@@ -2691,21 +2691,21 @@ multiclass SOPP_Real_32_gfx10_gfx11_gfx12<bits<7> op> :
multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _gfx6_gfx7 : SOPP_Real_64<op, ps>,
- Select_gfx6_gfx7<ps.Mnemonic>,
+ Select_gfx6_gfx7<ps.PseudoInstr>,
SOPPRelaxTable<1, ps.KeyName, "_gfx6_gfx7">;
}
multiclass SOPP_Real_64_gfx8_gfx9<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _vi : SOPP_Real_64<op, ps>,
- Select_vi<ps.Mnemonic>,
+ Select_vi<ps.PseudoInstr>,
SOPPRelaxTable<1, ps.KeyName, "_vi">;
}
multiclass SOPP_Real_64_gfx10<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _gfx10 : SOPP_Real_64<op, ps>,
- Select_gfx10<ps.Mnemonic>,
+ Select_gfx10<ps.PseudoInstr>,
SOPPRelaxTable<1, ps.KeyName, "_gfx10">;
}
@@ -2771,12 +2771,12 @@ defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_
multiclass SOPC_Real_gfx12<bits<7> op> {
def _gfx12 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
- Select_gfx12<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+ Select_gfx12<!cast<SOPC_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOPC_Real_gfx11<bits<7> op> {
def _gfx11 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
- Select_gfx11<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+ Select_gfx11<!cast<SOPC_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOPC_Real_gfx11_gfx12<bits<7> op> :
@@ -2826,19 +2826,19 @@ defm S_CMP_NLT_F16 : SOPC_Real_gfx11_gfx12<0x5e>;
multiclass SOPC_Real_gfx6_gfx7<bits<7> op> {
defvar ps = !cast<SOPC_Pseudo>(NAME);
def _gfx6_gfx7 : SOPC_Real<op, ps>,
- Select_gfx6_gfx7<ps.Mnemonic>;
+ Select_gfx6_gfx7<ps.PseudoInstr>;
}
multiclass SOPC_Real_gfx8_gfx9<bits<7> op> {
defvar ps = !cast<SOPC_Pseudo>(NAME);
def _vi : SOPC_Real<op, ps>,
- Select_vi<ps.Mnemonic>;
+ Select_vi<ps.PseudoInstr>;
}
multiclass SOPC_Real_gfx10<bits<7> op> {
defvar ps = !cast<SOPC_Pseudo>(NAME);
def _gfx10 : SOPC_Real<op, ps>,
- Select_gfx10<ps.Mnemonic>;
+ Select_gfx10<ps.PseudoInstr>;
}
multiclass SOPC_Real_gfx8_gfx9_gfx10<bits<7> op> :
@@ -2878,15 +2878,15 @@ defm S_CMP_LG_U64 : SOPC_Real_gfx8_gfx9_gfx10<0x13>;
class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> :
SOP1_Real<op, ps>,
- Select_vi<ps.Mnemonic>;
+ Select_vi<ps.PseudoInstr>;
class SOP2_Real_vi<bits<7> op, SOP2_Pseudo ps> :
SOP2_Real32<op, ps>,
- Select_vi<ps.Mnemonic>;
+ Select_vi<ps.PseudoInstr>;
class SOPK_Real_vi<bits<5> op, SOPK_Pseudo ps> :
SOPK_Real32<op, ps>,
- Select_vi<ps.Mnemonic>;
+ Select_vi<ps.PseudoInstr>;
def S_MOV_B32_vi : SOP1_Real_vi <0x00, S_MOV_B32>;
def S_MOV_B64_vi : SOP1_Real_vi <0x01, S_MOV_B64>;
@@ -3007,7 +3007,7 @@ def S_GETREG_B32_vi : SOPK_Real_vi <0x11, S_GETREG_B32>;
def S_SETREG_B32_vi : SOPK_Real_vi <0x12, S_SETREG_B32>;
//def S_GETREG_REGRD_B32_vi : SOPK_Real_vi <0x13, S_GETREG_REGRD_B32>; // see pseudo for comments
def S_SETREG_IMM32_B32_vi : SOPK_Real64<0x14, S_SETREG_IMM32_B32>,
- Select_vi<S_SETREG_IMM32_B32.Mnemonic>;
+ Select_vi<S_SETREG_IMM32_B32.PseudoInstr>;
def S_CALL_B64_vi : SOPK_Real_vi <0x15, S_CALL_B64>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5d44396..4e00744 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -182,6 +182,8 @@ unsigned getAMDHSACodeObjectVersion(unsigned ABIVersion) {
return 4;
case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
return 5;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V6:
+ return 6;
default:
return getDefaultAMDHSACodeObjectVersion();
}
@@ -496,9 +498,7 @@ bool isVOPC64DPP(unsigned Opc) {
return isVOPC64DPPOpcodeHelper(Opc) || isVOPC64DPP8OpcodeHelper(Opc);
}
-bool isVOPCAsmOnly(unsigned Opc) {
- return isVOPCAsmOnlyOpcodeHelper(Opc) || isVOP3CAsmOnlyOpcodeHelper(Opc);
-}
+bool isVOPCAsmOnly(unsigned Opc) { return isVOPCAsmOnlyOpcodeHelper(Opc); }
bool getMAIIsDGEMM(unsigned Opc) {
const MAIInstInfo *Info = getMAIInstInfoHelper(Opc);
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index f136a43..c001c5d 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -503,6 +503,7 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> {
dpp8:$dpp8, Dpp8FI:$fi);
let Src2Mod = FP32InputMods; // dummy unused modifiers
let Src2RC64 = VGPRSrc_32; // stub argument
+ let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 1/*IsFake16*/>.ret;
}
def VOP_MAC_F32 : VOP_MAC <f32>;
let HasExtDPP = 0, HasExt32BitDPP = 0 in
@@ -618,7 +619,7 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
let AsmVOP3Base = "$vdst, $src0_modifiers, $src1_modifiers, $src2";
let Outs32 = (outs DstRC:$vdst);
- let Outs64 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC64:$vdst);
// Suppress src2 implied by type since the 32-bit encoding uses an
// implicit VCC use.
@@ -652,7 +653,7 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
dpp8:$dpp8, Dpp8FI:$fi);
let Src0ModVOP3DPP = FPVRegInputMods;
- let Src1ModVOP3DPP = FPVRegInputMods;
+ let Src1ModVOP3DPP = FP32VCSrcInputMods;
let HasExt = 1;
let HasExtDPP = 1;
@@ -662,7 +663,17 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
}
def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>;
-def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>;
+def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
+ let IsTrue16 = 1;
+ let DstRC64 = getVALUDstForVT<DstVT>.ret;
+
+ let Src0Mod = getSrcMod<f16>.ret;
+ let Src1Mod = getSrcMod<f16>.ret;
+
+ let Src0VOP3DPP = VGPRSrc_32;
+ let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
+ let Src1ModVOP3DPP = getSrcModVOP3DPP<f16, 1/*IsFake16*/>.ret;
+}
def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> {
let Outs32 = (outs SReg_32:$vdst);
@@ -703,7 +714,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
//===----------------------------------------------------------------------===//
let SubtargetPredicate = isGFX11Plus in
-defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1>;
+defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1_fake16>;
defm V_CNDMASK_B32 : VOP2eInst_VOPD <"v_cndmask_b32", VOP2e_I32_I32_I32_I1, 0x9, "v_cndmask_b32">;
let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in
def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 022fb7c..0b3a3d5 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -772,7 +772,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType
// DPP8 forbids modifiers and can inherit from VOPC_Profile
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
- dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VRegSrc_32:$src1);
+ dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VCSrc_b32:$src1);
let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel),
(ins)));
let AsmVOP3Base = "$sdst, $src0_modifiers, $src1";
@@ -1377,31 +1377,9 @@ multiclass VOPC_Real_Base<GFXGen Gen, bits<9> op> {
}
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp");
- defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>,
SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
- def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
- let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
- let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
- defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>;
- def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
- let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
- let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
}
} // AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
}
@@ -1472,35 +1450,9 @@ multiclass VOPC_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp");
- defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>,
SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
- def _e64_dpp_w32#Gen.Suffix
- : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
- let AsmString = asm_name # " vcc_lo, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp_w64#Gen.Suffix
- : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
- let AsmString = asm_name # " vcc, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
- defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>;
- def _e64_dpp8_w32#Gen.Suffix
- : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
- let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp8_w64#Gen.Suffix
- : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
- let AsmString = asm_name # " vcc, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
}
} // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
}
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index a6272e9..60e91c7 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1680,7 +1680,6 @@ class AsmOnlyInfoTable <string Format, string Class>: GenericTable {
}
def VOPCAsmOnlyInfoTable : AsmOnlyInfoTable <"VOPC", "VOPC_DPPe_Common">;
-def VOP3CAsmOnlyInfoTable : AsmOnlyInfoTable <"VOP3C", "VOP3_DPPe_Common_Base">;
def VOPTrue16Table : GenericTable {
let FilterClass = "VOP_Pseudo";
diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
index 9c29acb..bef7607 100644
--- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -153,15 +153,15 @@ class SELNEZ_ENC : SPECIAL_3R_FM<0b00000, 0b110111>;
class LWPC_ENC : PCREL19_FM<OPCODE2_LWPC>;
-class MAX_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>;
-class MAX_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>;
+class MAX_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>;
+class MAX_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>;
class MIN_S_ENC : COP1_3R_FM<0b011100, FIELD_FMT_S>;
class MIN_D_ENC : COP1_3R_FM<0b011100, FIELD_FMT_D>;
class MAXA_S_ENC : COP1_3R_FM<0b011111, FIELD_FMT_S>;
class MAXA_D_ENC : COP1_3R_FM<0b011111, FIELD_FMT_D>;
-class MINA_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>;
-class MINA_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>;
+class MINA_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>;
+class MINA_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>;
class SELEQZ_S_ENC : COP1_3R_FM<0b010100, FIELD_FMT_S>;
class SELEQZ_D_ENC : COP1_3R_FM<0b010100, FIELD_FMT_D>;
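
The swap above restores the ascending funct-field order of the MIPS32r6 FP min/max family; MAX and MINA previously had each other's encodings. Collected from the hunk (values exactly as in the patch):

    enum Mips32r6FpMinMaxFunct : unsigned {
      MIN_FMT  = 0b011100,
      MINA_FMT = 0b011101, // previously mis-assigned to MAX
      MAX_FMT  = 0b011110, // previously mis-assigned to MINA
      MAXA_FMT = 0b011111,
    };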
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 7f35107..38c1f9868 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -139,20 +139,21 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
.clampScalar(0, s32, sXLen)
.minScalarSameAs(1, 0);
+ auto &ExtActions =
+ getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
+ .legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+ typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)));
if (ST.is64Bit()) {
- getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
- .legalFor({{sXLen, s32}})
- .maxScalar(0, sXLen);
-
+ ExtActions.legalFor({{sXLen, s32}});
getActionDefinitionsBuilder(G_SEXT_INREG)
.customFor({sXLen})
.maxScalar(0, sXLen)
.lower();
} else {
- getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}).maxScalar(0, sXLen);
-
getActionDefinitionsBuilder(G_SEXT_INREG).maxScalar(0, sXLen).lower();
}
+ ExtActions.customIf(typeIsLegalBoolVec(1, BoolVecTys, ST))
+ .maxScalar(0, sXLen);
// Merge/Unmerge
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
@@ -235,7 +236,9 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
getActionDefinitionsBuilder(G_ICMP)
.legalFor({{sXLen, sXLen}, {sXLen, p0}})
- .widenScalarToNextPow2(1)
+ .legalIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST),
+ typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)))
+ .widenScalarOrEltToNextPow2OrMinSize(1, 8)
.clampScalar(1, sXLen, sXLen)
.clampScalar(0, sXLen, sXLen);
@@ -418,6 +421,29 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
.clampScalar(0, sXLen, sXLen)
.customFor({sXLen});
+ auto &SplatActions =
+ getActionDefinitionsBuilder(G_SPLAT_VECTOR)
+ .legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+ typeIs(1, sXLen)))
+ .customIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST), typeIs(1, s1)));
+ // Handle case of s64 element vectors on RV32. If the subtarget does not have
+ // f64, then try to lower it to G_SPLAT_VECTOR_SPLIT_64_VL. If the subtarget
+ // does have f64, then we don't know whether the type is an f64 or an i64,
+ // so mark the G_SPLAT_VECTOR as legal and decide later what to do with it,
+ // depending on how the instructions it consumes are legalized. They are not
+ // legalized yet since legalization is in reverse postorder, so we cannot
+ // make the decision at this moment.
+ if (XLen == 32) {
+ if (ST.hasVInstructionsF64() && ST.hasStdExtD())
+ SplatActions.legalIf(all(
+ typeInSet(0, {nxv1s64, nxv2s64, nxv4s64, nxv8s64}), typeIs(1, s64)));
+ else if (ST.hasVInstructionsI64())
+ SplatActions.customIf(all(
+ typeInSet(0, {nxv1s64, nxv2s64, nxv4s64, nxv8s64}), typeIs(1, s64)));
+ }
+
+ SplatActions.clampScalar(1, sXLen, sXLen);
+
getLegacyLegalizerInfo().computeTables();
}
@@ -576,7 +602,145 @@ bool RISCVLegalizerInfo::legalizeVScale(MachineInstr &MI,
auto VScale = MIB.buildLShr(XLenTy, VLENB, MIB.buildConstant(XLenTy, 3));
MIB.buildMul(Dst, VScale, MIB.buildConstant(XLenTy, Val));
}
+ MI.eraseFromParent();
+ return true;
+}
+
+// Custom-lower extensions from mask vectors by using a vselect either with 1
+// for zero/any-extension or -1 for sign-extension:
+// (vXiN = (s|z)ext vXi1:vmask) -> (vXiN = vselect vmask, (-1 or 1), 0)
+// Note that any-extension is lowered identically to zero-extension.
+bool RISCVLegalizerInfo::legalizeExt(MachineInstr &MI,
+ MachineIRBuilder &MIB) const {
+
+ unsigned Opc = MI.getOpcode();
+ assert(Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_SEXT ||
+ Opc == TargetOpcode::G_ANYEXT);
+
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ LLT DstTy = MRI.getType(Dst);
+ int64_t ExtTrueVal = Opc == TargetOpcode::G_SEXT ? -1 : 1;
+ LLT DstEltTy = DstTy.getElementType();
+ auto SplatZero = MIB.buildSplatVector(DstTy, MIB.buildConstant(DstEltTy, 0));
+ auto SplatTrue =
+ MIB.buildSplatVector(DstTy, MIB.buildConstant(DstEltTy, ExtTrueVal));
+ MIB.buildSelect(Dst, Src, SplatTrue, SplatZero);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Return the mask type suitable for masking the provided
+/// vector type. This is simply an i1 element type vector of the same
+/// (possibly scalable) length.
+static LLT getMaskTypeFor(LLT VecTy) {
+ assert(VecTy.isVector());
+ ElementCount EC = VecTy.getElementCount();
+ return LLT::vector(EC, LLT::scalar(1));
+}
+
+/// Creates an all ones mask suitable for masking a vector of type VecTy with
+/// vector length VL.
+static MachineInstrBuilder buildAllOnesMask(LLT VecTy, const SrcOp &VL,
+ MachineIRBuilder &MIB,
+ MachineRegisterInfo &MRI) {
+ LLT MaskTy = getMaskTypeFor(VecTy);
+ return MIB.buildInstr(RISCV::G_VMSET_VL, {MaskTy}, {VL});
+}
+
+/// Gets the two common "VL" operands: an all-ones mask and the vector length.
+/// VecTy is a scalable vector type.
+static std::pair<MachineInstrBuilder, Register>
+buildDefaultVLOps(const DstOp &Dst, MachineIRBuilder &MIB,
+ MachineRegisterInfo &MRI) {
+ LLT VecTy = Dst.getLLTTy(MRI);
+ assert(VecTy.isScalableVector() && "Expecting scalable container type");
+ Register VL(RISCV::X0);
+ MachineInstrBuilder Mask = buildAllOnesMask(VecTy, VL, MIB, MRI);
+ return {Mask, VL};
+}
+
+static MachineInstrBuilder
+buildSplatPartsS64WithVL(const DstOp &Dst, const SrcOp &Passthru, Register Lo,
+ Register Hi, Register VL, MachineIRBuilder &MIB,
+ MachineRegisterInfo &MRI) {
+ // TODO: If the Hi bits of the splat are undefined, then it's fine to just
+ // splat Lo even if it might be sign extended. I don't think we have
+ // introduced a case where we're build a s64 where the upper bits are undef
+ // yet.
+
+ // Fall back to a stack store and stride x0 vector load.
+ // TODO: need to lower G_SPLAT_VECTOR_SPLIT_I64. This is done in
+ // preprocessDAG in SDAG.
+ return MIB.buildInstr(RISCV::G_SPLAT_VECTOR_SPLIT_I64_VL, {Dst},
+ {Passthru, Lo, Hi, VL});
+}
+
+static MachineInstrBuilder
+buildSplatSplitS64WithVL(const DstOp &Dst, const SrcOp &Passthru,
+ const SrcOp &Scalar, Register VL,
+ MachineIRBuilder &MIB, MachineRegisterInfo &MRI) {
+  assert(Scalar.getLLTTy(MRI) == LLT::scalar(64) && "Unexpected scalar type!");
+ auto Unmerge = MIB.buildUnmerge(LLT::scalar(32), Scalar);
+ return buildSplatPartsS64WithVL(Dst, Passthru, Unmerge.getReg(0),
+ Unmerge.getReg(1), VL, MIB, MRI);
+}
+
+// Lower splats of s1 types to G_ICMP. For each mask vector type, we have a
+// legal equivalently-sized i8 type, so we can use that as a go-between.
+// Splats of s1 types with a constant value can be legalized as G_VMSET_VL or
+// G_VMCLR_VL.
+bool RISCVLegalizerInfo::legalizeSplatVector(MachineInstr &MI,
+ MachineIRBuilder &MIB) const {
+ assert(MI.getOpcode() == TargetOpcode::G_SPLAT_VECTOR);
+
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register SplatVal = MI.getOperand(1).getReg();
+
+ LLT VecTy = MRI.getType(Dst);
+ LLT XLenTy(STI.getXLenVT());
+
+ // Handle case of s64 element vectors on rv32
+ if (XLenTy.getSizeInBits() == 32 &&
+ VecTy.getElementType().getSizeInBits() == 64) {
+ auto [_, VL] = buildDefaultVLOps(Dst, MIB, MRI);
+ buildSplatSplitS64WithVL(Dst, MIB.buildUndef(VecTy), SplatVal, VL, MIB,
+ MRI);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // All-zeros or all-ones splats are handled specially.
+ MachineInstr &SplatValMI = *MRI.getVRegDef(SplatVal);
+ if (isAllOnesOrAllOnesSplat(SplatValMI, MRI)) {
+ auto VL = buildDefaultVLOps(VecTy, MIB, MRI).second;
+ MIB.buildInstr(RISCV::G_VMSET_VL, {Dst}, {VL});
+ MI.eraseFromParent();
+ return true;
+ }
+ if (isNullOrNullSplat(SplatValMI, MRI)) {
+ auto VL = buildDefaultVLOps(VecTy, MIB, MRI).second;
+ MIB.buildInstr(RISCV::G_VMCLR_VL, {Dst}, {VL});
+ MI.eraseFromParent();
+ return true;
+ }
+  // Handle a non-constant mask splat (i.e. we cannot tell whether it is all
+  // zeros or all ones) by promoting it to an s8 splat.
+ LLT InterEltTy = LLT::scalar(8);
+ LLT InterTy = VecTy.changeElementType(InterEltTy);
+ auto ZExtSplatVal = MIB.buildZExt(InterEltTy, SplatVal);
+ auto And =
+ MIB.buildAnd(InterEltTy, ZExtSplatVal, MIB.buildConstant(InterEltTy, 1));
+ auto LHS = MIB.buildSplatVector(InterTy, And);
+ auto ZeroSplat =
+ MIB.buildSplatVector(InterTy, MIB.buildConstant(InterEltTy, 0));
+ MIB.buildICmp(CmpInst::Predicate::ICMP_NE, Dst, LHS, ZeroSplat);
MI.eraseFromParent();
return true;
}
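As a per-lane model of the non-constant mask splat above (illustrative C++ sketch; nothing here is part of the patch):

#include <cstdint>

// Each result lane of the promoted splat is ((zext(SplatVal) & 1) != 0),
// i.e. the final G_ICMP ne against the zero splat.
bool splatMaskLane(bool SplatVal) {
  uint8_t Promoted = static_cast<uint8_t>(SplatVal) & 1; // G_ZEXT + G_AND
  return Promoted != 0;                                  // G_ICMP ne 0
}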
@@ -640,6 +804,12 @@ bool RISCVLegalizerInfo::legalizeCustom(
return legalizeVAStart(MI, MIRBuilder);
case TargetOpcode::G_VSCALE:
return legalizeVScale(MI, MIRBuilder);
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ANYEXT:
+ return legalizeExt(MI, MIRBuilder);
+ case TargetOpcode::G_SPLAT_VECTOR:
+ return legalizeSplatVector(MI, MIRBuilder);
}
llvm_unreachable("expected switch to return");
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
index e2a98c8..5bb1e7a 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
@@ -43,6 +43,8 @@ private:
bool legalizeVAStart(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
bool legalizeVScale(MachineInstr &MI, MachineIRBuilder &MIB) const;
+ bool legalizeExt(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
+ bool legalizeSplatVector(MachineInstr &MI, MachineIRBuilder &MIB) const;
};
} // end namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
index 888bcc4..86e4434 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
@@ -290,16 +290,7 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
switch (Opc) {
case TargetOpcode::G_ADD:
- case TargetOpcode::G_SUB: {
- if (MRI.getType(MI.getOperand(0).getReg()).isVector()) {
- LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- return getInstructionMapping(
- DefaultMappingID, /*Cost=*/1,
- getVRBValueMapping(Ty.getSizeInBits().getKnownMinValue()),
- NumOperands);
- }
- }
- LLVM_FALLTHROUGH;
+ case TargetOpcode::G_SUB:
case TargetOpcode::G_SHL:
case TargetOpcode::G_ASHR:
case TargetOpcode::G_LSHR:
@@ -320,14 +311,6 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_PTR_ADD:
case TargetOpcode::G_PTRTOINT:
case TargetOpcode::G_INTTOPTR:
- case TargetOpcode::G_TRUNC:
- case TargetOpcode::G_ANYEXT:
- case TargetOpcode::G_SEXT:
- case TargetOpcode::G_ZEXT:
- case TargetOpcode::G_SEXTLOAD:
- case TargetOpcode::G_ZEXTLOAD:
- return getInstructionMapping(DefaultMappingID, /*Cost=*/1, GPRValueMapping,
- NumOperands);
case TargetOpcode::G_FADD:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMUL:
@@ -338,25 +321,48 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINNUM: {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- return getInstructionMapping(DefaultMappingID, /*Cost=*/1,
- getFPValueMapping(Ty.getSizeInBits()),
- NumOperands);
+ TypeSize Size = Ty.getSizeInBits();
+
+ const ValueMapping *Mapping;
+ if (Ty.isVector())
+ Mapping = getVRBValueMapping(Size.getKnownMinValue());
+ else if (isPreISelGenericFloatingPointOpcode(Opc))
+ Mapping = getFPValueMapping(Size.getFixedValue());
+ else
+ Mapping = GPRValueMapping;
+
+#ifndef NDEBUG
+ // Make sure all the operands are using similar size and type.
+ for (unsigned Idx = 1; Idx != NumOperands; ++Idx) {
+ LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg());
+ assert(Ty.isVector() == OpTy.isVector() &&
+ "Operand has incompatible type");
+ // Don't check size for GPR.
+ if (OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc))
+ assert(Size == OpTy.getSizeInBits() && "Operand has incompatible size");
+ }
+#endif // End NDEBUG
+
+ return getInstructionMapping(DefaultMappingID, 1, Mapping, NumOperands);
}
+ case TargetOpcode::G_SEXTLOAD:
+ case TargetOpcode::G_ZEXTLOAD:
+ return getInstructionMapping(DefaultMappingID, /*Cost=*/1, GPRValueMapping,
+ NumOperands);
case TargetOpcode::G_IMPLICIT_DEF: {
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
- uint64_t DstMinSize = DstTy.getSizeInBits().getKnownMinValue();
+ unsigned DstMinSize = DstTy.getSizeInBits().getKnownMinValue();
auto Mapping = GPRValueMapping;
// FIXME: May need to do a better job determining when to use FPRB.
// For example, the look through COPY case:
// %0:_(s32) = G_IMPLICIT_DEF
// %1:_(s32) = COPY %0
// $f10_d = COPY %1(s32)
- if (anyUseOnlyUseFP(Dst, MRI, TRI))
- Mapping = getFPValueMapping(DstMinSize);
-
if (DstTy.isVector())
Mapping = getVRBValueMapping(DstMinSize);
+ else if (anyUseOnlyUseFP(Dst, MRI, TRI))
+ Mapping = getFPValueMapping(DstMinSize);
return getInstructionMapping(DefaultMappingID, /*Cost=*/1, Mapping,
NumOperands);
@@ -529,7 +535,10 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (!Ty.isValid())
continue;
- if (isPreISelGenericFloatingPointOpcode(Opc))
+ if (Ty.isVector())
+ OpdsMapping[Idx] =
+ getVRBValueMapping(Ty.getSizeInBits().getKnownMinValue());
+ else if (isPreISelGenericFloatingPointOpcode(Opc))
OpdsMapping[Idx] = getFPValueMapping(Ty.getSizeInBits());
else
OpdsMapping[Idx] = GPRValueMapping;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 55ba494..f99dc0b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3287,24 +3287,24 @@ bool RISCVDAGToDAGISel::selectVSplatUimm(SDValue N, unsigned Bits,
}
bool RISCVDAGToDAGISel::selectLow8BitsVSplat(SDValue N, SDValue &SplatVal) {
- // Truncates are custom lowered during legalization.
- auto IsTrunc = [this](SDValue N) {
- if (N->getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL)
+ auto IsExtOrTrunc = [](SDValue N) {
+ switch (N->getOpcode()) {
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ // There's no passthru on these _VL nodes so any VL/mask is ok, since any
+ // inactive elements will be undef.
+ case RISCVISD::TRUNCATE_VECTOR_VL:
+ case RISCVISD::VSEXT_VL:
+ case RISCVISD::VZEXT_VL:
+ return true;
+ default:
return false;
- SDValue VL;
- selectVLOp(N->getOperand(2), VL);
- // Any vmset_vl is ok, since any bits past VL are undefined and we can
- // assume they are set.
- return N->getOperand(1).getOpcode() == RISCVISD::VMSET_VL &&
- isa<ConstantSDNode>(VL) &&
- cast<ConstantSDNode>(VL)->getSExtValue() == RISCV::VLMaxSentinel;
+ }
};
- // We can have multiple nested truncates, so unravel them all if needed.
- while (N->getOpcode() == ISD::SIGN_EXTEND ||
- N->getOpcode() == ISD::ZERO_EXTEND || IsTrunc(N)) {
- if (!N.hasOneUse() ||
- N.getValueType().getSizeInBits().getKnownMinValue() < 8)
+ // We can have multiple nested nodes, so unravel them all if needed.
+ while (IsExtOrTrunc(N)) {
+ if (!N.hasOneUse() || N.getScalarValueSizeInBits() < 8)
return false;
N = N->getOperand(0);
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index ee83f9d..279d8a4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21115,12 +21115,10 @@ void RVVArgDispatcher::constructArgInfos(ArrayRef<Type *> TypeList) {
RegisterVT.getVectorElementType() == MVT::i1) {
RVVArgInfos.push_back({1, RegisterVT, true});
FirstVMaskAssigned = true;
- } else {
- RVVArgInfos.push_back({1, RegisterVT, false});
+ --NumRegs;
}
- RVVArgInfos.insert(RVVArgInfos.end(), --NumRegs,
- {1, RegisterVT, false});
+ RVVArgInfos.insert(RVVArgInfos.end(), NumRegs, {1, RegisterVT, false});
}
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrGISel.td b/llvm/lib/Target/RISCV/RISCVInstrGISel.td
index 54e22d6..ba40662 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrGISel.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrGISel.td
@@ -32,3 +32,28 @@ def G_READ_VLENB : RISCVGenericInstruction {
let hasSideEffects = false;
}
def : GINodeEquiv<G_READ_VLENB, riscv_read_vlenb>;
+
+// Pseudo equivalent to a RISCVISD::VMCLR_VL
+def G_VMCLR_VL : RISCVGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$vl);
+ let hasSideEffects = false;
+}
+def : GINodeEquiv<G_VMCLR_VL, riscv_vmclr_vl>;
+
+// Pseudo equivalent to a RISCVISD::VMSET_VL
+def G_VMSET_VL : RISCVGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$vl);
+ let hasSideEffects = false;
+}
+def : GINodeEquiv<G_VMSET_VL, riscv_vmset_vl>;
+
+// Pseudo equivalent to a RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL. There is no
+// record marked as equivalent via GINodeEquiv because the node is lowered
+// away before instruction selection.
+def G_SPLAT_VECTOR_SPLIT_I64_VL : RISCVGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$passthru, type1:$hi, type1:$lo, type2:$vl);
+ let hasSideEffects = false;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index cc44092..73d52d5 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -387,6 +387,9 @@ def SDT_RISCVVEXTEND_VL : SDTypeProfile<1, 3, [SDTCisVec<0>,
SDTCisVT<3, XLenVT>]>;
def riscv_sext_vl : SDNode<"RISCVISD::VSEXT_VL", SDT_RISCVVEXTEND_VL>;
def riscv_zext_vl : SDNode<"RISCVISD::VZEXT_VL", SDT_RISCVVEXTEND_VL>;
+def riscv_ext_vl : PatFrags<(ops node:$A, node:$B, node:$C),
+ [(riscv_sext_vl node:$A, node:$B, node:$C),
+ (riscv_zext_vl node:$A, node:$B, node:$C)]>;
def riscv_trunc_vector_vl : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL",
SDTypeProfile<1, 3, [SDTCisVec<0>,
@@ -535,6 +538,11 @@ def riscv_zext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C),
return N->hasOneUse();
}]>;
+def riscv_ext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C),
+ (riscv_ext_vl node:$A, node:$B, node:$C), [{
+ return N->hasOneUse();
+}]>;
+
def riscv_fpextend_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C),
(riscv_fpextend_vl node:$A, node:$B, node:$C), [{
return N->hasOneUse();
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 51a7a0a1..c1facc79 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -630,6 +630,19 @@ foreach vtiToWti = AllWidenableIntVectors in {
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_shl_vl
+ (wti.Vector (riscv_zext_vl_oneuse
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask V0), VLOpFrag)),
+ (wti.Vector (riscv_ext_vl_oneuse
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Mask V0), VLOpFrag)),
+ (wti.Vector wti.RegClass:$merge),
+ (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
+ wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
+ def : Pat<(riscv_shl_vl
(wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
(wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
(wti.Vector wti.RegClass:$merge),
@@ -639,6 +652,17 @@ foreach vtiToWti = AllWidenableIntVectors in {
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_shl_vl
+ (wti.Vector (riscv_zext_vl_oneuse
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask V0), VLOpFrag)),
+ (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
+ (wti.Vector wti.RegClass:$merge),
+ (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
+ wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
+ def : Pat<(riscv_shl_vl
(wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
(wti.Vector (SplatPat_uimm5 uimm5:$rs1)),
(wti.Vector wti.RegClass:$merge),
@@ -647,6 +671,17 @@ foreach vtiToWti = AllWidenableIntVectors in {
wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+ def : Pat<(riscv_shl_vl
+ (wti.Vector (riscv_zext_vl_oneuse
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask V0), VLOpFrag)),
+ (wti.Vector (SplatPat_uimm5 uimm5:$rs1)),
+ (wti.Vector wti.RegClass:$merge),
+ (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
+ wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
def : Pat<(riscv_vwsll_vl
(vti.Vector vti.RegClass:$rs2),
(vti.Vector vti.RegClass:$rs1),
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index ba108912..85f8f5f 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -254,6 +254,7 @@ public:
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
+ bool isTargetAndroid() const { return getTargetTriple().isAndroid(); }
bool isTargetFuchsia() const { return getTargetTriple().isOSFuchsia(); }
bool useConstantPoolForLargeInts() const;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 38304ff..aeec063 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -245,6 +245,10 @@ RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return TTI::TCC_Free;
}
+bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
+ return ST->hasVInstructions();
+}
+
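A minimal caller sketch, assuming a vectorizer that wants EVL-style predication (the helper name and setup are invented for illustration; only hasActiveVectorLength itself comes from this patch):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// With the change above, RISC-V answers true here whenever the subtarget
// has V instructions, so a transform could prefer %evl-based VP intrinsics
// over mask-only predication.
static bool preferEVLPredication(const TargetTransformInfo &TTI,
                                 Type *DataTy) {
  return TTI.hasActiveVectorLength(Instruction::Load, DataTy, Align(16));
}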
TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
@@ -861,9 +865,14 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
  // TODO: add more intrinsics
case Intrinsic::experimental_stepvector: {
- unsigned Cost = 1; // vid
auto LT = getTypeLegalizationCost(RetTy);
- return Cost + (LT.first - 1);
+  // Legalization of illegal types involves a vid.v instruction plus
+  // (LT.first - 1) vector adds.
+ if (ST->hasVInstructions())
+ return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
+ (LT.first - 1) *
+ getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
+ return 1 + (LT.first - 1);
}
case Intrinsic::vp_rint: {
// RISC-V target uses at least 5 instructions to lower rounding intrinsics.
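A worked example of the new stepvector cost (illustrative arithmetic, not LLVM API): if RetTy legalizes into LT.first = 4 parts, the cost is cost(VID_V) + 3 * cost(VADD_VX); with unit instruction costs that reproduces the old 1 + (LT.first - 1) formula, but it now scales with the LMUL-dependent instruction costs.

// Mirrors the computation above: one vid.v materializes the step vector,
// then NumParts - 1 vadd.vx instructions offset each additional part.
unsigned stepVectorCost(unsigned NumParts, unsigned VidCost,
                        unsigned VAddCost) {
  return VidCost + (NumParts - 1) * VAddCost;
}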
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index ac32aea..c0169ea 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -78,6 +78,22 @@ public:
const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind);
+ /// \name EVL Support for predicated vectorization.
+ /// Whether the target supports the %evl parameter of VP intrinsic efficiently
+ /// in hardware, for the given opcode and type/alignment. (see LLVM Language
+ /// Reference - "Vector Predication Intrinsics",
+ /// https://llvm.org/docs/LangRef.html#vector-predication-intrinsics and
+ /// "IR-level VP intrinsics",
+ /// https://llvm.org/docs/Proposals/VectorPredication.html#ir-level-vp-intrinsics).
+ /// \param Opcode the opcode of the instruction checked for predicated version
+ /// support.
+  /// \param DataType the type of the instruction with the \p Opcode checked
+  /// for predication support.
+ /// \param Alignment the alignment for memory access operation checked for
+ /// predicated version support.
+ bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
+ Align Alignment) const;
+
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 1674cef..9e4ba21 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -243,8 +243,12 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
continue;
MetadataAsValue *VMD = cast<MetadataAsValue>(II->getOperand(1));
- SPIRVType *ElementType = GR->getOrCreateSPIRVType(
- cast<ConstantAsMetadata>(VMD->getMetadata())->getType(), MIRBuilder);
+ Type *ElementTy = cast<ConstantAsMetadata>(VMD->getMetadata())->getType();
+ if (isUntypedPointerTy(ElementTy))
+ ElementTy =
+ TypedPointerType::get(IntegerType::getInt8Ty(II->getContext()),
+ getPointerAddressSpace(ElementTy));
+ SPIRVType *ElementType = GR->getOrCreateSPIRVType(ElementTy, MIRBuilder);
return GR->getOrCreateSPIRVPointerType(
ElementType, MIRBuilder,
addressSpaceToStorageClass(
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index e0099e5..ac79937 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -47,7 +47,7 @@ class SPIRVGlobalRegistry {
DenseMap<const MachineOperand *, const Function *> InstrToFunction;
  // Maps Functions to their calls (in the form of machine instructions,
  // OpFunctionCall) that happen before the definition is available
- DenseMap<const Function *, SmallVector<MachineInstr *>> ForwardCalls;
+ DenseMap<const Function *, SmallPtrSet<MachineInstr *, 8>> ForwardCalls;
// Look for an equivalent of the newType in the map. Return the equivalent
// if it's found, otherwise insert newType to the map and return the type.
@@ -215,12 +215,12 @@ public:
if (It == ForwardCalls.end())
ForwardCalls[F] = {MI};
else
- It->second.push_back(MI);
+ It->second.insert(MI);
}
  // Map a Function to the set of machine instructions that represent
  // forward function calls, or to nullptr if not found.
- SmallVector<MachineInstr *> *getForwardCalls(const Function *F) {
+ SmallPtrSet<MachineInstr *, 8> *getForwardCalls(const Function *F) {
auto It = ForwardCalls.find(F);
return It == ForwardCalls.end() ? nullptr : &It->second;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
index 90a3155..d450078 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -193,7 +193,7 @@ void validateForwardCalls(const SPIRVSubtarget &STI,
MachineRegisterInfo *DefMRI, SPIRVGlobalRegistry &GR,
MachineInstr &FunDef) {
const Function *F = GR.getFunctionByDefinition(&FunDef);
- if (SmallVector<MachineInstr *> *FwdCalls = GR.getForwardCalls(F))
+ if (SmallPtrSet<MachineInstr *, 8> *FwdCalls = GR.getForwardCalls(F))
for (MachineInstr *FunCall : *FwdCalls) {
MachineRegisterInfo *CallMRI =
&FunCall->getParent()->getParent()->getRegInfo();
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index f4525e71..49749b5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -1825,7 +1825,24 @@ bool SPIRVInstructionSelector::selectAllocaArray(Register ResVReg,
bool SPIRVInstructionSelector::selectFrameIndex(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
- return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpVariable))
+  // Reorder instructions if needed: all OpVariable instructions in a
+  // function must be the first instructions in the first block.
+ MachineFunction *MF = I.getParent()->getParent();
+ MachineBasicBlock *MBB = &MF->front();
+ auto It = MBB->SkipPHIsAndLabels(MBB->begin()), E = MBB->end();
+ bool IsHeader = false;
+ unsigned Opcode;
+ for (; It != E && It != I; ++It) {
+ Opcode = It->getOpcode();
+ if (Opcode == SPIRV::OpFunction || Opcode == SPIRV::OpFunctionParameter) {
+ IsHeader = true;
+ } else if (IsHeader &&
+ !(Opcode == SPIRV::ASSIGN_TYPE || Opcode == SPIRV::OpLabel)) {
+ ++It;
+ break;
+ }
+ }
+ return BuildMI(*MBB, It, It->getDebugLoc(), TII.get(SPIRV::OpVariable))
.addDef(ResVReg)
.addUse(GR.getSPIRVTypeID(ResType))
.addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function))
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index 215a8ea..6855471 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -434,6 +434,50 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
default:
// See if this is a generic print operand
return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
+ case 'L': // Low order register of a twin word register operand
+ case 'H': // High order register of a twin word register operand
+ {
+ const SparcSubtarget &Subtarget = MF->getSubtarget<SparcSubtarget>();
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ const SparcRegisterInfo *RegisterInfo = Subtarget.getRegisterInfo();
+ Register MOReg = MO.getReg();
+
+ Register HiReg, LoReg;
+ if (!SP::IntPairRegClass.contains(MOReg)) {
+      // If we aren't given a register pair already, find out which pair it
+      // belongs to. Note that the specified register operand, which refers
+      // to the high part of the twin word, must be an even-numbered register.
+ MOReg = RegisterInfo->getMatchingSuperReg(MOReg, SP::sub_even,
+ &SP::IntPairRegClass);
+ if (!MOReg) {
+ SMLoc Loc;
+ OutContext.reportError(
+ Loc, "Hi part of pair should point to an even-numbered register");
+ OutContext.reportError(
+ Loc, "(note that in some cases it might be necessary to manually "
+ "bind the input/output registers instead of relying on "
+ "automatic allocation)");
+ return true;
+ }
+ }
+
+ HiReg = RegisterInfo->getSubReg(MOReg, SP::sub_even);
+ LoReg = RegisterInfo->getSubReg(MOReg, SP::sub_odd);
+
+ Register Reg;
+ switch (ExtraCode[0]) {
+ case 'L':
+ Reg = LoReg;
+ break;
+ case 'H':
+ Reg = HiReg;
+ break;
+ }
+
+ O << '%' << SparcInstPrinter::getRegisterName(Reg);
+ return false;
+ }
case 'f':
case 'r':
break;
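For context, these modifiers match GCC's SPARC operand modifiers, which existing inline assembly (e.g. in the Linux kernel) relies on. A hypothetical 32-bit SPARC use, not taken from the patch:

// On 32-bit SPARC a 64-bit "r" operand occupies an even/odd register pair;
// %H2 prints the even (high) register and %L2 the odd (low) one.
unsigned long long splitPair(unsigned long long X) {
  unsigned Hi, Lo;
  asm("mov %H2, %0\n\t"
      "mov %L2, %1"
      : "=r"(Hi), "=r"(Lo)
      : "r"(X));
  return ((unsigned long long)Hi << 32) | Lo;
}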
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a9751e1..6f65344 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42725,6 +42725,8 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
switch (Op.getOpcode()) {
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
+ case X86ISD::UNPCKH:
+ case X86ISD::UNPCKL:
return false;
}
return TargetLowering::canCreateUndefOrPoisonForTargetNode(
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index ce3b6af..270dd32 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2161,6 +2161,11 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
def : Pat<(X86sub_flag_nocf GR16:$src, -1), (!cast<Instruction>(INC16r#suffix) GR16:$src)>;
def : Pat<(X86sub_flag_nocf GR32:$src, -1), (!cast<Instruction>(INC32r#suffix) GR32:$src)>;
def : Pat<(X86sub_flag_nocf GR64:$src, -1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>;
+
+ def : Pat<(or_is_add GR8:$src, 1), (!cast<Instruction>(INC8r#suffix) GR8:$src)>;
+ def : Pat<(or_is_add GR16:$src, 1), (!cast<Instruction>(INC16r#suffix) GR16:$src)>;
+ def : Pat<(or_is_add GR32:$src, 1), (!cast<Instruction>(INC32r#suffix) GR32:$src)>;
+ def : Pat<(or_is_add GR64:$src, 1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>;
}
}
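These patterns fire only when or_is_add already proves the OR is an ADD (the operands share no set bits), so an `or x, 1` with a known-clear low bit can select to INC. A hedged C++ illustration:

#include <cstdint>

// Since X has its low bit cleared first, X | 1 == X + 1, so instruction
// selection may use INC32r (via the or_is_add pattern above) instead of
// OR32ri when the EFLAGS-defining form is wanted.
uint32_t orOneAsInc(uint32_t X) {
  X &= ~1u;     // low bit provably zero -> or_is_add applies
  return X | 1;
}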
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index f243343..a5b2e48 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6276,10 +6276,10 @@ static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
case X86::RCPSSm:
case X86::RCPSSr_Int:
case X86::RCPSSm_Int:
- case X86::ROUNDSDr:
- case X86::ROUNDSDm:
- case X86::ROUNDSSr:
- case X86::ROUNDSSm:
+ case X86::ROUNDSDri:
+ case X86::ROUNDSDmi:
+ case X86::ROUNDSSri:
+ case X86::ROUNDSSmi:
case X86::RSQRTSSr:
case X86::RSQRTSSm:
case X86::RSQRTSSr_Int:
@@ -6778,14 +6778,14 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
case X86::VRCPSSr_Int:
case X86::VRCPSSm:
case X86::VRCPSSm_Int:
- case X86::VROUNDSDr:
- case X86::VROUNDSDm:
- case X86::VROUNDSDr_Int:
- case X86::VROUNDSDm_Int:
- case X86::VROUNDSSr:
- case X86::VROUNDSSm:
- case X86::VROUNDSSr_Int:
- case X86::VROUNDSSm_Int:
+ case X86::VROUNDSDri:
+ case X86::VROUNDSDmi:
+ case X86::VROUNDSDri_Int:
+ case X86::VROUNDSDmi_Int:
+ case X86::VROUNDSSri:
+ case X86::VROUNDSSmi:
+ case X86::VROUNDSSri_Int:
+ case X86::VROUNDSSmi_Int:
case X86::VRSQRTSSr:
case X86::VRSQRTSSr_Int:
case X86::VRSQRTSSm:
@@ -7516,8 +7516,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VRCPSSr_Int:
case X86::RSQRTSSr_Int:
case X86::VRSQRTSSr_Int:
- case X86::ROUNDSSr_Int:
- case X86::VROUNDSSr_Int:
+ case X86::ROUNDSSri_Int:
+ case X86::VROUNDSSri_Int:
case X86::COMISSrr_Int:
case X86::VCOMISSrr_Int:
case X86::VCOMISSZrr_Int:
@@ -7685,8 +7685,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VCVTSD2USI64Zrr_Int:
case X86::VCVTTSD2USIZrr_Int:
case X86::VCVTTSD2USI64Zrr_Int:
- case X86::ROUNDSDr_Int:
- case X86::VROUNDSDr_Int:
+ case X86::ROUNDSDri_Int:
+ case X86::VROUNDSDri_Int:
case X86::COMISDrr_Int:
case X86::VCOMISDrr_Int:
case X86::VCOMISDZrr_Int:
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 69d4536..2b391b6 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5475,35 +5475,35 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
let Uses = [MXCSR], mayRaiseFPException = 1 in {
- def r : SS4AIi8<opc, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
- Sched<[sched]>;
+ def ri : SS4AIi8<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
+ Sched<[sched]>;
// Vector intrinsic operation, mem
- def m : SS4AIi8<opc, MRMSrcMem,
- (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst,
- (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
- Sched<[sched.Folded]>;
+ def mi : SS4AIi8<opc, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
+ Sched<[sched.Folded]>;
}
}
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
- def SSr : SS4AIi8<opcss, MRMSrcReg,
+ def SSri : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[sched]>;
let mayLoad = 1 in
- def SSm : SS4AIi8<opcss, MRMSrcMem,
+ def SSmi : SS4AIi8<opcss, MRMSrcMem,
(outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -5511,14 +5511,14 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
- def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ def SDri : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[sched]>;
let mayLoad = 1 in
- def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ def SDmi : SS4AIi8<opcsd, MRMSrcMem,
(outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -5530,44 +5530,44 @@ multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
- def SSr : SS4AIi8<opcss, MRMSrcReg,
- (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[sched]>;
+ def SSri : SS4AIi8<opcss, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched]>;
let mayLoad = 1 in
- def SSm : SS4AIi8<opcss, MRMSrcMem,
- (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def SSmi : SS4AIi8<opcss, MRMSrcMem,
+ (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
- def SDr : SS4AIi8<opcsd, MRMSrcReg,
- (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[sched]>;
+ def SDri : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched]>;
let mayLoad = 1 in
- def SDm : SS4AIi8<opcsd, MRMSrcMem,
- (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def SDmi : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
}
-multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr, X86FoldableSchedWrite sched,
- ValueType VT32, ValueType VT64,
- SDNode OpNode, bit Is2Addr = 1> {
+multiclass sse41_fp_unop_s_int<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr, X86FoldableSchedWrite sched,
+ ValueType VT32, ValueType VT64,
+ SDNode OpNode, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in {
- def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+ def SSri_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -5577,7 +5577,7 @@ let ExeDomain = SSEPackedSingle in {
[(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
Sched<[sched]>;
- def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
+ def SSmi_Int : SS4AIi8<opcss, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -5590,7 +5590,7 @@ let ExeDomain = SSEPackedSingle in {
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
let ExeDomain = SSEPackedDouble in {
- def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+ def SDri_Int : SS4AIi8<opcsd, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -5600,7 +5600,7 @@ let ExeDomain = SSEPackedDouble in {
[(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
Sched<[sched]>;
- def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
+ def SDmi_Int : SS4AIi8<opcsd, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -5636,25 +5636,25 @@ let Predicates = [HasAVX, NoVLX] in {
}
}
let Predicates = [UseAVX] in {
- defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
- v4f32, v2f64, X86RndScales, 0>,
- VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
+ defm VROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
+ v4f32, v2f64, X86RndScales, 0>,
+ VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
}
let Predicates = [UseAVX] in {
def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
+ (VROUNDSSri (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
+ (VROUNDSDri (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}
let Predicates = [UseAVX, OptForSize] in {
def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
+ (VROUNDSSmi (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
+ (VROUNDSDmi (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}
let ExeDomain = SSEPackedSingle in
@@ -5667,21 +5667,21 @@ defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
let Constraints = "$src1 = $dst" in
-defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
- v4f32, v2f64, X86RndScales>;
+defm ROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
+ v4f32, v2f64, X86RndScales>;
let Predicates = [UseSSE41] in {
def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
- (ROUNDSSr FR32:$src1, timm:$src2)>;
+ (ROUNDSSri FR32:$src1, timm:$src2)>;
def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
- (ROUNDSDr FR64:$src1, timm:$src2)>;
+ (ROUNDSDri FR64:$src1, timm:$src2)>;
}
let Predicates = [UseSSE41, OptForSize] in {
def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
- (ROUNDSSm addr:$src1, timm:$src2)>;
+ (ROUNDSSmi addr:$src1, timm:$src2)>;
def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
- (ROUNDSDm addr:$src1, timm:$src2)>;
+ (ROUNDSDmi addr:$src1, timm:$src2)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 0027de8..63ac910 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -324,14 +324,14 @@ defm : BWWriteResPair<WriteFMAX, [BWPort01], 5, [1], 1, 5>; // Fused Multiply
defm : BWWriteResPair<WriteFMAY, [BWPort01], 5, [1], 1, 6>; // Fused Multiply Add (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteFMAZ>;
defm : BWWriteResPair<WriteDPPD, [BWPort0,BWPort1,BWPort5], 9, [1,1,1], 3, 5>; // Floating point double dot product.
-defm : BWWriteResPair<WriteDPPS, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 5>; // Floating point single dot product.
-defm : BWWriteResPair<WriteDPPSY, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 6>; // Floating point single dot product (YMM).
+defm : X86WriteRes<WriteDPPS, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4>;
+defm : X86WriteRes<WriteDPPSY, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4>;
+defm : X86WriteRes<WriteDPPSLd, [BWPort0,BWPort1,BWPort5,BWPort06,BWPort23], 19, [2,1,1,1,1], 6>;
+defm : X86WriteRes<WriteDPPSYLd, [BWPort0,BWPort1,BWPort5,BWPort06,BWPort23], 20, [2,1,1,1,1], 6>;
defm : BWWriteResPair<WriteFSign, [BWPort5], 1>; // Floating point fabs/fchs.
-defm : X86WriteRes<WriteFRnd, [BWPort23], 6, [1], 1>; // Floating point rounding.
-defm : X86WriteRes<WriteFRndY, [BWPort23], 6, [1], 1>; // Floating point rounding (YMM/ZMM).
+defm : BWWriteResPair<WriteFRnd, [BWPort1], 6, [2], 2, 5>; // Floating point rounding.
+defm : BWWriteResPair<WriteFRndY, [BWPort1], 6, [2], 2, 6>; // Floating point rounding (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteFRndZ>;
-defm : X86WriteRes<WriteFRndLd, [BWPort1,BWPort23], 11, [2,1], 3>;
-defm : X86WriteRes<WriteFRndYLd, [BWPort1,BWPort23], 12, [2,1], 3>;
defm : BWWriteResPair<WriteFLogic, [BWPort5], 1, [1], 1, 5>; // Floating point and/or/xor logicals.
defm : BWWriteResPair<WriteFLogicY, [BWPort5], 1, [1], 1, 6>; // Floating point and/or/xor logicals (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index a11b470..516dc62 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -324,15 +324,14 @@ defm : HWWriteResPair<WriteFMAX, [HWPort01], 5, [1], 1, 6>;
defm : HWWriteResPair<WriteFMAY, [HWPort01], 5, [1], 1, 7>;
defm : HWWriteResPair<WriteFMAZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
defm : HWWriteResPair<WriteDPPD, [HWPort0,HWPort1,HWPort5], 9, [1,1,1], 3, 6>;
-defm : HWWriteResPair<WriteDPPS, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 6>;
-defm : HWWriteResPair<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>;
+defm : X86WriteRes<WriteDPPS, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4>;
+defm : X86WriteRes<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4>;
+defm : X86WriteRes<WriteDPPSLd, [HWPort0,HWPort1,HWPort5,HWPort06,HWPort23], 20, [2,1,1,1,1], 6>;
+defm : X86WriteRes<WriteDPPSYLd, [HWPort0,HWPort1,HWPort5,HWPort06,HWPort23], 21, [2,1,1,1,1], 6>;
defm : HWWriteResPair<WriteFSign, [HWPort0], 1>;
-defm : X86WriteRes<WriteFRnd, [HWPort23], 6, [1], 1>;
-defm : X86WriteRes<WriteFRndY, [HWPort23], 6, [1], 1>;
-defm : X86WriteRes<WriteFRndZ, [HWPort23], 6, [1], 1>; // Unsupported = 1
-defm : X86WriteRes<WriteFRndLd, [HWPort1,HWPort23], 12, [2,1], 3>;
-defm : X86WriteRes<WriteFRndYLd, [HWPort1,HWPort23], 13, [2,1], 3>;
-defm : X86WriteRes<WriteFRndZLd, [HWPort1,HWPort23], 13, [2,1], 3>; // Unsupported = 1
+defm : HWWriteResPair<WriteFRnd, [HWPort1], 6, [2], 2, 6>;
+defm : HWWriteResPair<WriteFRndY, [HWPort1], 6, [2], 2, 7>;
+defm : HWWriteResPair<WriteFRndZ, [HWPort1], 6, [2], 2, 7>; // Unsupported = 1
defm : HWWriteResPair<WriteFLogic, [HWPort5], 1, [1], 1, 6>;
defm : HWWriteResPair<WriteFLogicY, [HWPort5], 1, [1], 1, 7>;
defm : HWWriteResPair<WriteFLogicZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
index 88bb9ad..ff3fe32 100644
--- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td
+++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
@@ -2290,8 +2290,8 @@ def SPRWriteResGroup218 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_11]> {
let Latency = 15;
let NumMicroOps = 3;
}
-def : InstRW<[SPRWriteResGroup218], (instregex "^(V?)ROUNDP(D|S)m$")>;
-def : InstRW<[SPRWriteResGroup218, ReadAfterVecXLd], (instregex "^(V?)ROUNDS(D|S)m((_Int)?)$",
+def : InstRW<[SPRWriteResGroup218], (instregex "^(V?)ROUNDP(D|S)mi$")>;
+def : InstRW<[SPRWriteResGroup218, ReadAfterVecXLd], (instregex "^(V?)ROUNDS(D|S)mi((_Int)?)$",
"^VRNDSCALEP(D|S)Z128rm(bi|ik)$",
"^VRNDSCALEP(D|S)Z128rmbik(z?)$",
"^VRNDSCALEP(D|S)Z128rmi((kz)?)$",
@@ -2303,13 +2303,13 @@ def SPRWriteResGroup219 : SchedWriteRes<[SPRPort00_01]> {
let Latency = 8;
let NumMicroOps = 2;
}
-def : InstRW<[SPRWriteResGroup219], (instregex "^(V?)ROUND(PD|SS)r$",
- "^(V?)ROUND(PS|SD)r$",
- "^(V?)ROUNDS(D|S)r_Int$",
+def : InstRW<[SPRWriteResGroup219], (instregex "^(V?)ROUND(PD|SS)ri$",
+ "^(V?)ROUND(PS|SD)ri$",
+ "^(V?)ROUNDS(D|S)ri_Int$",
"^VRNDSCALEP(D|S)Z(128|256)rri((k|kz)?)$",
"^VRNDSCALES(D|S)Zr$",
"^VRNDSCALES(D|S)Zr(b?)_Int((k|kz)?)$",
- "^VROUNDP(D|S)Yr$")>;
+ "^VROUNDP(D|S)Yri$")>;
def SPRWriteResGroup220 : SchedWriteRes<[SPRPort00_06]> {
let ReleaseAtCycles = [2];
@@ -3737,7 +3737,7 @@ def SPRWriteResGroup390 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_11]> {
let NumMicroOps = 3;
}
def : InstRW<[SPRWriteResGroup390], (instregex "^VF(C?)MADDCPHZ(128|256)m(b?)$",
- "^VROUNDP(D|S)Ym$")>;
+ "^VROUNDP(D|S)Ymi$")>;
def : InstRW<[SPRWriteResGroup390, ReadAfterVecXLd], (instregex "^VF(C?)MADDCSHZm$",
"^VF(C?)MULCPHZ128rm(b?)$",
"^VF(C?)MULCSHZrm$",
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 4fa138f..3ee931f 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -311,8 +311,10 @@ defm : SKLWriteResPair<WriteFMAX, [SKLPort01], 4, [1], 1, 6>;
defm : SKLWriteResPair<WriteFMAY, [SKLPort01], 4, [1], 1, 7>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;
defm : SKLWriteResPair<WriteDPPD, [SKLPort5,SKLPort01], 9, [1,2], 3, 6>; // Floating point double dot product.
-defm : SKLWriteResPair<WriteDPPS, [SKLPort5,SKLPort01], 13, [1,3], 4, 6>;
-defm : SKLWriteResPair<WriteDPPSY, [SKLPort5,SKLPort01], 13, [1,3], 4, 7>;
+defm : X86WriteRes<WriteDPPS, [SKLPort5,SKLPort01], 13, [1,3], 4>;
+defm : X86WriteRes<WriteDPPSY, [SKLPort5,SKLPort01], 13, [1,3], 4>;
+defm : X86WriteRes<WriteDPPSLd, [SKLPort5,SKLPort01,SKLPort06,SKLPort23], 19, [1,3,1,1], 6>;
+defm : X86WriteRes<WriteDPPSYLd, [SKLPort5,SKLPort01,SKLPort06,SKLPort23], 20, [1,3,1,1], 6>;
defm : SKLWriteResPair<WriteFSign, [SKLPort0], 1>; // Floating point fabs/fchs.
defm : SKLWriteResPair<WriteFRnd, [SKLPort01], 8, [2], 2, 6>; // Floating point rounding.
defm : SKLWriteResPair<WriteFRndY, [SKLPort01], 8, [2], 2, 7>;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 3da688c..a7dff0e 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -311,8 +311,10 @@ defm : SKXWriteResPair<WriteFMAX, [SKXPort01], 4, [1], 1, 6>;
defm : SKXWriteResPair<WriteFMAY, [SKXPort01], 4, [1], 1, 7>;
defm : SKXWriteResPair<WriteFMAZ, [SKXPort05], 4, [1], 1, 7>;
defm : SKXWriteResPair<WriteDPPD, [SKXPort5,SKXPort015], 9, [1,2], 3, 6>; // Floating point double dot product.
-defm : SKXWriteResPair<WriteDPPS, [SKXPort5,SKXPort015], 13, [1,3], 4, 6>;
-defm : SKXWriteResPair<WriteDPPSY,[SKXPort5,SKXPort015], 13, [1,3], 4, 7>;
+defm : X86WriteRes<WriteDPPS, [SKXPort5,SKXPort01], 13, [1,3], 4>;
+defm : X86WriteRes<WriteDPPSY, [SKXPort5,SKXPort01], 13, [1,3], 4>;
+defm : X86WriteRes<WriteDPPSLd, [SKXPort5,SKXPort01,SKXPort06,SKXPort23], 19, [1,3,1,1], 6>;
+defm : X86WriteRes<WriteDPPSYLd, [SKXPort5,SKXPort01,SKXPort06,SKXPort23], 20, [1,3,1,1], 6>;
defm : SKXWriteResPair<WriteFSign, [SKXPort0], 1>; // Floating point fabs/fchs.
defm : SKXWriteResPair<WriteFRnd, [SKXPort01], 8, [2], 2, 6>; // Floating point rounding.
defm : SKXWriteResPair<WriteFRndY, [SKXPort01], 8, [2], 2, 7>;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
index d90c8bd..2e87d52 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -52,7 +52,7 @@ def Znver3Model : SchedMachineModel {
int VecLoadLatency = 7;
// Latency of a simple store operation.
int StoreLatency = 1;
- // FIXME
+ // FIXME:
let HighLatency = 25; // FIXME: any better choice?
// AMD SOG 19h, 2.8 Optimizing Branching
// The branch misprediction penalty is in the range from 11 to 18 cycles,
@@ -193,11 +193,11 @@ def Zn3Int : ProcResGroup<[Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes,
-def Zn3FPP0 : ProcResource<1>;
-def Zn3FPP1 : ProcResource<1>;
-def Zn3FPP2 : ProcResource<1>;
-def Zn3FPP3 : ProcResource<1>;
-def Zn3FPP45 : ProcResource<2>;
+def Zn3FP0 : ProcResource<1>;
+def Zn3FP1 : ProcResource<1>;
+def Zn3FP2 : ProcResource<1>;
+def Zn3FP3 : ProcResource<1>;
+def Zn3FP45 : ProcResource<2>;
//
// Execution Units
@@ -205,63 +205,63 @@ def Zn3FPP45 : ProcResource<2>;
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
-defvar Zn3FPFMul0 = Zn3FPP0;
-defvar Zn3FPFMul1 = Zn3FPP1;
+defvar Zn3FPFMul0 = Zn3FP0;
+defvar Zn3FPFMul1 = Zn3FP1;
// (v)FADD*
-defvar Zn3FPFAdd0 = Zn3FPP2;
-defvar Zn3FPFAdd1 = Zn3FPP3;
+defvar Zn3FPFAdd0 = Zn3FP2;
+defvar Zn3FPFAdd1 = Zn3FP3;
// All convert operations except pack/unpack
-defvar Zn3FPFCvt0 = Zn3FPP2;
-defvar Zn3FPFCvt1 = Zn3FPP3;
+defvar Zn3FPFCvt0 = Zn3FP2;
+defvar Zn3FPFCvt1 = Zn3FP3;
// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
-defvar Zn3FPFDiv = Zn3FPP1;
+defvar Zn3FPFDiv = Zn3FP1;
// Moves and Logical operations on Floating Point Data Types
-defvar Zn3FPFMisc0 = Zn3FPP0;
-defvar Zn3FPFMisc1 = Zn3FPP1;
-defvar Zn3FPFMisc2 = Zn3FPP2;
-defvar Zn3FPFMisc3 = Zn3FPP3;
+defvar Zn3FPFMisc0 = Zn3FP0;
+defvar Zn3FPFMisc1 = Zn3FP1;
+defvar Zn3FPFMisc2 = Zn3FP2;
+defvar Zn3FPFMisc3 = Zn3FP3;
// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
-defvar Zn3FPVAdd0 = Zn3FPP0;
-defvar Zn3FPVAdd1 = Zn3FPP1;
-defvar Zn3FPVAdd2 = Zn3FPP2;
-defvar Zn3FPVAdd3 = Zn3FPP3;
+defvar Zn3FPVAdd0 = Zn3FP0;
+defvar Zn3FPVAdd1 = Zn3FP1;
+defvar Zn3FPVAdd2 = Zn3FP2;
+defvar Zn3FPVAdd3 = Zn3FP3;
// Integer Multiplies, SAD, Blendvb
-defvar Zn3FPVMul0 = Zn3FPP0;
-defvar Zn3FPVMul1 = Zn3FPP3;
+defvar Zn3FPVMul0 = Zn3FP0;
+defvar Zn3FPVMul1 = Zn3FP3;
// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
-defvar Zn3FPVShuf = Zn3FPP1;
-defvar Zn3FPVShufAux = Zn3FPP2;
+defvar Zn3FPVShuf = Zn3FP1;
+defvar Zn3FPVShufAux = Zn3FP2;
// Bit Shift Left/Right operations
-defvar Zn3FPVShift0 = Zn3FPP1;
-defvar Zn3FPVShift1 = Zn3FPP2;
+defvar Zn3FPVShift0 = Zn3FP1;
+defvar Zn3FPVShift1 = Zn3FP2;
// Moves and Logical operations on Packed Integer Data Types
-defvar Zn3FPVMisc0 = Zn3FPP0;
-defvar Zn3FPVMisc1 = Zn3FPP1;
-defvar Zn3FPVMisc2 = Zn3FPP2;
-defvar Zn3FPVMisc3 = Zn3FPP3;
+defvar Zn3FPVMisc0 = Zn3FP0;
+defvar Zn3FPVMisc1 = Zn3FP1;
+defvar Zn3FPVMisc2 = Zn3FP2;
+defvar Zn3FPVMisc3 = Zn3FP3;
// *AES*
-defvar Zn3FPAES0 = Zn3FPP0;
-defvar Zn3FPAES1 = Zn3FPP1;
+defvar Zn3FPAES0 = Zn3FP0;
+defvar Zn3FPAES1 = Zn3FP1;
// *CLM*
-defvar Zn3FPCLM0 = Zn3FPP0;
-defvar Zn3FPCLM1 = Zn3FPP1;
+defvar Zn3FPCLM0 = Zn3FP0;
+defvar Zn3FPCLM1 = Zn3FP1;
// Execution pipeline grouping
//===----------------------------------------------------------------------===//
@@ -269,7 +269,7 @@ defvar Zn3FPCLM1 = Zn3FPP1;
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
-def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>;
+def Zn3FPU0123 : ProcResGroup<[Zn3FP0, Zn3FP1, Zn3FP2, Zn3FP3]>;
// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;
@@ -293,12 +293,12 @@ def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
-defvar Zn3FPLd01 = Zn3FPP45;
+defvar Zn3FPLd01 = Zn3FP45;
// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
-let Super = Zn3FPP45 in
+let Super = Zn3FP45 in
def Zn3FPSt : ProcResource<1>;
// Integer Adds, Subtracts, and Compares
@@ -345,8 +345,8 @@ def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
// AMD SOG 19h, 2.11 Floating-Point Unit
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
-def Zn3FP : ProcResGroup<[Zn3FPP0, Zn3FPP2, /*Zn3FPP4,*/ // scheduler 0
- Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/ // scheduler 1
+def Zn3FP : ProcResGroup<[Zn3FP0, Zn3FP2, /*Zn3FP4,*/ // scheduler 0
+ Zn3FP1, Zn3FP3, Zn3FP45 /*Zn3FP5*/ // scheduler 1
]> {
let BufferSize = !mul(2, 32);
}
@@ -838,9 +838,9 @@ defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;
defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
// Floating point. This covers both scalar and vector operations.
-defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
-defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
-defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 276bc7f..86b4560 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -211,8 +211,9 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, bool JIT,
}
static CodeModel::Model
-getEffectiveX86CodeModel(std::optional<CodeModel::Model> CM, bool JIT,
- bool Is64Bit) {
+getEffectiveX86CodeModel(const Triple &TT, std::optional<CodeModel::Model> CM,
+ bool JIT) {
+ bool Is64Bit = TT.getArch() == Triple::x86_64;
if (CM) {
if (*CM == CodeModel::Tiny)
report_fatal_error("Target does not support the tiny CodeModel", false);
@@ -234,7 +235,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(
T, computeDataLayout(TT), TT, CPU, FS, Options,
getEffectiveRelocModel(TT, JIT, RM),
- getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64),
+ getEffectiveX86CodeModel(TT, CM, JIT),
OL),
TLOF(createTLOF(getTargetTriple())), IsJIT(JIT) {
// On PS4/PS5, the "return address" of a 'noreturn' call must still be within
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 2ec2946..cd61029 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2664,9 +2664,9 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
};
static const TypeConversionCostTblEntry AVXConversionTbl[] = {
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },