Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64.h | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64.td | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Combine.td | 5
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td | 24
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Processors.td | 10
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td | 2777
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td | 2705
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 6
-rw-r--r--  llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 150
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 22
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 78
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 238
-rw-r--r--  llvm/lib/Target/AMDGPU/MIMGInstructions.td | 185
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 25
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 111
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrInfo.td | 6
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrVFP.td | 175
-rw-r--r--  llvm/lib/Target/DirectX/DXIL.td | 10
-rw-r--r--  llvm/lib/Target/DirectX/DXILOpLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/DirectX/DXILShaderFlags.cpp | 2
-rw-r--r--  llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp | 255
-rw-r--r--  llvm/lib/Target/DirectX/DXILTranslateMetadata.h | 17
-rw-r--r--  llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 29
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 8
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrFuture.td | 20
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 1
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp | 23
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFeatures.td | 19
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 36
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 3
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 69
42 files changed, 6621 insertions, 459 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 8d0ff41..1396841 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -60,7 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
FunctionPass *createSMEABIPass();
FunctionPass *createSMEPeepholeOptPass();
-FunctionPass *createMachineSMEABIPass();
+FunctionPass *createMachineSMEABIPass(CodeGenOptLevel);
ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index a4529a5..0f457c2 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -133,6 +133,8 @@ include "AArch64SchedNeoverseN2.td"
include "AArch64SchedNeoverseN3.td"
include "AArch64SchedNeoverseV1.td"
include "AArch64SchedNeoverseV2.td"
+include "AArch64SchedNeoverseV3.td"
+include "AArch64SchedNeoverseV3AE.td"
include "AArch64SchedOryon.td"
include "AArch64Processors.td"
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index b3ec65c..2783147 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -366,6 +366,7 @@ def AArch64PostLegalizerCombiner
select_to_minmax, or_to_bsp, combine_concat_vector,
commute_constant_to_rhs, extract_vec_elt_combines,
push_freeze_to_prevent_poison_from_propagating,
- combine_mul_cmlt, combine_use_vector_truncate,
- extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> {
+ combine_mul_cmlt, combine_use_vector_truncate,
+ extmultomull, truncsat_combines, lshr_of_trunc_of_lshr,
+ funnel_shift_from_or_shift_constants_are_legal]> {
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d16b116..60aa61e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9028,11 +9028,12 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
CallingConv::ID CallerCC = CallerF.getCallingConv();
// SME Streaming functions are not eligible for TCO as they may require
- // the streaming mode or ZA to be restored after returning from the call.
+ // the streaming mode or ZA/ZT0 to be restored after returning from the call.
SMECallAttrs CallAttrs =
getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
CallAttrs.requiresPreservingAllZAState() ||
+ CallAttrs.requiresPreservingZT0() ||
CallAttrs.caller().hasStreamingBody())
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 457e540..ccc8eb8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -122,7 +122,7 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
NumBytes = Desc.getSize() ? Desc.getSize() : 4;
const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
- if (!MFI->shouldSignReturnAddress(MF))
+ if (!MFI->shouldSignReturnAddress(*MF))
return NumBytes;
const auto &STI = MF->getSubtarget<AArch64Subtarget>();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b9e299e..2871a20 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1805,14 +1805,22 @@ def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v8i16>;
def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v4i32>;
def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v2i64>;
-class EOR3_pattern<ValueType VecTy>
- : Pat<(xor (xor (VecTy V128:$Vn), (VecTy V128:$Vm)), (VecTy V128:$Va)),
- (EOR3 (VecTy V128:$Vn), (VecTy V128:$Vm), (VecTy V128:$Va))>;
-
-def : EOR3_pattern<v16i8>;
-def : EOR3_pattern<v8i16>;
-def : EOR3_pattern<v4i32>;
-def : EOR3_pattern<v2i64>;
+multiclass EOR3_pattern<ValueType Vec128Ty, ValueType Vec64Ty>{
+ def : Pat<(xor (xor (Vec128Ty V128:$Vn), (Vec128Ty V128:$Vm)), (Vec128Ty V128:$Va)),
+ (EOR3 (Vec128Ty V128:$Vn), (Vec128Ty V128:$Vm), (Vec128Ty V128:$Va))>;
+ def : Pat<(xor (xor (Vec64Ty V64:$Vn), (Vec64Ty V64:$Vm)), (Vec64Ty V64:$Va)),
+ (EXTRACT_SUBREG
+ (EOR3
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vn, dsub),
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vm, dsub),
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Va, dsub)),
+ dsub)>;
+}
+
+defm : EOR3_pattern<v16i8, v8i8>;
+defm : EOR3_pattern<v8i16, v4i16>;
+defm : EOR3_pattern<v4i32, v2i32>;
+defm : EOR3_pattern<v2i64, v1i64>;
class BCAX_pattern<ValueType VecTy>
: Pat<(xor (VecTy V128:$Vn), (and (VecTy V128:$Vm), (vnot (VecTy V128:$Va)))),
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 81f5d07..11387bb 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -1272,11 +1272,11 @@ def : ProcessorModel<"cortex-x2", NeoverseV2Model, ProcessorFeatures.X2,
[TuneX2]>;
def : ProcessorModel<"cortex-x3", NeoverseV2Model, ProcessorFeatures.X3,
[TuneX3]>;
-def : ProcessorModel<"cortex-x4", NeoverseV2Model, ProcessorFeatures.X4,
+def : ProcessorModel<"cortex-x4", NeoverseV3Model, ProcessorFeatures.X4,
[TuneX4]>;
-def : ProcessorModel<"cortex-x925", NeoverseV2Model, ProcessorFeatures.X925,
+def : ProcessorModel<"cortex-x925", NeoverseV3Model, ProcessorFeatures.X925,
[TuneX925]>;
-def : ProcessorModel<"gb10", NeoverseV2Model, ProcessorFeatures.GB10,
+def : ProcessorModel<"gb10", NeoverseV3Model, ProcessorFeatures.GB10,
[TuneX925]>;
def : ProcessorModel<"grace", NeoverseV2Model, ProcessorFeatures.Grace,
[TuneNeoverseV2]>;
@@ -1295,9 +1295,9 @@ def : ProcessorModel<"neoverse-v1", NeoverseV1Model,
ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>;
def : ProcessorModel<"neoverse-v2", NeoverseV2Model,
ProcessorFeatures.NeoverseV2, [TuneNeoverseV2]>;
-def : ProcessorModel<"neoverse-v3", NeoverseV2Model,
+def : ProcessorModel<"neoverse-v3", NeoverseV3Model,
ProcessorFeatures.NeoverseV3, [TuneNeoverseV3]>;
-def : ProcessorModel<"neoverse-v3ae", NeoverseV2Model,
+def : ProcessorModel<"neoverse-v3ae", NeoverseV3AEModel,
ProcessorFeatures.NeoverseV3AE, [TuneNeoverseV3AE]>;
def : ProcessorModel<"exynos-m3", ExynosM3Model, ProcessorFeatures.ExynosM3,
[TuneExynosM3]>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td
new file mode 100644
index 0000000..e23576a
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td
@@ -0,0 +1,2777 @@
+//=- AArch64SchedNeoverseV3.td - NeoverseV3 Scheduling Defs --*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for the Arm Neoverse V3 processors.
+// All information is taken from the V3 Software Optimization guide:
+//
+// https://developer.arm.com/documentation/109678/300/?lang=en
+//
+//===----------------------------------------------------------------------===//
+
+def NeoverseV3Model : SchedMachineModel {
+ let IssueWidth = 10; // Expect best value to be slightly higher than V2
+ let MicroOpBufferSize = 320; // Entries in micro-op re-order buffer. NOTE: Copied from Neoverse-V2
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 10; // Extra cycles for mispredicted branch. NOTE: Copied from N2.
+ let LoopMicroOpBufferSize = 16; // NOTE: Copied from Cortex-A57.
+ let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F,
+ [HasSVE2p1, HasSVEB16B16,
+ HasCPA, HasCSSC]);
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Neoverse V3.
+// Instructions are first fetched and then decoded into internal macro-ops
+// (MOPs). From there, the MOPs proceed through register renaming and dispatch
+// stages. A MOP can be split into two micro-ops further down the pipeline
+// after the decode stage. Once dispatched, micro-ops wait for their operands
+// and issue out-of-order to one of twenty-one issue pipelines. Each issue
+// pipeline can accept one micro-op per cycle.
+
+let SchedModel = NeoverseV3Model in {
+
+// Define the (21) issue ports.
+def V3UnitB : ProcResource<3>; // Branch 0/1/2
+def V3UnitS0 : ProcResource<1>; // Integer single-cycle 0
+def V3UnitS1 : ProcResource<1>; // Integer single-cycle 1
+def V3UnitS2 : ProcResource<1>; // Integer single-cycle 2
+def V3UnitS3 : ProcResource<1>; // Integer single-cycle 3
+def V3UnitS4 : ProcResource<1>; // Integer single-cycle 4
+def V3UnitS5 : ProcResource<1>; // Integer single-cycle 5
+def V3UnitM0 : ProcResource<1>; // Integer single/multicycle 0
+def V3UnitM1 : ProcResource<1>; // Integer single/multicycle 1
+def V3UnitV0 : ProcResource<1>; // FP/ASIMD 0
+def V3UnitV1 : ProcResource<1>; // FP/ASIMD 1
+def V3UnitV2 : ProcResource<1>; // FP/ASIMD 2
+def V3UnitV3 : ProcResource<1>; // FP/ASIMD 3
+def V3UnitLS0 : ProcResource<1>; // Load/Store 0
+def V3UnitL12 : ProcResource<2>; // Load 1/2
+def V3UnitST1 : ProcResource<1>; // Store 1
+def V3UnitD : ProcResource<2>; // Store data 0/1
+def V3UnitFlg : ProcResource<4>; // Flags
+
+def V3UnitS : ProcResGroup<[V3UnitS0, V3UnitS1, V3UnitS2, V3UnitS3, V3UnitS4, V3UnitS5]>; // Integer single-cycle 0/1/2/3/4/5
+def V3UnitI : ProcResGroup<[V3UnitS0, V3UnitS1, V3UnitS2, V3UnitS3, V3UnitS4, V3UnitS5, V3UnitM0, V3UnitM1]>; // Integer single-cycle 0/1/2/3/4/5 and single/multicycle 0/1
+def V3UnitM : ProcResGroup<[V3UnitM0, V3UnitM1]>; // Integer single/multicycle 0/1
+def V3UnitLSA : ProcResGroup<[V3UnitLS0, V3UnitL12, V3UnitST1]>; // Supergroup of L+SA
+def V3UnitL : ProcResGroup<[V3UnitLS0, V3UnitL12]>; // Load/Store 0 and Load 1/2
+def V3UnitSA : ProcResGroup<[V3UnitLS0, V3UnitST1]>; // Load/Store 0 and Store 1
+def V3UnitV : ProcResGroup<[V3UnitV0, V3UnitV1, V3UnitV2, V3UnitV3]>; // FP/ASIMD 0/1/2/3
+def V3UnitV01 : ProcResGroup<[V3UnitV0, V3UnitV1]>; // FP/ASIMD 0/1
+def V3UnitV02 : ProcResGroup<[V3UnitV0, V3UnitV2]>; // FP/ASIMD 0/2
+def V3UnitV13 : ProcResGroup<[V3UnitV1, V3UnitV3]>; // FP/ASIMD 1/3
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+// NOTE: Copied from N2.
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+def : WriteRes<WriteLDHi, []> { let Latency = 4; }
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Neoverse V3.
+
+//===----------------------------------------------------------------------===//
+
+// Define generic 0 micro-op types
+def V3Write_0c : SchedWriteRes<[]> { let Latency = 0; }
+
+// Define generic 1 micro-op types
+
+def V3Write_1c_1B : SchedWriteRes<[V3UnitB]> { let Latency = 1; }
+def V3Write_1c_1F_1Flg : SchedWriteRes<[V3UnitI, V3UnitFlg]> { let Latency = 1; }
+def V3Write_1c_1I : SchedWriteRes<[V3UnitI]> { let Latency = 1; }
+def V3Write_1c_1M : SchedWriteRes<[V3UnitM]> { let Latency = 1; }
+def V3Write_1c_1SA : SchedWriteRes<[V3UnitSA]> { let Latency = 1; }
+def V3Write_2c_1M : SchedWriteRes<[V3UnitM]> { let Latency = 2; }
+def V3Write_2c_1M_1Flg : SchedWriteRes<[V3UnitM, V3UnitFlg]> { let Latency = 2; }
+def V3Write_3c_1M : SchedWriteRes<[V3UnitM]> { let Latency = 3; }
+def V3Write_2c_1M0 : SchedWriteRes<[V3UnitM0]> { let Latency = 2; }
+def V3Write_3c_1M0 : SchedWriteRes<[V3UnitM0]> { let Latency = 3; }
+def V3Write_4c_1M0 : SchedWriteRes<[V3UnitM0]> { let Latency = 4; }
+def V3Write_12c_1M0 : SchedWriteRes<[V3UnitM0]> { let Latency = 12;
+ let ReleaseAtCycles = [12]; }
+def V3Write_20c_1M0 : SchedWriteRes<[V3UnitM0]> { let Latency = 20;
+ let ReleaseAtCycles = [20]; }
+def V3Write_4c_1L : SchedWriteRes<[V3UnitL]> { let Latency = 4; }
+def V3Write_6c_1L : SchedWriteRes<[V3UnitL]> { let Latency = 6; }
+def V3Write_2c_1V : SchedWriteRes<[V3UnitV]> { let Latency = 2; }
+def V3Write_2c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 2; }
+def V3Write_3c_1V : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Write_3c_1V01 : SchedWriteRes<[V3UnitV01]> { let Latency = 3;
+ let ReleaseAtCycles = [2]; }
+def V3Write_4c_1V : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Write_5c_1V : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Write_6c_1V : SchedWriteRes<[V3UnitV]> { let Latency = 6; }
+def V3Write_12c_1V : SchedWriteRes<[V3UnitV]> { let Latency = 12; }
+def V3Write_3c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 3; }
+def V3Write_3c_1V02 : SchedWriteRes<[V3UnitV02]> { let Latency = 3; }
+def V3Write_4c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 4; }
+def V3Write_4c_1V02 : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Write_9c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 9; }
+def V3Write_10c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 10; }
+def V3Write_8c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 8; }
+def V3Write_12c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 12;
+ let ReleaseAtCycles = [11]; }
+def V3Write_13c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 13; }
+def V3Write_15c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 15; }
+def V3Write_13c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 13; }
+def V3Write_16c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 16; }
+def V3Write_16c_1V02 : SchedWriteRes<[V3UnitV02]> { let Latency = 16;
+ let ReleaseAtCycles = [8]; }
+def V3Write_20c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 20;
+ let ReleaseAtCycles = [20]; }
+def V3Write_2c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 2; }
+def V3Write_2c_1V13 : SchedWriteRes<[V3UnitV13]> { let Latency = 2; }
+def V3Write_3c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 3; }
+def V3Write_3c_1V13 : SchedWriteRes<[V3UnitV13]> { let Latency = 3; }
+def V3Write_4c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 4; }
+def V3Write_6c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 6; }
+def V3Write_10c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 10; }
+def V3Write_6c_1SA : SchedWriteRes<[V3UnitSA]> { let Latency = 6; }
+
+//===----------------------------------------------------------------------===//
+// Define generic 2 micro-op types
+
+def V3Write_1c_1B_1S : SchedWriteRes<[V3UnitB, V3UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def V3Write_6c_1M0_1B : SchedWriteRes<[V3UnitM0, V3UnitB]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3Write_9c_1M0_1L : SchedWriteRes<[V3UnitM0, V3UnitL]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+
+def V3Write_3c_1I_1M : SchedWriteRes<[V3UnitI, V3UnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def V3Write_1c_2M : SchedWriteRes<[V3UnitM, V3UnitM]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def V3Write_3c_2M : SchedWriteRes<[V3UnitM, V3UnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def V3Write_4c_2M : SchedWriteRes<[V3UnitM, V3UnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def V3Write_5c_1L_1I : SchedWriteRes<[V3UnitL, V3UnitI]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def V3Write_6c_1I_1L : SchedWriteRes<[V3UnitI, V3UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3Write_7c_1I_1L : SchedWriteRes<[V3UnitI, V3UnitL]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def V3Write_1c_1SA_1D : SchedWriteRes<[V3UnitSA, V3UnitD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def V3Write_5c_1M0_1V : SchedWriteRes<[V3UnitM0, V3UnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def V3Write_2c_1SA_1V01 : SchedWriteRes<[V3UnitSA, V3UnitV01]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def V3Write_2c_2V01 : SchedWriteRes<[V3UnitV01, V3UnitV01]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def V3Write_4c_1SA_1V01 : SchedWriteRes<[V3UnitSA, V3UnitV01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def V3Write_5c_1V13_1V : SchedWriteRes<[V3UnitV13, V3UnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def V3Write_4c_2V0 : SchedWriteRes<[V3UnitV0, V3UnitV0]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def V3Write_4c_2V02 : SchedWriteRes<[V3UnitV02, V3UnitV02]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def V3Write_4c_2V : SchedWriteRes<[V3UnitV, V3UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def V3Write_6c_2V : SchedWriteRes<[V3UnitV, V3UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3Write_6c_2L : SchedWriteRes<[V3UnitL, V3UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3Write_8c_1L_1V : SchedWriteRes<[V3UnitL, V3UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def V3Write_4c_1SA_1V : SchedWriteRes<[V3UnitSA, V3UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def V3Write_3c_1M0_1M : SchedWriteRes<[V3UnitM0, V3UnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def V3Write_4c_1M0_1M : SchedWriteRes<[V3UnitM0, V3UnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def V3Write_1c_1M0_1M : SchedWriteRes<[V3UnitM0, V3UnitM]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def V3Write_2c_1M0_1M : SchedWriteRes<[V3UnitM0, V3UnitM]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def V3Write_6c_2V1 : SchedWriteRes<[V3UnitV1, V3UnitV1]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3Write_5c_2V0 : SchedWriteRes<[V3UnitV0, V3UnitV0]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def V3Write_5c_2V02 : SchedWriteRes<[V3UnitV02, V3UnitV02]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def V3Write_5c_1V1_1M0 : SchedWriteRes<[V3UnitV1, V3UnitM0]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def V3Write_6c_1V1_1M0 : SchedWriteRes<[V3UnitV1, V3UnitM0]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3Write_7c_1M0_1V02 : SchedWriteRes<[V3UnitM0, V3UnitV02]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def V3Write_2c_1V0_1M : SchedWriteRes<[V3UnitV0, V3UnitM]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def V3Write_3c_1V0_1M : SchedWriteRes<[V3UnitV0, V3UnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def V3Write_6c_1V_1V13 : SchedWriteRes<[V3UnitV, V3UnitV13]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3Write_6c_1L_1M : SchedWriteRes<[V3UnitL, V3UnitM]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3Write_6c_1L_1I : SchedWriteRes<[V3UnitL, V3UnitI]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3Write_6c_2V13 : SchedWriteRes<[V3UnitV13, V3UnitV13]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3Write_8c_1M0_1V01 : SchedWriteRes<[V3UnitM0, V3UnitV01]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 3 micro-op types
+
+def V3Write_1c_1SA_1D_1I : SchedWriteRes<[V3UnitSA, V3UnitD, V3UnitI]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+}
+
+def V3Write_2c_1SA_1V01_1I : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def V3Write_2c_1SA_2V01 : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitV01]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def V3Write_4c_1SA_2V01 : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitV01]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def V3Write_9c_1L_2V : SchedWriteRes<[V3UnitL, V3UnitV, V3UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+}
+
+def V3Write_4c_3V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def V3Write_7c_1M_1M0_1V : SchedWriteRes<[V3UnitM, V3UnitM0, V3UnitV]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+def V3Write_2c_1SA_1I_1V01 : SchedWriteRes<[V3UnitSA, V3UnitI, V3UnitV01]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def V3Write_6c_3L : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def V3Write_6c_3V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def V3Write_8c_1L_2V : SchedWriteRes<[V3UnitL, V3UnitV, V3UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 4 micro-op types
+
+def V3Write_2c_1SA_2V01_1I : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitV01,
+ V3UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+
+def V3Write_2c_2SA_2V01 : SchedWriteRes<[V3UnitSA, V3UnitSA,
+ V3UnitV01, V3UnitV01]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+
+def V3Write_4c_2SA_2V01 : SchedWriteRes<[V3UnitSA, V3UnitSA,
+ V3UnitV01, V3UnitV01]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def V3Write_5c_1I_3L : SchedWriteRes<[V3UnitI, V3UnitL, V3UnitL, V3UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+
+def V3Write_6c_4V0 : SchedWriteRes<[V3UnitV0, V3UnitV0, V3UnitV0, V3UnitV0]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def V3Write_8c_4V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def V3Write_6c_2V_2V13 : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV13,
+ V3UnitV13]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def V3Write_8c_2V_2V13 : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV13,
+ V3UnitV13]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def V3Write_6c_4V02 : SchedWriteRes<[V3UnitV02, V3UnitV02, V3UnitV02,
+ V3UnitV02]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def V3Write_6c_4V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def V3Write_8c_2L_2V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitV, V3UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def V3Write_9c_2L_2V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitV, V3UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+}
+
+def V3Write_2c_2SA_2V : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitV,
+ V3UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+
+def V3Write_4c_2SA_2V : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitV,
+ V3UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def V3Write_8c_2M0_2V02 : SchedWriteRes<[V3UnitM0, V3UnitM0, V3UnitV02,
+ V3UnitV02]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def V3Write_8c_2V_2V1 : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV1,
+ V3UnitV1]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def V3Write_4c_2M0_2M : SchedWriteRes<[V3UnitM0, V3UnitM0, V3UnitM,
+ V3UnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def V3Write_5c_2M0_2M : SchedWriteRes<[V3UnitM0, V3UnitM0, V3UnitM,
+ V3UnitM]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+
+def V3Write_6c_2I_2L : SchedWriteRes<[V3UnitI, V3UnitI, V3UnitL, V3UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def V3Write_7c_4L : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL, V3UnitL]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+}
+
+def V3Write_6c_1SA_3V01 : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitV01,
+ V3UnitV01]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 5 micro-op types
+
+def V3Write_2c_1SA_2V01_2I : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitV01,
+ V3UnitI, V3UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 5;
+}
+
+def V3Write_8c_2L_3V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitV, V3UnitV,
+ V3UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+
+def V3Write_9c_1L_4V : SchedWriteRes<[V3UnitL, V3UnitV, V3UnitV, V3UnitV,
+ V3UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
+
+def V3Write_10c_1L_4V : SchedWriteRes<[V3UnitL, V3UnitV, V3UnitV, V3UnitV,
+ V3UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 5;
+}
+
+def V3Write_6c_5V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV, V3UnitV,
+ V3UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 5;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 6 micro-op types
+
+def V3Write_8c_3L_3V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
+ V3UnitV, V3UnitV, V3UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+}
+
+def V3Write_9c_3L_3V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
+ V3UnitV, V3UnitV, V3UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+
+def V3Write_9c_2L_4V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitV,
+ V3UnitV, V3UnitV, V3UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+
+def V3Write_9c_2L_2V_2I : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitV,
+ V3UnitV, V3UnitI, V3UnitI]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+
+def V3Write_9c_2V_4V13 : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV13,
+ V3UnitV13, V3UnitV13, V3UnitV13]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+
+def V3Write_2c_3SA_3V : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitV, V3UnitV, V3UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 6;
+}
+
+def V3Write_4c_2SA_4V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+
+def V3Write_5c_2SA_4V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+}
+
+def V3Write_2c_3SA_3V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitV01, V3UnitV01, V3UnitV01]> {
+ let Latency = 2;
+ let NumMicroOps = 6;
+}
+
+def V3Write_4c_2SA_2I_2V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitI,
+ V3UnitI, V3UnitV01, V3UnitV01]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 7 micro-op types
+
+def V3Write_8c_3L_4V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
+ V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 7;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 8 micro-op types
+
+def V3Write_2c_4SA_4V : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitV, V3UnitV, V3UnitV,
+ V3UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 8;
+}
+
+def V3Write_2c_4SA_4V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01]> {
+ let Latency = 2;
+ let NumMicroOps = 8;
+}
+
+def V3Write_6c_2SA_6V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01]> {
+ let Latency = 6;
+ let NumMicroOps = 8;
+}
+
+def V3Write_8c_4L_4V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL, V3UnitL,
+ V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 9 micro-op types
+
+def V3Write_6c_3SA_6V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01]> {
+ let Latency = 6;
+ let NumMicroOps = 9;
+}
+
+def V3Write_10c_1L_8V : SchedWriteRes<[V3UnitL, V3UnitV, V3UnitV, V3UnitV,
+ V3UnitV, V3UnitV, V3UnitV, V3UnitV,
+ V3UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 9;
+}
+
+def V3Write_10c_3V_3L_3I : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV,
+ V3UnitL, V3UnitL, V3UnitL,
+ V3UnitI, V3UnitI, V3UnitI]> {
+ let Latency = 10;
+ let NumMicroOps = 9;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 10 micro-op types
+
+def V3Write_9c_6L_4V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL, V3UnitL,
+ V3UnitL, V3UnitL, V3UnitV, V3UnitV,
+ V3UnitV, V3UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 10;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 12 micro-op types
+
+def V3Write_5c_4SA_8V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01]> {
+ let Latency = 5;
+ let NumMicroOps = 12;
+}
+
+def V3Write_9c_4L_8V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
+ V3UnitL, V3UnitV, V3UnitV,
+ V3UnitV, V3UnitV, V3UnitV,
+ V3UnitV, V3UnitV, V3UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 12;
+}
+
+def V3Write_10c_4L_8V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
+ V3UnitL, V3UnitV, V3UnitV,
+ V3UnitV, V3UnitV, V3UnitV,
+ V3UnitV, V3UnitV, V3UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 12;
+}
+
+def V3Write_4c_6SA_6V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01]> {
+ let Latency = 4;
+ let NumMicroOps = 12;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 16 micro-op types
+
+def V3Write_7c_4SA_12V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01]> {
+ let Latency = 7;
+ let NumMicroOps = 16;
+}
+
+def V3Write_10c_4L_8V_4I : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
+ V3UnitL, V3UnitV, V3UnitV,
+ V3UnitV, V3UnitV, V3UnitV,
+ V3UnitV, V3UnitV, V3UnitV,
+ V3UnitI, V3UnitI, V3UnitI,
+ V3UnitI]> {
+ let Latency = 10;
+ let NumMicroOps = 16;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 18 micro-op types
+
+def V3Write_7c_9SA_9V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01]> {
+ let Latency = 7;
+ let NumMicroOps = 18;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 27 micro-op types
+
+def V3Write_7c_9SA_9I_9V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitI, V3UnitI, V3UnitI,
+ V3UnitI, V3UnitI, V3UnitI,
+ V3UnitI, V3UnitI, V3UnitI,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01,
+ V3UnitV01]> {
+ let Latency = 7;
+ let NumMicroOps = 27;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 36 micro-op types
+
+def V3Write_11c_18SA_18V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA, V3UnitSA,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01,
+ V3UnitV01]> {
+ let Latency = 11;
+ let NumMicroOps = 36;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 54 micro-op types
+
+def V3Write_11c_18SA_18I_18V01 : SchedWriteRes<[V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA,
+ V3UnitSA, V3UnitSA,
+ V3UnitI, V3UnitI, V3UnitI,
+ V3UnitI, V3UnitI, V3UnitI,
+ V3UnitI, V3UnitI, V3UnitI,
+ V3UnitI, V3UnitI, V3UnitI,
+ V3UnitI, V3UnitI, V3UnitI,
+ V3UnitI, V3UnitI, V3UnitI,
+ V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01,
+ V3UnitV01, V3UnitV01]> {
+ let Latency = 11;
+ let NumMicroOps = 54;
+}
+
+//===----------------------------------------------------------------------===//
+// Define predicate-controlled types
+
+def V3Write_ArithI : SchedWriteVariant<[
+ SchedVar<IsCheapLSL, [V3Write_1c_1I]>,
+ SchedVar<NoSchedPred, [V3Write_2c_1M]>]>;
+
+def V3Write_ArithF : SchedWriteVariant<[
+ SchedVar<IsCheapLSL, [V3Write_1c_1F_1Flg]>,
+ SchedVar<NoSchedPred, [V3Write_2c_1M_1Flg]>]>;
+
+def V3Write_Logical : SchedWriteVariant<[
+ SchedVar<NeoverseNoLSL, [V3Write_1c_1F_1Flg]>,
+ SchedVar<NoSchedPred, [V3Write_2c_1M_1Flg]>]>;
+
+def V3Write_Extr : SchedWriteVariant<[
+ SchedVar<IsRORImmIdiomPred, [V3Write_1c_1I]>,
+ SchedVar<NoSchedPred, [V3Write_3c_1I_1M]>]>;
+
+def V3Write_LdrHQ : SchedWriteVariant<[
+ SchedVar<NeoverseHQForm, [V3Write_7c_1I_1L]>,
+ SchedVar<NoSchedPred, [V3Write_6c_1L]>]>;
+
+def V3Write_StrHQ : SchedWriteVariant<[
+ SchedVar<NeoverseHQForm, [V3Write_2c_1SA_1V01_1I]>,
+ SchedVar<NoSchedPred, [V3Write_2c_1SA_1V01]>]>;
+
+def V3Write_0or1c_1I : SchedWriteVariant<[
+ SchedVar<NeoverseZeroMove, [V3Write_0c]>,
+ SchedVar<NoSchedPred, [V3Write_1c_1I]>]>;
+
+def V3Write_0or2c_1V : SchedWriteVariant<[
+ SchedVar<NeoverseZeroMove, [V3Write_0c]>,
+ SchedVar<NoSchedPred, [V3Write_2c_1V]>]>;
+
+def V3Write_0or3c_1M0 : SchedWriteVariant<[
+ SchedVar<NeoverseZeroMove, [V3Write_0c]>,
+ SchedVar<NoSchedPred, [V3Write_3c_1M0]>]>;
+
+def V3Write_2or3c_1M : SchedWriteVariant<[
+ SchedVar<NeoversePdIsPg, [V3Write_3c_1M]>,
+ SchedVar<NoSchedPred, [V3Write_2c_1M]>]>;
+
+def V3Write_1or2c_1M : SchedWriteVariant<[
+ SchedVar<NeoversePdIsPg, [V3Write_2c_1M]>,
+ SchedVar<NoSchedPred, [V3Write_1c_1M]>]>;
+
+def V3Write_3or4c_1M0_1M : SchedWriteVariant<[
+ SchedVar<NeoversePdIsPg, [V3Write_4c_1M0_1M]>,
+ SchedVar<NoSchedPred, [V3Write_3c_1M0_1M]>]>;
+
+def V3Write_2or3c_1V0 : SchedWriteVariant<[
+ SchedVar<NeoversePdIsPg, [V3Write_3c_1V0]>,
+ SchedVar<NoSchedPred, [V3Write_2c_1V0]>]>;
+
+def V3Write_2or3c_1V0_1M : SchedWriteVariant<[
+ SchedVar<NeoversePdIsPg, [V3Write_3c_1V0_1M]>,
+ SchedVar<NoSchedPred, [V3Write_2c_1V0_1M]>]>;
+
+def V3Write_IncDec : SchedWriteVariant<[
+ SchedVar<NeoverseCheapIncDec, [V3Write_1c_1I]>,
+ SchedVar<NoSchedPred, [V3Write_2c_1M]>]>;
+
+//===----------------------------------------------------------------------===//
+// Define forwarded types
+
+// NOTE: SOG, p. 16, n. 2: Accumulator forwarding is not supported for
+// consumers of 64 bit multiply high operations?
+def V3Wr_IM : SchedWriteRes<[V3UnitM]> { let Latency = 2; }
+
+def V3Wr_FMA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_FMA : SchedReadAdvance<2, [WriteFMul, V3Wr_FMA]>;
+
+def V3Wr_VA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VA : SchedReadAdvance<3, [V3Wr_VA]>;
+
+def V3Wr_VDOT : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Rd_VDOT : SchedReadAdvance<2, [V3Wr_VDOT]>;
+
+def V3Wr_VMMA : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Rd_VMMA : SchedReadAdvance<2, [V3Wr_VMMA]>;
+
+def V3Wr_VMA : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_VMA : SchedReadAdvance<3, [V3Wr_VMA]>;
+
+def V3Wr_VMAH : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 4; }
+def V3Rd_VMAH : SchedReadAdvance<2, [V3Wr_VMAH]>;
+
+def V3Wr_VMAL : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_VMAL : SchedReadAdvance<3, [V3Wr_VMAL]>;
+
+def V3Wr_VPA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VPA : SchedReadAdvance<3, [V3Wr_VPA]>;
+
+def V3Wr_VSA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VSA : SchedReadAdvance<3, [V3Wr_VSA]>;
+
+def V3Wr_VFCMA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VFCMA : SchedReadAdvance<2, [V3Wr_VFCMA]>;
+
+def V3Wr_VFM : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Wr_VFMA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VFMA : SchedReadAdvance<2, [V3Wr_VFM, V3Wr_VFMA]>;
+
+def V3Wr_VFMAL : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VFMAL : SchedReadAdvance<2, [V3Wr_VFMAL]>;
+
+def V3Wr_VBFDOT : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_VBFDOT : SchedReadAdvance<2, [V3Wr_VBFDOT]>;
+def V3Wr_VBFMMA : SchedWriteRes<[V3UnitV]> { let Latency = 6; }
+def V3Rd_VBFMMA : SchedReadAdvance<2, [V3Wr_VBFMMA]>;
+def V3Wr_VBFMAL : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_VBFMAL : SchedReadAdvance<3, [V3Wr_VBFMAL]>;
+
+def V3Wr_CRC : SchedWriteRes<[V3UnitM0]> { let Latency = 2; }
+def V3Rd_CRC : SchedReadAdvance<1, [V3Wr_CRC]>;
+
+def V3Wr_ZA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_ZA : SchedReadAdvance<3, [V3Wr_ZA]>;
+def V3Wr_ZPA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_ZPA : SchedReadAdvance<3, [V3Wr_ZPA]>;
+def V3Wr_ZSA : SchedWriteRes<[V3UnitV13]> { let Latency = 4; }
+def V3Rd_ZSA : SchedReadAdvance<3, [V3Wr_ZSA]>;
+
+def V3Wr_ZDOTB : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Rd_ZDOTB : SchedReadAdvance<2, [V3Wr_ZDOTB]>;
+def V3Wr_ZDOTH : SchedWriteRes<[V3UnitV02]> { let Latency = 3; }
+def V3Rd_ZDOTH : SchedReadAdvance<2, [V3Wr_ZDOTH]>;
+
+// NOTE: SOG p. 43: Complex multiply-add B, H, S element size: How to reduce
+// throughput to 1 in case of forwarding?
+def V3Wr_ZCMABHS : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_ZCMABHS : SchedReadAdvance<3, [V3Wr_ZCMABHS]>;
+def V3Wr_ZCMAD : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 5; }
+def V3Rd_ZCMAD : SchedReadAdvance<2, [V3Wr_ZCMAD]>;
+
+def V3Wr_ZMMA : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Rd_ZMMA : SchedReadAdvance<2, [V3Wr_ZMMA]>;
+
+def V3Wr_ZMABHS : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_ZMABHS : SchedReadAdvance<3, [V3Wr_ZMABHS]>;
+def V3Wr_ZMAD : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 5; }
+def V3Rd_ZMAD : SchedReadAdvance<2, [V3Wr_ZMAD]>;
+
+def V3Wr_ZMAL : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_ZMAL : SchedReadAdvance<3, [V3Wr_ZMAL]>;
+
+def V3Wr_ZMASQL : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Wr_ZMASQBHS : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Wr_ZMASQD : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 5; }
+def V3Rd_ZMASQ : SchedReadAdvance<2, [V3Wr_ZMASQL, V3Wr_ZMASQBHS,
+ V3Wr_ZMASQD]>;
+
+def V3Wr_ZFCMA : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_ZFCMA : SchedReadAdvance<3, [V3Wr_ZFCMA]>;
+
+def V3Wr_ZFMA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_ZFMA : SchedReadAdvance<2, [V3Wr_ZFMA]>;
+
+def V3Wr_ZFMAL : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_ZFMAL : SchedReadAdvance<2, [V3Wr_ZFMAL]>;
+
+def V3Wr_ZBFDOT : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_ZBFDOT : SchedReadAdvance<2, [V3Wr_ZBFDOT]>;
+def V3Wr_ZBFMMA : SchedWriteRes<[V3UnitV]> { let Latency = 6; }
+def V3Rd_ZBFMMA : SchedReadAdvance<2, [V3Wr_ZBFMMA]>;
+def V3Wr_ZBFMAL : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_ZBFMAL : SchedReadAdvance<3, [V3Wr_ZBFMAL]>;
+
+//===----------------------------------------------------------------------===//
+// Define types with long resource cycles (rc)
+
+def V3Write_6c_1V1_5rc : SchedWriteRes<[V3UnitV1]> { let Latency = 6; let ReleaseAtCycles = [ 5]; }
+def V3Write_9c_1V1_2rc : SchedWriteRes<[V3UnitV1]> { let Latency = 9; let ReleaseAtCycles = [ 2]; }
+def V3Write_9c_1V1_4rc : SchedWriteRes<[V3UnitV1]> { let Latency = 9; let ReleaseAtCycles = [ 4]; }
+def V3Write_10c_1V1_9rc : SchedWriteRes<[V3UnitV1]> { let Latency = 10; let ReleaseAtCycles = [ 9]; }
+def V3Write_11c_1V1_4rc : SchedWriteRes<[V3UnitV1]> { let Latency = 11; let ReleaseAtCycles = [ 4]; }
+def V3Write_13c_1V1_8rc : SchedWriteRes<[V3UnitV1]> { let Latency = 13; let ReleaseAtCycles = [8]; }
+def V3Write_14c_1V1_2rc : SchedWriteRes<[V3UnitV1]> { let Latency = 14; let ReleaseAtCycles = [2]; }
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// §3.3 Branch instructions
+// -----------------------------------------------------------------------------
+
+// Branch, immed
+// Compare and branch
+def : SchedAlias<WriteBr, V3Write_1c_1B>;
+
+// Branch, register
+def : SchedAlias<WriteBrReg, V3Write_1c_1B>;
+
+// Branch and link, immed
+// Branch and link, register
+def : InstRW<[V3Write_1c_1B_1S], (instrs BL, BLR)>;
+
+// §3.4 Arithmetic and Logical Instructions
+// -----------------------------------------------------------------------------
+
+// ALU, basic
+def : SchedAlias<WriteI, V3Write_1c_1I>;
+
+// ALU, basic, flagset
+def : InstRW<[V3Write_1c_1F_1Flg],
+ (instregex "^(ADD|SUB)S[WX]r[ir]$",
+ "^(ADC|SBC)S[WX]r$",
+ "^ANDS[WX]ri$",
+ "^(AND|BIC)S[WX]rr$")>;
+def : InstRW<[V3Write_0or1c_1I], (instregex "^MOVZ[WX]i$")>;
+
+// ALU, extend and shift
+def : SchedAlias<WriteIEReg, V3Write_2c_1M>;
+
+// Arithmetic, LSL shift, shift <= 4
+// Arithmetic, flagset, LSL shift, shift <= 4
+// Arithmetic, LSR/ASR/ROR shift or LSL shift > 4
+def : SchedAlias<WriteISReg, V3Write_ArithI>;
+def : InstRW<[V3Write_ArithF],
+ (instregex "^(ADD|SUB)S[WX]rs$")>;
+
+// Arithmetic, immediate to logical address tag
+def : InstRW<[V3Write_2c_1M], (instrs ADDG, SUBG)>;
+
+// Conditional compare
+def : InstRW<[V3Write_1c_1F_1Flg], (instregex "^CCM[NP][WX][ir]")>;
+
+// Convert floating-point condition flags
+// Flag manipulation instructions
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+
+// Insert Random Tags
+def : InstRW<[V3Write_2c_1M], (instrs IRG, IRGstack)>;
+
+// Insert Tag Mask
+// Subtract Pointer
+def : InstRW<[V3Write_1c_1I], (instrs GMI, SUBP)>;
+
+// Subtract Pointer, flagset
+def : InstRW<[V3Write_1c_1F_1Flg], (instrs SUBPS)>;
+
+// Logical, shift, no flagset
+def : InstRW<[V3Write_1c_1I], (instregex "^(AND|BIC|EON|EOR|ORN)[WX]rs$")>;
+def : InstRW<[V3Write_0or1c_1I], (instregex "^ORR[WX]rs$")>;
+
+// Logical, shift, flagset
+def : InstRW<[V3Write_Logical], (instregex "^(AND|BIC)S[WX]rs$")>;
+
+// Move and shift instructions
+// -----------------------------------------------------------------------------
+
+def : SchedAlias<WriteImm, V3Write_1c_1I>;
+
+// §3.5 Divide and multiply instructions
+// -----------------------------------------------------------------------------
+
+// SDIV, UDIV
+def : SchedAlias<WriteID32, V3Write_12c_1M0>;
+def : SchedAlias<WriteID64, V3Write_20c_1M0>;
+
+def : SchedAlias<WriteIM32, V3Write_2c_1M>;
+def : SchedAlias<WriteIM64, V3Write_2c_1M>;
+
+// Multiply
+// Multiply accumulate, W-form
+// Multiply accumulate, X-form
+def : InstRW<[V3Wr_IM], (instregex "^M(ADD|SUB)[WX]rrr$")>;
+
+// Multiply accumulate long
+// Multiply long
+def : InstRW<[V3Wr_IM], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+
+// Multiply high
+def : InstRW<[V3Write_3c_1M], (instrs SMULHrr, UMULHrr)>;
+
+// §3.6 Pointer Authentication Instructions (v8.3 PAC)
+// -----------------------------------------------------------------------------
+
+// Authenticate data address
+// Authenticate instruction address
+// Compute pointer authentication code for data address
+// Compute pointer authentication code, using generic key
+// Compute pointer authentication code for instruction address
+def : InstRW<[V3Write_4c_1M0], (instregex "^AUT", "^PAC")>;
+
+// Branch and link, register, with pointer authentication
+// Branch, register, with pointer authentication
+// Branch, return, with pointer authentication
+def : InstRW<[V3Write_6c_1M0_1B], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
+ BRAAZ, BRAB, BRABZ, RETAA, RETAB,
+ ERETAA, ERETAB)>;
+
+
+// Load register, with pointer authentication
+def : InstRW<[V3Write_9c_1M0_1L], (instregex "^LDRA[AB](indexed|writeback)")>;
+
+// Strip pointer authentication code
+def : InstRW<[V3Write_2c_1M0], (instrs XPACD, XPACI, XPACLRI)>;
+
+// §3.7 Miscellaneous data-processing instructions
+// -----------------------------------------------------------------------------
+
+// Address generation
+def : InstRW<[V3Write_1c_1I], (instrs ADR, ADRP)>;
+
+// Bitfield extract, one reg
+// Bitfield extract, two regs
+def : SchedAlias<WriteExtr, V3Write_Extr>;
+def : InstRW<[V3Write_Extr], (instrs EXTRWrri, EXTRXrri)>;
+
+// Bitfield move, basic
+def : SchedAlias<WriteIS, V3Write_1c_1I>;
+
+// Bitfield move, insert
+def : InstRW<[V3Write_2c_1M], (instregex "^BFM[WX]ri$")>;
+
+// §3.8 Load instructions
+// -----------------------------------------------------------------------------
+
+// NOTE: SOG p. 19: Throughput of LDN?P X-form should be 2, but reported as 3.
+
+def : SchedAlias<WriteLD, V3Write_4c_1L>;
+def : SchedAlias<WriteLDIdx, V3Write_4c_1L>;
+
+// Load register, literal
+def : InstRW<[V3Write_5c_1L_1I], (instrs LDRWl, LDRXl, LDRSWl, PRFMl)>;
+
+// Load pair, signed immed offset, signed words
+def : InstRW<[V3Write_5c_1I_3L, WriteLDHi], (instrs LDPSWi)>;
+
+// Load pair, immed post-index or immed pre-index, signed words
+def : InstRW<[WriteAdr, V3Write_5c_1I_3L, WriteLDHi],
+ (instregex "^LDPSW(post|pre)$")>;
+
+// §3.9 Store instructions
+// -----------------------------------------------------------------------------
+
+// NOTE: SOG, p. 20: Unsure if STRH uses pipeline I.
+
+def : SchedAlias<WriteST, V3Write_1c_1SA_1D>;
+def : SchedAlias<WriteSTIdx, V3Write_1c_1SA_1D>;
+def : SchedAlias<WriteSTP, V3Write_1c_1SA_1D>;
+def : SchedAlias<WriteAdr, V3Write_1c_1I>;
+
+// §3.10 Tag load instructions
+// -----------------------------------------------------------------------------
+
+// Load allocation tag
+// Load multiple allocation tags
+def : InstRW<[V3Write_4c_1L], (instrs LDG, LDGM)>;
+
+// §3.11 Tag store instructions
+// -----------------------------------------------------------------------------
+
+// Store allocation tags to one or two granules, post-index
+// Store allocation tags to one or two granules, pre-index
+// Store allocation tag to one or two granules, zeroing, post-index
+// Store Allocation Tag to one or two granules, zeroing, pre-index
+// Store allocation tag and reg pair to memory, post-Index
+// Store allocation tag and reg pair to memory, pre-Index
+def : InstRW<[V3Write_1c_1SA_1D_1I], (instrs STGPreIndex, STGPostIndex,
+ ST2GPreIndex, ST2GPostIndex,
+ STZGPreIndex, STZGPostIndex,
+ STZ2GPreIndex, STZ2GPostIndex,
+ STGPpre, STGPpost)>;
+
+// Store allocation tags to one or two granules, signed offset
+// Store allocation tag to two granules, zeroing, signed offset
+// Store allocation tag and reg pair to memory, signed offset
+// Store multiple allocation tags
+def : InstRW<[V3Write_1c_1SA_1D], (instrs STGi, ST2Gi, STZGi,
+ STZ2Gi, STGPi, STGM, STZGM)>;
+
+// §3.12 FP data processing instructions
+// -----------------------------------------------------------------------------
+
+// FP absolute value
+// FP arithmetic
+// FP min/max
+// FP negate
+// FP select
+def : SchedAlias<WriteF, V3Write_2c_1V>;
+
+// FP compare
+def : SchedAlias<WriteFCmp, V3Write_2c_1V0>;
+
+// FP divide, square root
+def : SchedAlias<WriteFDiv, V3Write_6c_1V1>;
+
+// FP divide, H-form
+def : InstRW<[V3Write_6c_1V1], (instrs FDIVHrr)>;
+// FP divide, S-form
+def : InstRW<[V3Write_8c_1V1], (instrs FDIVSrr)>;
+// FP divide, D-form
+def : InstRW<[V3Write_13c_1V1], (instrs FDIVDrr)>;
+
+// FP square root, H-form
+def : InstRW<[V3Write_6c_1V1], (instrs FSQRTHr)>;
+// FP square root, S-form
+def : InstRW<[V3Write_8c_1V1], (instrs FSQRTSr)>;
+// FP square root, D-form
+def : InstRW<[V3Write_13c_1V1], (instrs FSQRTDr)>;
+
+// FP multiply
+def : WriteRes<WriteFMul, [V3UnitV]> { let Latency = 3; }
+
+// FP multiply accumulate
+def : InstRW<[V3Wr_FMA, ReadDefault, ReadDefault, V3Rd_FMA],
+ (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+
+// FP round to integral
+def : InstRW<[V3Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$",
+ "^FRINT(32|64)[XZ][SD]r$")>;
+
+// §3.13 FP miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// FP convert, from gen to vec reg
+def : InstRW<[V3Write_3c_1M0], (instregex "^[SU]CVTF[SU][WX][HSD]ri$")>;
+
+// FP convert, from vec to gen reg
+def : InstRW<[V3Write_3c_1V01],
+ (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]ri?$")>;
+
+// FP convert, Javascript from vec to gen reg
+def : SchedAlias<WriteFCvt, V3Write_3c_1V0>;
+
+// FP convert, from vec to vec reg
+def : InstRW<[V3Write_3c_1V02], (instrs FCVTSHr, FCVTDHr, FCVTHSr, FCVTDSr,
+ FCVTHDr, FCVTSDr, FCVTXNv1i64)>;
+
+// FP move, immed
+// FP move, register
+def : SchedAlias<WriteFImm, V3Write_2c_1V>;
+
+// FP transfer, from gen to low half of vec reg
+def : InstRW<[V3Write_0or3c_1M0],
+ (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr)>;
+
+// FP transfer, from gen to high half of vec reg
+def : InstRW<[V3Write_5c_1M0_1V], (instrs FMOVXDHighr)>;
+
+// FP transfer, from vec to gen reg
+def : SchedAlias<WriteFCopy, V3Write_2c_2V01>;
+
+// §3.14 FP load instructions
+// -----------------------------------------------------------------------------
+
+// Load vector reg, literal, S/D/Q forms
+def : InstRW<[V3Write_7c_1I_1L], (instregex "^LDR[SDQ]l$")>;
+
+// Load vector reg, unscaled immed
+def : InstRW<[V3Write_6c_1L], (instregex "^LDUR[BHSDQ]i$")>;
+
+// Load vector reg, immed post-index
+// Load vector reg, immed pre-index
+def : InstRW<[WriteAdr, V3Write_6c_1I_1L],
+ (instregex "^LDR[BHSDQ](pre|post)$")>;
+
+// Load vector reg, unsigned immed
+def : InstRW<[V3Write_6c_1L], (instregex "^LDR[BHSDQ]ui$")>;
+
+// Load vector reg, register offset, basic
+// Load vector reg, register offset, scale, S/D-form
+// Load vector reg, register offset, scale, H/Q-form
+// Load vector reg, register offset, extend
+// Load vector reg, register offset, extend, scale, S/D-form
+// Load vector reg, register offset, extend, scale, H/Q-form
+def : InstRW<[V3Write_LdrHQ, ReadAdrBase], (instregex "^LDR[BHSDQ]ro[WX]$")>;
+
+// Load vector pair, immed offset, S/D-form
+def : InstRW<[V3Write_6c_1L, WriteLDHi], (instregex "^LDN?P[SD]i$")>;
+
+// Load vector pair, immed offset, Q-form
+def : InstRW<[V3Write_6c_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>;
+
+// Load vector pair, immed post-index, S/D-form
+// Load vector pair, immed pre-index, S/D-form
+def : InstRW<[WriteAdr, V3Write_6c_1I_1L, WriteLDHi],
+ (instregex "^LDP[SD](pre|post)$")>;
+
+// Load vector pair, immed post-index, Q-form
+// Load vector pair, immed pre-index, Q-form
+def : InstRW<[WriteAdr, V3Write_6c_2I_2L, WriteLDHi], (instrs LDPQpost,
+ LDPQpre)>;
+
+// §3.15 FP store instructions
+// -----------------------------------------------------------------------------
+
+// Store vector reg, unscaled immed, B/H/S/D-form
+// Store vector reg, unscaled immed, Q-form
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STUR[BHSDQ]i$")>;
+
+// Store vector reg, immed post-index, B/H/S/D-form
+// Store vector reg, immed post-index, Q-form
+// Store vector reg, immed pre-index, B/H/S/D-form
+// Store vector reg, immed pre-index, Q-form
+def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01_1I],
+ (instregex "^STR[BHSDQ](pre|post)$")>;
+
+// Store vector reg, unsigned immed, B/H/S/D-form
+// Store vector reg, unsigned immed, Q-form
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STR[BHSDQ]ui$")>;
+
+// Store vector reg, register offset, basic, B/H/S/D-form
+// Store vector reg, register offset, basic, Q-form
+// Store vector reg, register offset, scale, H-form
+// Store vector reg, register offset, scale, S/D-form
+// Store vector reg, register offset, scale, Q-form
+// Store vector reg, register offset, extend, B/H/S/D-form
+// Store vector reg, register offset, extend, Q-form
+// Store vector reg, register offset, extend, scale, H-form
+// Store vector reg, register offset, extend, scale, S/D-form
+// Store vector reg, register offset, extend, scale, Q-form
+def : InstRW<[V3Write_StrHQ, ReadAdrBase],
+ (instregex "^STR[BHSDQ]ro[WX]$")>;
+
+// Store vector pair, immed offset, S-form
+// Store vector pair, immed offset, D-form
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STN?P[SD]i$")>;
+
+// Store vector pair, immed offset, Q-form
+def : InstRW<[V3Write_2c_1SA_2V01], (instrs STPQi, STNPQi)>;
+
+// Store vector pair, immed post-index, S-form
+// Store vector pair, immed post-index, D-form
+// Store vector pair, immed pre-index, S-form
+// Store vector pair, immed pre-index, D-form
+def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01_1I],
+ (instregex "^STP[SD](pre|post)$")>;
+
+// Store vector pair, immed post-index, Q-form
+def : InstRW<[V3Write_2c_1SA_2V01_1I], (instrs STPQpost)>;
+
+// Store vector pair, immed pre-index, Q-form
+def : InstRW<[V3Write_2c_1SA_2V01_2I], (instrs STPQpre)>;
+
+// §3.16 ASIMD integer instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD absolute diff
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD arith, pair-wise
+// ASIMD compare
+// ASIMD logical
+// ASIMD max/min, basic and pair-wise
+def : SchedAlias<WriteVd, V3Write_2c_1V>;
+def : SchedAlias<WriteVq, V3Write_2c_1V>;
+
+// ASIMD absolute diff accum
+// ASIMD absolute diff accum long
+def : InstRW<[V3Wr_VA, V3Rd_VA], (instregex "^[SU]ABAL?v")>;
+
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[V3Write_3c_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
+
+// ASIMD arith, reduce, 8B/8H
+def : InstRW<[V3Write_5c_1V13_1V],
+ (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>;
+
+// ASIMD arith, reduce, 16B
+def : InstRW<[V3Write_6c_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>;
+
+// ASIMD dot product
+// ASIMD dot product using signed and unsigned integers
+def : InstRW<[V3Wr_VDOT, V3Rd_VDOT],
+ (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
+
+// ASIMD matrix multiply-accumulate
+def : InstRW<[V3Wr_VMMA, V3Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
+
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[V3Write_3c_1V13], (instregex "^[SU](MAX|MIN)Vv4i16v$",
+ "^[SU](MAX|MIN)Vv4i32v$")>;
+
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[V3Write_5c_1V13_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
+ "^[SU](MAX|MIN)Vv8i16v$")>;
+
+// ASIMD max/min, reduce, 16B
+def : InstRW<[V3Write_6c_2V13], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
+
+// ASIMD multiply
+def : InstRW<[V3Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>;
+
+// ASIMD multiply accumulate
+def : InstRW<[V3Wr_VMA, V3Rd_VMA], (instregex "^MLAv", "^MLSv")>;
+
+// ASIMD multiply accumulate high
+def : InstRW<[V3Wr_VMAH, V3Rd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
+
+// ASIMD multiply accumulate long
+def : InstRW<[V3Wr_VMAL, V3Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
+
+// ASIMD multiply accumulate saturating long
+def : InstRW<[V3Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>;
+
+// ASIMD multiply/multiply long (8x8) polynomial, D-form
+// ASIMD multiply/multiply long (8x8) polynomial, Q-form
+def : InstRW<[V3Write_3c_1V], (instregex "^PMULL?(v8i8|v16i8)$")>;
+
+// ASIMD multiply long
+def : InstRW<[V3Write_3c_1V02], (instregex "^[SU]MULLv", "^SQDMULL[iv]")>;
+
+// ASIMD pairwise add and accumulate long
+def : InstRW<[V3Wr_VPA, V3Rd_VPA], (instregex "^[SU]ADALPv")>;
+
+// ASIMD shift accumulate
+def : InstRW<[V3Wr_VSA, V3Rd_VSA], (instregex "^[SU]SRA[dv]", "^[SU]RSRA[dv]")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[V3Write_2c_1V], (instregex "^SHL[dv]", "^SHLLv", "^SHRNv",
+ "^SSHLLv", "^SSHR[dv]", "^USHLLv",
+ "^USHR[dv]")>;
+
+// ASIMD shift by immed and insert, basic
+def : InstRW<[V3Write_2c_1V], (instregex "^SLI[dv]", "^SRI[dv]")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[V3Write_4c_1V],
+ (instregex "^RSHRNv", "^SQRSHRU?N[bhsv]", "^(SQSHLU?|UQSHL)[bhsd]$",
+ "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
+ "^SQSHRU?N[bhsv]", "^SRSHR[dv]", "^UQRSHRN[bhsv]",
+ "^UQSHRN[bhsv]", "^URSHR[dv]")>;
+
+// ASIMD shift by register, basic
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU]SHLv")>;
+
+// ASIMD shift by register, complex
+def : InstRW<[V3Write_4c_1V],
+ (instregex "^[SU]RSHLv", "^[SU]QRSHLv",
+ "^[SU]QSHL(v1i8|v1i16|v1i32|v1i64|v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)$")>;
+
+// §3.17 ASIMD floating-point instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD FP absolute value/difference
+// ASIMD FP arith, normal
+// ASIMD FP compare
+// ASIMD FP complex add
+// ASIMD FP max/min, normal
+// ASIMD FP max/min, pairwise
+// ASIMD FP negate
+// Handled by SchedAlias<WriteV[dq], ...>
+
+// ASIMD FP complex multiply add
+def : InstRW<[V3Wr_VFCMA, V3Rd_VFCMA], (instregex "^FCMLAv")>;
+
+// ASIMD FP convert, long (F16 to F32)
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVTL(v4|v8)i16")>;
+
+// ASIMD FP convert, long (F32 to F64)
+def : InstRW<[V3Write_3c_1V02], (instregex "^FCVTL(v2|v4)i32")>;
+
+// ASIMD FP convert, narrow (F32 to F16)
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVTN(v4|v8)i16")>;
+
+// ASIMD FP convert, narrow (F64 to F32)
+def : InstRW<[V3Write_3c_1V02], (instregex "^FCVTN(v2|v4)i32",
+ "^FCVTXN(v2|v4)f32")>;
+
+// ASIMD FP convert, other, D-form F32 and Q-form F64
+def : InstRW<[V3Write_3c_1V02], (instregex "^FCVT[AMNPZ][SU]v2f(32|64)$",
+ "^FCVT[AMNPZ][SU]v2i(32|64)_shift$",
+ "^FCVT[AMNPZ][SU]v1i64$",
+ "^FCVTZ[SU]d$",
+ "^[SU]CVTFv2f(32|64)$",
+ "^[SU]CVTFv2i(32|64)_shift$",
+ "^[SU]CVTFv1i64$",
+ "^[SU]CVTFd$")>;
+
+// ASIMD FP convert, other, D-form F16 and Q-form F32
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVT[AMNPZ][SU]v4f(16|32)$",
+ "^FCVT[AMNPZ][SU]v4i(16|32)_shift$",
+ "^FCVT[AMNPZ][SU]v1i32$",
+ "^FCVTZ[SU]s$",
+ "^[SU]CVTFv4f(16|32)$",
+ "^[SU]CVTFv4i(16|32)_shift$",
+ "^[SU]CVTFv1i32$",
+ "^[SU]CVTFs$")>;
+
+// ASIMD FP convert, other, Q-form F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FCVT[AMNPZ][SU]v8f16$",
+ "^FCVT[AMNPZ][SU]v8i16_shift$",
+ "^FCVT[AMNPZ][SU]v1f16$",
+ "^FCVTZ[SU]h$",
+ "^[SU]CVTFv8f16$",
+ "^[SU]CVTFv8i16_shift$",
+ "^[SU]CVTFv1i16$",
+ "^[SU]CVTFh$")>;
+
+// ASIMD FP divide, D-form, F16
+def : InstRW<[V3Write_9c_1V1_4rc], (instrs FDIVv4f16)>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[V3Write_9c_1V1_2rc], (instrs FDIVv2f32)>;
+
+// ASIMD FP divide, Q-form, F16
+def : InstRW<[V3Write_13c_1V1_8rc], (instrs FDIVv8f16)>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[V3Write_11c_1V1_4rc], (instrs FDIVv4f32)>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[V3Write_14c_1V1_2rc], (instrs FDIVv2f64)>;
+
+// ASIMD FP max/min, reduce, F32 and D-form F16
+def : InstRW<[V3Write_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
+
+// ASIMD FP max/min, reduce, Q-form F16
+def : InstRW<[V3Write_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
+
+// ASIMD FP multiply
+def : InstRW<[V3Wr_VFM], (instregex "^FMULv", "^FMULXv")>;
+
+// ASIMD FP multiply accumulate
+def : InstRW<[V3Wr_VFMA, V3Rd_VFMA], (instregex "^FMLAv", "^FMLSv")>;
+
+// ASIMD FP multiply accumulate long
+def : InstRW<[V3Wr_VFMAL, V3Rd_VFMAL], (instregex "^FML[AS]L2?(lane)?v")>;
+
+// ASIMD FP round, D-form F32 and Q-form F64
+def : InstRW<[V3Write_3c_1V02],
+ (instregex "^FRINT[AIMNPXZ]v2f(32|64)$",
+ "^FRINT(32|64)[XZ]v2f(32|64)$")>;
+
+// ASIMD FP round, D-form F16 and Q-form F32
+def : InstRW<[V3Write_4c_2V02],
+ (instregex "^FRINT[AIMNPXZ]v4f(16|32)$",
+ "^FRINT(32|64)[XZ]v4f32$")>;
+
+// ASIMD FP round, Q-form F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
+
+// ASIMD FP square root, D-form, F16
+def : InstRW<[V3Write_9c_1V1_4rc], (instrs FSQRTv4f16)>;
+
+// ASIMD FP square root, D-form, F32
+def : InstRW<[V3Write_9c_1V1_2rc], (instrs FSQRTv2f32)>;
+
+// ASIMD FP square root, Q-form, F16
+def : InstRW<[V3Write_13c_1V1_8rc], (instrs FSQRTv8f16)>;
+
+// ASIMD FP square root, Q-form, F32
+def : InstRW<[V3Write_11c_1V1_4rc], (instrs FSQRTv4f32)>;
+
+// ASIMD FP square root, Q-form, F64
+def : InstRW<[V3Write_14c_1V1_2rc], (instrs FSQRTv2f64)>;
+
+// §3.18 ASIMD BFloat16 (BF16) instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD convert, F32 to BF16
+def : InstRW<[V3Write_4c_2V02], (instrs BFCVTN, BFCVTN2)>;
+
+// ASIMD dot product
+def : InstRW<[V3Wr_VBFDOT, V3Rd_VBFDOT], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
+
+// ASIMD matrix multiply accumulate
+def : InstRW<[V3Wr_VBFMMA, V3Rd_VBFMMA], (instrs BFMMLA)>;
+
+// ASIMD multiply accumulate long
+def : InstRW<[V3Wr_VBFMAL, V3Rd_VBFMAL], (instrs BFMLALB, BFMLALBIdx, BFMLALT,
+ BFMLALTIdx)>;
+
+// Scalar convert, F32 to BF16
+def : InstRW<[V3Write_3c_1V02], (instrs BFCVT)>;
+
+// §3.19 ASIMD miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD bit reverse
+// ASIMD bitwise insert
+// ASIMD count
+// ASIMD duplicate, element
+// ASIMD extract
+// ASIMD extract narrow
+// ASIMD insert, element to element
+// ASIMD move, FP immed
+// ASIMD move, integer immed
+// ASIMD reverse
+// ASIMD table lookup extension, 1 table reg
+// ASIMD transpose
+// ASIMD unzip/zip
+// Handled by SchedAlias<WriteV[dq], ...>
+def : InstRW<[V3Write_0or2c_1V], (instrs MOVID, MOVIv2d_ns)>;
+
+// ASIMD duplicate, gen reg
+def : InstRW<[V3Write_3c_1M0], (instregex "^DUPv.+gpr")>;
+
+// ASIMD extract narrow, saturating
+def : InstRW<[V3Write_4c_1V], (instregex "^[SU]QXTNv", "^SQXTUNv")>;
+
+// ASIMD reciprocal and square root estimate, D-form U32
+def : InstRW<[V3Write_3c_1V02], (instrs URECPEv2i32, URSQRTEv2i32)>;
+
+// ASIMD reciprocal and square root estimate, Q-form U32
+def : InstRW<[V3Write_4c_2V02], (instrs URECPEv4i32, URSQRTEv4i32)>;
+
+// ASIMD reciprocal and square root estimate, D-form F32 and scalar forms
+def : InstRW<[V3Write_3c_1V02], (instrs FRECPEv1f16, FRECPEv1i32,
+ FRECPEv1i64, FRECPEv2f32,
+ FRSQRTEv1f16, FRSQRTEv1i32,
+ FRSQRTEv1i64, FRSQRTEv2f32)>;
+
+// ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32
+def : InstRW<[V3Write_4c_2V02], (instrs FRECPEv4f16, FRECPEv4f32,
+ FRSQRTEv4f16, FRSQRTEv4f32)>;
+
+// ASIMD reciprocal and square root estimate, Q-form F16
+def : InstRW<[V3Write_6c_4V02], (instrs FRECPEv8f16, FRSQRTEv8f16)>;
+
+// ASIMD reciprocal exponent
+def : InstRW<[V3Write_3c_1V02], (instregex "^FRECPXv")>;
+
+// ASIMD reciprocal step
+def : InstRW<[V3Write_4c_1V], (instregex "^FRECPS(32|64|v)",
+ "^FRSQRTS(32|64|v)")>;
+
+// ASIMD table lookup, 1 or 2 table regs
+def : InstRW<[V3Write_2c_1V], (instrs TBLv8i8One, TBLv16i8One,
+ TBLv8i8Two, TBLv16i8Two)>;
+
+// ASIMD table lookup, 3 table regs
+def : InstRW<[V3Write_4c_2V], (instrs TBLv8i8Three, TBLv16i8Three)>;
+
+// ASIMD table lookup, 4 table regs
+def : InstRW<[V3Write_4c_3V], (instrs TBLv8i8Four, TBLv16i8Four)>;
+
+// ASIMD table lookup extension, 2 table reg
+def : InstRW<[V3Write_4c_2V], (instrs TBXv8i8Two, TBXv16i8Two)>;
+
+// ASIMD table lookup extension, 3 table reg
+def : InstRW<[V3Write_6c_3V], (instrs TBXv8i8Three, TBXv16i8Three)>;
+
+// ASIMD table lookup extension, 4 table reg
+def : InstRW<[V3Write_6c_5V], (instrs TBXv8i8Four, TBXv16i8Four)>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[V3Write_2c_2V01], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, gen reg to element
+def : InstRW<[V3Write_5c_1M0_1V], (instregex "^INSvi(8|16|32|64)gpr$")>;
+
+// §3.20 ASIMD load instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_1L],
+ (instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_1L],
+ (instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+def : InstRW<[V3Write_6c_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_2L],
+ (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[V3Write_6c_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_2L],
+ (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+def : InstRW<[V3Write_6c_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_3L],
+ (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[V3Write_6c_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_3L],
+ (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+def : InstRW<[V3Write_7c_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_7c_4L],
+ (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[V3Write_7c_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_7c_4L],
+ (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[V3Write_8c_1L_1V], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_1V], (instregex "LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+def : InstRW<[V3Write_8c_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[V3Write_8c_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_8c_1L_2V], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_2V], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 2 element, multiple, Q-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[V3Write_8c_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[V3Write_8c_1L_2V], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_2V], (instregex "LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+def : InstRW<[V3Write_8c_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[V3Write_8c_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_8c_2L_3V], (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_2L_3V], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[V3Write_8c_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[V3Write_8c_2L_3V], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_2L_3V], (instregex "LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+def : InstRW<[V3Write_8c_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[V3Write_8c_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_8c_3L_4V], (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_3L_4V], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[V3Write_9c_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_9c_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[V3Write_8c_3L_4V], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_3L_4V], (instregex "LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+def : InstRW<[V3Write_8c_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[V3Write_8c_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>;
+
+// §3.21 ASIMD store instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[V3Write_2c_2SA_2V01], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_2SA_2V01], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+def : InstRW<[V3Write_2c_2SA_2V01], (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_2SA_2V01], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_3SA_3V01], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+def : InstRW<[V3Write_2c_2SA_2V01], (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_2SA_2V01], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[V3Write_2c_4SA_4V01], (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_4SA_4V01], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[V3Write_4c_1SA_2V01], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_4c_1SA_2V01], (instregex "ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_4c_1SA_2V01], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3Write_4c_1SA_2V01], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[V3Write_4c_2SA_4V01], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_4c_2SA_4V01], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[V3Write_4c_1SA_2V01], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_4c_1SA_2V01], (instregex "ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_5c_2SA_4V01], (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3Write_5c_2SA_4V01], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[V3Write_6c_3SA_6V01], (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_3SA_6V01], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[V3Write_5c_2SA_4V01], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_5c_2SA_4V01], (instregex "ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_6c_2SA_6V01], (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_2SA_6V01], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+def : InstRW<[V3Write_7c_4SA_12V01], (instregex "ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[WriteAdr, V3Write_7c_4SA_12V01], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[V3Write_5c_4SA_8V01], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[WriteAdr, V3Write_5c_4SA_8V01], (instregex "ST4Fourv(2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H/S
+def : InstRW<[V3Write_6c_1SA_3V01], (instregex "ST4i(8|16|32)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_1SA_3V01], (instregex "ST4i(8|16|32)_POST$")>;
+
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[V3Write_4c_2SA_4V01], (instregex "ST4i(64)$")>;
+def : InstRW<[WriteAdr, V3Write_4c_2SA_4V01], (instregex "ST4i(64)_POST$")>;
+
+// §3.22 Cryptography extensions
+// -----------------------------------------------------------------------------
+
+// Crypto AES ops
+def : InstRW<[V3Write_2c_1V], (instregex "^AES[DE]rr$", "^AESI?MCrr")>;
+
+// Crypto polynomial (64x64) multiply long
+def : InstRW<[V3Write_2c_1V], (instrs PMULLv1i64, PMULLv2i64)>;
+
+// Crypto SHA1 hash acceleration op
+// Crypto SHA1 schedule acceleration ops
+def : InstRW<[V3Write_2c_1V0], (instregex "^SHA1(H|SU0|SU1)")>;
+
+// Crypto SHA1 hash acceleration ops
+// Crypto SHA256 hash acceleration ops
+def : InstRW<[V3Write_4c_1V0], (instregex "^SHA1[CMP]", "^SHA256H2?")>;
+
+// Crypto SHA256 schedule acceleration ops
+def : InstRW<[V3Write_2c_1V0], (instregex "^SHA256SU[01]")>;
+
+// Crypto SHA512 hash acceleration ops
+def : InstRW<[V3Write_2c_1V0], (instregex "^SHA512(H|H2|SU0|SU1)")>;
+
+// Crypto SHA3 ops
+def : InstRW<[V3Write_2c_1V], (instrs BCAX, EOR3, RAX1, XAR)>;
+
+// Crypto SM3 ops
+def : InstRW<[V3Write_2c_1V0], (instregex "^SM3PARTW[12]$", "^SM3SS1$",
+ "^SM3TT[12][AB]$")>;
+
+// Crypto SM4 ops
+def : InstRW<[V3Write_4c_1V0], (instrs SM4E, SM4ENCKEY)>;
+
+// §3.23 CRC
+// -----------------------------------------------------------------------------
+
+def : InstRW<[V3Wr_CRC, V3Rd_CRC], (instregex "^CRC32")>;
+
+// §3.24 SVE Predicate instructions
+// -----------------------------------------------------------------------------
+
+// Loop control, based on predicate
+def : InstRW<[V3Write_2or3c_1M], (instrs BRKA_PPmP, BRKA_PPzP,
+ BRKB_PPmP, BRKB_PPzP)>;
+
+// Loop control, based on predicate and flag setting
+def : InstRW<[V3Write_2or3c_1M], (instrs BRKAS_PPzP, BRKBS_PPzP)>;
+
+// Loop control, propagating
+def : InstRW<[V3Write_2or3c_1M], (instrs BRKN_PPzP, BRKPA_PPzPP,
+ BRKPB_PPzPP)>;
+
+// Loop control, propagating and flag setting
+def : InstRW<[V3Write_2or3c_1M], (instrs BRKNS_PPzP, BRKPAS_PPzPP,
+ BRKPBS_PPzPP)>;
+
+// Loop control, based on GPR
+def : InstRW<[V3Write_3c_2M],
+ (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>;
+def : InstRW<[V3Write_3c_2M], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>;
+
+// Loop terminate
+def : InstRW<[V3Write_1c_2M], (instregex "^CTERM(EQ|NE)_(WW|XX)")>;
+
+// Predicate counting scalar
+def : InstRW<[V3Write_2c_1M], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>;
+def : InstRW<[V3Write_2c_1M],
+ (instregex "^(CNT|SQDEC|SQINC|UQDEC|UQINC)[BHWD]_XPiI",
+ "^SQ(DEC|INC)[BHWD]_XPiWdI",
+ "^UQ(DEC|INC)[BHWD]_WPiI")>;
+
+// Predicate counting scalar, ALL, {1,2,4}
+def : InstRW<[V3Write_IncDec], (instregex "^(DEC|INC)[BHWD]_XPiI")>;
+
+// Predicate counting scalar, active predicate
+def : InstRW<[V3Write_2c_1M],
+ (instregex "^CNTP_XPP_[BHSD]",
+ "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]",
+ "^(UQDEC|UQINC)P_WP_[BHSD]",
+ "^(SQDEC|SQINC)P_XPWd_[BHSD]")>;
+
+// Predicate counting vector, active predicate
+def : InstRW<[V3Write_7c_1M_1M0_1V],
+ (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>;
+
+// Predicate logical
+def : InstRW<[V3Write_1or2c_1M],
+ (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>;
+
+// Predicate logical, flag setting
+def : InstRW<[V3Write_1or2c_1M],
+ (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>;
+
+// Predicate reverse
+def : InstRW<[V3Write_2c_1M], (instregex "^REV_PP_[BHSD]")>;
+
+// Predicate select
+def : InstRW<[V3Write_1c_1M], (instrs SEL_PPPP)>;
+
+// Predicate set
+def : InstRW<[V3Write_2c_1M], (instregex "^PFALSE", "^PTRUE_[BHSD]")>;
+
+// Predicate set/initialize, set flags
+def : InstRW<[V3Write_2c_1M], (instregex "^PTRUES_[BHSD]")>;
+
+// Predicate find first/next
+def : InstRW<[V3Write_2c_1M], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>;
+
+// Predicate test
+def : InstRW<[V3Write_1c_1M], (instrs PTEST_PP)>;
+
+// Predicate transpose
+def : InstRW<[V3Write_2c_1M], (instregex "^TRN[12]_PPP_[BHSD]")>;
+
+// Predicate unpack and widen
+def : InstRW<[V3Write_2c_1M], (instrs PUNPKHI_PP, PUNPKLO_PP)>;
+
+// Predicate zip/unzip
+def : InstRW<[V3Write_2c_1M], (instregex "^(ZIP|UZP)[12]_PPP_[BHSD]")>;
+
+// §3.25 SVE integer instructions
+// -----------------------------------------------------------------------------
+
+// Arithmetic, absolute diff
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]",
+ "^[SU]ABD_ZPZZ_[BHSD]")>;
+
+// Arithmetic, absolute diff accum
+def : InstRW<[V3Wr_ZA, V3Rd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]")>;
+
+// Arithmetic, absolute diff accum long
+def : InstRW<[V3Wr_ZA, V3Rd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>;
+
+// Arithmetic, absolute diff long
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>;
+
+// Arithmetic, basic
+def : InstRW<[V3Write_2c_1V],
+ (instregex "^(ABS|ADD|CNOT|NEG|SUB|SUBR)_ZPmZ_[BHSD]",
+ "^(ADD|SUB)_ZZZ_[BHSD]",
+ "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]",
+ "^(ADD|SUB|SUBR)_ZI_[BHSD]",
+ "^ADR_[SU]XTW_ZZZ_D_[0123]",
+ "^ADR_LSL_ZZZ_[SD]_[0123]",
+ "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
+ "^SADDLBT_ZZZ_[HSD]",
+ "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
+ "^SSUBL(BT|TB)_ZZZ_[HSD]")>;
+
+// Arithmetic, complex
+def : InstRW<[V3Write_2c_1V],
+ (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]",
+ "^SQ(ABS|ADD|NEG|SUB|SUBR)_ZPmZ_[BHSD]",
+ "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]",
+ "^[SU]Q(ADD|SUB)_ZI_[BHSD]",
+ "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]",
+ "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>;
+
+// Arithmetic, large integer
+def : InstRW<[V3Write_2c_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>;
+
+// Arithmetic, pairwise add
+def : InstRW<[V3Write_2c_1V], (instregex "^ADDP_ZPmZ_[BHSD]")>;
+
+// Arithmetic, pairwise add and accum long
+def : InstRW<[V3Wr_ZPA, ReadDefault, V3Rd_ZPA],
+ (instregex "^[SU]ADALP_ZPmZ_[HSD]")>;
+
+// Arithmetic, shift
+def : InstRW<[V3Write_2c_1V13],
+ (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]",
+ "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]",
+ "^(ASR|LSL|LSR)_ZPmI_[BHSD]",
+ "^(ASR|LSL|LSR)_ZPmZ_[BHSD]",
+ "^(ASR|LSL|LSR)_ZZI_[BHSD]",
+ "^(ASR|LSL|LSR)_ZPZ[IZ]_[BHSD]",
+ "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
+
+// Arithmetic, shift and accumulate
+def : InstRW<[V3Wr_ZSA, V3Rd_ZSA], (instregex "^[SU]R?SRA_ZZI_[BHSD]")>;
+
+// Arithmetic, shift by immediate
+def : InstRW<[V3Write_2c_1V], (instregex "^SHRN[BT]_ZZI_[BHS]",
+ "^[SU]SHLL[BT]_ZZI_[HSD]")>;
+
+// Arithmetic, shift by immediate and insert
+def : InstRW<[V3Write_2c_1V], (instregex "^(SLI|SRI)_ZZI_[BHSD]")>;
+
+// Arithmetic, shift complex
+def : InstRW<[V3Write_4c_1V],
+ (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]",
+ "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_ZPmZ_[BHSD]",
+ "^[SU]QR?SHL_ZPZZ_[BHSD]",
+ "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]",
+ "^SQSHRU?N[BT]_ZZI_[BHS]",
+ "^UQR?SHRN[BT]_ZZI_[BHS]")>;
+
+// Arithmetic, shift right for divide
+def : InstRW<[V3Write_4c_1V], (instregex "^ASRD_(ZPmI|ZPZI)_[BHSD]")>;
+
+// Arithmetic, shift rounding
+def : InstRW<[V3Write_4c_1V], (instregex "^[SU]RSHLR?_ZPmZ_[BHSD]",
+ "^[SU]RSHL_ZPZZ_[BHSD]",
+ "^[SU]RSHR_(ZPmI|ZPZI)_[BHSD]")>;
+
+// Bit manipulation
+def : InstRW<[V3Write_6c_2V1], (instregex "^(BDEP|BEXT|BGRP)_ZZZ_[BHSD]")>;
+
+// Bitwise select
+def : InstRW<[V3Write_2c_1V], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>;
+
+// Count/reverse bits
+def : InstRW<[V3Write_2c_1V], (instregex "^(CLS|CLZ|CNT|RBIT)_ZPmZ_[BHSD]")>;
+
+// Broadcast logical bitmask immediate to vector
+def : InstRW<[V3Write_2c_1V], (instrs DUPM_ZI)>;
+
+// Compare and set flags
+def : InstRW<[V3Write_2or3c_1V0],
+ (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]",
+ "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>;
+
+// Complex add
+def : InstRW<[V3Write_2c_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]")>;
+
+// Complex dot product 8-bit element
+def : InstRW<[V3Wr_ZDOTB, V3Rd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
+
+// Complex dot product 16-bit element
+def : InstRW<[V3Wr_ZDOTH, V3Rd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
+
+// Complex multiply-add B, H, S element size
+def : InstRW<[V3Wr_ZCMABHS, V3Rd_ZCMABHS], (instregex "^CMLA_ZZZ_[BHS]",
+ "^CMLA_ZZZI_[HS]")>;
+
+// Complex multiply-add D element size
+def : InstRW<[V3Wr_ZCMAD, V3Rd_ZCMAD], (instrs CMLA_ZZZ_D)>;
+
+// Conditional extract operations, scalar form
+def : InstRW<[V3Write_8c_1M0_1V01], (instregex "^CLAST[AB]_RPZ_[BHSD]")>;
+
+// Conditional extract operations, SIMD&FP scalar and vector forms
+def : InstRW<[V3Write_3c_1V1], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]",
+ "^COMPACT_ZPZ_[SD]",
+ "^SPLICE_ZPZZ?_[BHSD]")>;
+
+// Convert to floating point, 64b to float or convert to double
+def : InstRW<[V3Write_3c_1V02], (instregex "^[SU]CVTF_ZPmZ_Dto[HSD]",
+ "^[SU]CVTF_ZPmZ_StoD")>;
+
+// Convert to floating point, 32b to single or half
+def : InstRW<[V3Write_4c_2V02], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>;
+
+// Convert to floating point, 16b to half
+def : InstRW<[V3Write_6c_4V02], (instregex "^[SU]CVTF_ZPmZ_HtoH")>;
+
+// Copy, scalar
+def : InstRW<[V3Write_5c_1M0_1V], (instregex "^CPY_ZPmR_[BHSD]")>;
+
+// Copy, scalar SIMD&FP or imm
+def : InstRW<[V3Write_2c_1V], (instregex "^CPY_ZPm[IV]_[BHSD]",
+ "^CPY_ZPzI_[BHSD]")>;
+
+// Divides, 32 bit
+def : InstRW<[V3Write_12c_1V0], (instregex "^[SU]DIVR?_ZPmZ_S",
+ "^[SU]DIV_ZPZZ_S")>;
+
+// Divides, 64 bit
+def : InstRW<[V3Write_20c_1V0], (instregex "^[SU]DIVR?_ZPmZ_D",
+ "^[SU]DIV_ZPZZ_D")>;
+
+// Dot product, 8 bit
+def : InstRW<[V3Wr_ZDOTB, V3Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS")>;
+
+// Dot product, 8 bit, using signed and unsigned integers
+def : InstRW<[V3Wr_ZDOTB, V3Rd_ZDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
+
+// Dot product, 16 bit
+def : InstRW<[V3Wr_ZDOTH, V3Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD")>;
+
+// Duplicate, immediate and indexed form
+def : InstRW<[V3Write_2c_1V], (instregex "^DUP_ZI_[BHSD]",
+ "^DUP_ZZI_[BHSDQ]")>;
+
+// Duplicate, scalar form
+def : InstRW<[V3Write_3c_1M0], (instregex "^DUP_ZR_[BHSD]")>;
+
+// Extend, sign or zero
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU]XTB_ZPmZ_[HSD]",
+ "^[SU]XTH_ZPmZ_[SD]",
+ "^[SU]XTW_ZPmZ_[D]")>;
+
+// Extract
+def : InstRW<[V3Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>;
+
+// Extract narrow saturating
+def : InstRW<[V3Write_4c_1V], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
+ "^SQXTUN[BT]_ZZ_[BHS]")>;
+
+// Extract operation, SIMD and FP scalar form
+def : InstRW<[V3Write_3c_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]")>;
+
+// Extract operation, scalar
+def : InstRW<[V3Write_6c_1V1_1M0], (instregex "^LAST[AB]_RPZ_[BHSD]")>;
+
+// Histogram operations
+def : InstRW<[V3Write_2c_1V], (instregex "^HISTCNT_ZPzZZ_[SD]",
+ "^HISTSEG_ZZZ")>;
+
+// Horizontal operations, B, H, S form, immediate operands only
+def : InstRW<[V3Write_4c_1V02], (instregex "^INDEX_II_[BHS]")>;
+
+// Horizontal operations, B, H, S form, scalar, immediate operands / scalar
+// operands only / immediate, scalar operands
+def : InstRW<[V3Write_7c_1M0_1V02], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>;
+
+// Horizontal operations, D form, immediate operands only
+def : InstRW<[V3Write_5c_2V02], (instrs INDEX_II_D)>;
+
+// Horizontal operations, D form, scalar, immediate operands / scalar operands
+// only / immediate, scalar operands
+def : InstRW<[V3Write_8c_2M0_2V02], (instregex "^INDEX_(IR|RI|RR)_D")>;
+
+// Insert operation, SIMD and FP scalar form
+def : InstRW<[V3Write_2c_1V], (instregex "^INSR_ZV_[BHSD]")>;
+
+// Insert operation, scalar
+def : InstRW<[V3Write_5c_1V1_1M0], (instregex "^INSR_ZR_[BHSD]")>;
+
+// Logical
+def : InstRW<[V3Write_2c_1V],
+ (instregex "^(AND|EOR|ORR)_ZI",
+ "^(AND|BIC|EOR|ORR)_ZZZ",
+ "^EOR(BT|TB)_ZZZ_[BHSD]",
+ "^(AND|BIC|EOR|NOT|ORR)_(ZPmZ|ZPZZ)_[BHSD]",
+ "^NOT_ZPmZ_[BHSD]")>;
+
+// Max/min, basic and pairwise
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
+ "^[SU](MAX|MIN)P?_ZPmZ_[BHSD]",
+ "^[SU](MAX|MIN)_ZPZZ_[BHSD]")>;
+
+// Matching operations
+// FIXME: SOG p. 44, n. 5: If the consuming instruction has a flag source, the
+// latency for this instruction is 4 cycles.
+def : InstRW<[V3Write_2or3c_1V0_1M], (instregex "^N?MATCH_PPzZZ_[BH]")>;
+
+// Matrix multiply-accumulate
+def : InstRW<[V3Wr_ZMMA, V3Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+
+// Move prefix
+def : InstRW<[V3Write_2c_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]",
+ "^MOVPRFX_ZZ")>;
+
+// Multiply, B, H, S element size
+def : InstRW<[V3Write_4c_1V02], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
+ "^MUL_ZPZZ_[BHS]",
+ "^[SU]MULH_(ZPmZ|ZZZ)_[BHS]",
+ "^[SU]MULH_ZPZZ_[BHS]")>;
+
+// Multiply, D element size
+def : InstRW<[V3Write_5c_2V02], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D",
+ "^MUL_ZPZZ_D",
+ "^[SU]MULH_(ZPmZ|ZZZ)_D",
+ "^[SU]MULH_ZPZZ_D")>;
+
+// Multiply long
+def : InstRW<[V3Write_4c_1V02], (instregex "^[SU]MULL[BT]_ZZZI_[SD]",
+ "^[SU]MULL[BT]_ZZZ_[HSD]")>;
+
+// Multiply accumulate, B, H, S element size
+def : InstRW<[V3Wr_ZMABHS, V3Rd_ZMABHS],
+ (instregex "^ML[AS]_ZZZI_[HS]", "^ML[AS]_ZPZZZ_[BHS]")>;
+def : InstRW<[V3Wr_ZMABHS, ReadDefault, V3Rd_ZMABHS],
+ (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>;
+
+// Multiply accumulate, D element size
+def : InstRW<[V3Wr_ZMAD, V3Rd_ZMAD],
+ (instregex "^ML[AS]_ZZZI_D", "^ML[AS]_ZPZZZ_D")>;
+def : InstRW<[V3Wr_ZMAD, ReadDefault, V3Rd_ZMAD],
+ (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
+
+// Multiply accumulate long
+def : InstRW<[V3Wr_ZMAL, V3Rd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]",
+ "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>;
+
+// Multiply accumulate saturating doubling long regular
+def : InstRW<[V3Wr_ZMASQL, V3Rd_ZMASQ],
+ (instregex "^SQDML[AS]L(B|T|BT)_ZZZ_[HSD]",
+ "^SQDML[AS]L[BT]_ZZZI_[SD]")>;
+
+// Multiply saturating doubling high, B, H, S element size
+def : InstRW<[V3Write_4c_1V02], (instregex "^SQDMULH_ZZZ_[BHS]",
+ "^SQDMULH_ZZZI_[HS]")>;
+
+// Multiply saturating doubling high, D element size
+def : InstRW<[V3Write_5c_2V02], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>;
+
+// Multiply saturating doubling long
+def : InstRW<[V3Write_4c_1V02], (instregex "^SQDMULL[BT]_ZZZ_[HSD]",
+ "^SQDMULL[BT]_ZZZI_[SD]")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, B, H, S
+// element size
+def : InstRW<[V3Wr_ZMASQBHS, V3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]",
+ "^SQRDCMLAH_ZZZ_[BHS]",
+ "^SQRDML[AS]H_ZZZI_[HS]",
+ "^SQRDCMLAH_ZZZI_[HS]")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, D element
+// size
+def : InstRW<[V3Wr_ZMASQD, V3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D",
+ "^SQRDCMLAH_ZZZ_D")>;
+
+// Multiply saturating rounding doubling regular/complex, B, H, S element size
+def : InstRW<[V3Write_4c_1V02], (instregex "^SQRDMULH_ZZZ_[BHS]",
+ "^SQRDMULH_ZZZI_[HS]")>;
+
+// Multiply saturating rounding doubling regular/complex, D element size
+def : InstRW<[V3Write_5c_2V02], (instregex "^SQRDMULH_ZZZI?_D")>;
+
+// Multiply/multiply long, (8x8) polynomial
+def : InstRW<[V3Write_2c_1V], (instregex "^PMUL_ZZZ_B",
+ "^PMULL[BT]_ZZZ_[HDQ]")>;
+
+// Predicate counting vector
+def : InstRW<[V3Write_2c_1V], (instregex "^([SU]Q)?(DEC|INC)[HWD]_ZPiI")>;
+
+// Reciprocal estimate
+def : InstRW<[V3Write_4c_2V02], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>;
+
+// Reduction, arithmetic, B form
+def : InstRW<[V3Write_9c_2V_4V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>;
+
+// Reduction, arithmetic, H form
+def : InstRW<[V3Write_8c_2V_2V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>;
+
+// Reduction, arithmetic, S form
+def : InstRW<[V3Write_6c_2V_2V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>;
+
+// Reduction, arithmetic, D form
+def : InstRW<[V3Write_4c_2V], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>;
+
+// Reduction, logical
+def : InstRW<[V3Write_6c_1V_1V13], (instregex "^(AND|EOR|OR)V_VPZ_[BHSD]")>;
+
+// Reverse, vector
+def : InstRW<[V3Write_2c_1V], (instregex "^REV_ZZ_[BHSD]",
+ "^REVB_ZPmZ_[HSD]",
+ "^REVH_ZPmZ_[SD]",
+ "^REVW_ZPmZ_D")>;
+
+// Select, vector form
+def : InstRW<[V3Write_2c_1V], (instregex "^SEL_ZPZZ_[BHSD]")>;
+
+// Table lookup
+def : InstRW<[V3Write_2c_1V], (instregex "^TBL_ZZZZ?_[BHSD]")>;
+
+// Table lookup extension
+def : InstRW<[V3Write_2c_1V], (instregex "^TBX_ZZZ_[BHSD]")>;
+
+// Transpose, vector form
+def : InstRW<[V3Write_2c_1V], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>;
+
+// Unpack and extend
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>;
+
+// Zip/unzip
+def : InstRW<[V3Write_2c_1V], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>;
+
+// §3.26 SVE floating-point instructions
+// -----------------------------------------------------------------------------
+
+// Floating point absolute value/difference
+def : InstRW<[V3Write_2c_1V], (instregex "^FAB[SD]_ZPmZ_[HSD]",
+ "^FABD_ZPZZ_[HSD]",
+ "^FABS_ZPmZ_[HSD]")>;
+
+// Floating point arithmetic
+def : InstRW<[V3Write_2c_1V], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]",
+ "^F(ADD|SUB)_ZPZ[IZ]_[HSD]",
+ "^FADDP_ZPmZZ_[HSD]",
+ "^FNEG_ZPmZ_[HSD]",
+ "^FSUBR_ZPm[IZ]_[HSD]",
+ "^FSUBR_(ZPZI|ZPZZ)_[HSD]")>;
+
+// Floating point associative add, F16
+def : InstRW<[V3Write_10c_1V1_9rc], (instrs FADDA_VPZ_H)>;
+
+// Floating point associative add, F32
+def : InstRW<[V3Write_6c_1V1_5rc], (instrs FADDA_VPZ_S)>;
+
+// Floating point associative add, F64
+def : InstRW<[V3Write_4c_1V], (instrs FADDA_VPZ_D)>;
+
+// Floating point compare
+def : InstRW<[V3Write_2c_1V0], (instregex "^FACG[ET]_PPzZZ_[HSD]",
+ "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
+ "^FCM(LE|LT)_PPzZ0_[HSD]",
+ "^FCMUO_PPzZZ_[HSD]")>;
+
+// Floating point complex add
+def : InstRW<[V3Write_3c_1V], (instregex "^FCADD_ZPmZ_[HSD]")>;
+
+// Floating point complex multiply add
+def : InstRW<[V3Wr_ZFCMA, ReadDefault, V3Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>;
+def : InstRW<[V3Wr_ZFCMA, V3Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>;
+
+// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
+ "^FCVTLT_ZPmZ_HtoS",
+ "^FCVTNT_ZPmZ_StoH")>;
+
+// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32
+// or F64 to F16)
+def : InstRW<[V3Write_3c_1V02], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)",
+ "^FCVTLT_ZPmZ_StoD",
+ "^FCVTNT_ZPmZ_DtoS")>;
+
+// Floating point convert, round to odd
+def : InstRW<[V3Write_3c_1V02], (instrs FCVTX_ZPmZ_DtoS, FCVTXNT_ZPmZ_DtoS)>;
+
+// Floating point base2 log, F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>;
+
+// Floating point base2 log, F32
+def : InstRW<[V3Write_4c_2V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>;
+
+// Floating point base2 log, F64
+def : InstRW<[V3Write_3c_1V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>;
+
+// Floating point convert to integer, F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>;
+
+// Floating point convert to integer, F32
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>;
+
+// Floating point convert to integer, F64
+def : InstRW<[V3Write_3c_1V02],
+ (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>;
+
+// Floating point copy
+def : InstRW<[V3Write_2c_1V], (instregex "^FCPY_ZPmI_[HSD]",
+ "^FDUP_ZI_[HSD]")>;
+
+// Floating point divide, F16
+def : InstRW<[V3Write_13c_1V1_8rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>;
+
+// Floating point divide, F32
+def : InstRW<[V3Write_11c_1V1_4rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>;
+
+// Floating point divide, F64
+def : InstRW<[V3Write_14c_1V1_2rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>;
+
+// Floating point min/max pairwise
+def : InstRW<[V3Write_2c_1V], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>;
+
+// Floating point min/max
+def : InstRW<[V3Write_2c_1V], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]",
+ "^F(MAX|MIN)(NM)?_ZPZ[IZ]_[HSD]")>;
+
+// Floating point multiply
+def : InstRW<[V3Write_3c_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
+ "^FMULX_ZPZZ_[HSD]",
+ "^FMUL_(ZPm[IZ]|ZZZI?)_[HSD]",
+ "^FMUL_ZPZ[IZ]_[HSD]")>;
+
+// Floating point multiply accumulate
+def : InstRW<[V3Wr_ZFMA, ReadDefault, V3Rd_ZFMA],
+ (instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
+ "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
+def : InstRW<[V3Wr_ZFMA, V3Rd_ZFMA],
+ (instregex "^FML[AS]_ZZZI_[HSD]",
+ "^FN?ML[AS]_ZPZZZ_[HSD]")>;
+
+// Floating point multiply add/sub accumulate long
+def : InstRW<[V3Wr_ZFMAL, V3Rd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>;
+
+// Floating point reciprocal estimate, F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>;
+
+// Floating point reciprocal estimate, F32
+def : InstRW<[V3Write_4c_2V02], (instregex "^FR(ECP|SQRT)E_ZZ_S", "^FRECPX_ZPmZ_S")>;
+
+// Floating point reciprocal estimate, F64
+def : InstRW<[V3Write_3c_1V02], (instregex "^FR(ECP|SQRT)E_ZZ_D", "^FRECPX_ZPmZ_D")>;
+
+// Floating point reciprocal step
+def : InstRW<[V3Write_4c_1V], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
+
+// Floating point reduction, F16
+def : InstRW<[V3Write_8c_4V],
+ (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_H")>;
+
+// Floating point reduction, F32
+def : InstRW<[V3Write_6c_3V],
+ (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_S")>;
+
+// Floating point reduction, F64
+def : InstRW<[V3Write_4c_2V],
+ (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_D")>;
+
+// Floating point round to integral, F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>;
+
+// Floating point round to integral, F32
+def : InstRW<[V3Write_4c_2V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>;
+
+// Floating point round to integral, F64
+def : InstRW<[V3Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>;
+
+// Floating point square root, F16
+def : InstRW<[V3Write_13c_1V1_8rc], (instregex "^FSQRT_ZPmZ_H")>;
+
+// Floating point square root, F32
+def : InstRW<[V3Write_11c_1V1_4rc], (instregex "^FSQRT_ZPmZ_S")>;
+
+// Floating point square root, F64
+def : InstRW<[V3Write_14c_1V1_2rc], (instregex "^FSQRT_ZPmZ_D")>;
+
+// Floating point trigonometric exponentiation
+def : InstRW<[V3Write_3c_1V1], (instregex "^FEXPA_ZZ_[HSD]")>;
+
+// Floating point trigonometric multiply add
+def : InstRW<[V3Write_4c_1V], (instregex "^FTMAD_ZZI_[HSD]")>;
+
+// Floating point trigonometric, miscellaneous
+def : InstRW<[V3Write_3c_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]")>;
+
+// §3.27 SVE BFloat16 (BF16) instructions
+// -----------------------------------------------------------------------------
+
+// Convert, F32 to BF16
+def : InstRW<[V3Write_4c_1V02], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
+
+// Dot product
+def : InstRW<[V3Wr_ZBFDOT, V3Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
+
+// Matrix multiply accumulate
+def : InstRW<[V3Wr_ZBFMMA, V3Rd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>;
+
+// Multiply accumulate long
+def : InstRW<[V3Wr_ZBFMAL, V3Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZI?")>;
+
+// §3.28 SVE Load instructions
+// -----------------------------------------------------------------------------
+
+// Load vector
+def : InstRW<[V3Write_6c_1L], (instrs LDR_ZXI)>;
+
+// Load predicate
+def : InstRW<[V3Write_6c_1L_1M], (instrs LDR_PXI)>;
+
+// Contiguous load, scalar + imm
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1[BHWD]_IMM$",
+ "^LD1S?B_[HSD]_IMM$",
+ "^LD1S?H_[SD]_IMM$",
+ "^LD1S?W_D_IMM$" )>;
+// Contiguous load, scalar + scalar
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1[BHWD]$",
+ "^LD1S?B_[HSD]$",
+ "^LD1S?H_[SD]$",
+ "^LD1S?W_D$" )>;
+
+// Contiguous load broadcast, scalar + imm
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1R[BHWD]_IMM$",
+ "^LD1RS?B_[HSD]_IMM$",
+ "^LD1RS?H_[SD]_IMM$",
+ "^LD1RW_D_IMM$",
+ "^LD1RSW_IMM$",
+ "^LD1RQ_[BHWD]_IMM$")>;
+
+// Contiguous load broadcast, scalar + scalar
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1RQ_[BHWD]$")>;
+
+// Non temporal load, scalar + imm
+// Non temporal load, scalar + scalar
+def : InstRW<[V3Write_6c_1L], (instregex "^LDNT1[BHWD]_ZR[IR]$")>;
+
+// Non temporal gather load, vector + scalar 32-bit element size
+def : InstRW<[V3Write_9c_2L_4V], (instregex "^LDNT1[BHW]_ZZR_S$",
+ "^LDNT1S[BH]_ZZR_S$")>;
+
+// Non temporal gather load, vector + scalar 64-bit element size
+def : InstRW<[V3Write_9c_2L_2V], (instregex "^LDNT1S?[BHW]_ZZR_D$")>;
+def : InstRW<[V3Write_9c_2L_2V], (instrs LDNT1D_ZZR_D)>;
+
+// Contiguous first faulting load, scalar + scalar
+def : InstRW<[V3Write_6c_1L_1I], (instregex "^LDFF1[BHWD]$",
+ "^LDFF1S?B_[HSD]$",
+ "^LDFF1S?H_[SD]$",
+ "^LDFF1S?W_D$")>;
+
+// Contiguous non faulting load, scalar + imm
+def : InstRW<[V3Write_6c_1L], (instregex "^LDNF1[BHWD]_IMM$",
+ "^LDNF1S?B_[HSD]_IMM$",
+ "^LDNF1S?H_[SD]_IMM$",
+ "^LDNF1S?W_D_IMM$")>;
+
+// Contiguous Load two structures to two vectors, scalar + imm
+def : InstRW<[V3Write_8c_2L_2V], (instregex "^LD2[BHWD]_IMM$")>;
+
+// Contiguous Load two structures to two vectors, scalar + scalar
+def : InstRW<[V3Write_9c_2L_2V_2I], (instregex "^LD2[BHWD]$")>;
+
+// Contiguous Load three structures to three vectors, scalar + imm
+def : InstRW<[V3Write_9c_3L_3V], (instregex "^LD3[BHWD]_IMM$")>;
+
+// Contiguous Load three structures to three vectors, scalar + scalar
+def : InstRW<[V3Write_10c_3V_3L_3I], (instregex "^LD3[BHWD]$")>;
+
+// Contiguous Load four structures to four vectors, scalar + imm
+def : InstRW<[V3Write_9c_4L_8V], (instregex "^LD4[BHWD]_IMM$")>;
+
+// Contiguous Load four structures to four vectors, scalar + scalar
+def : InstRW<[V3Write_10c_4L_8V_4I], (instregex "^LD4[BHWD]$")>;
+
+// Gather load, vector + imm, 32-bit element size
+def : InstRW<[V3Write_9c_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_IMM$",
+ "^GLD(FF)?1W_IMM$")>;
+
+// Gather load, vector + imm, 64-bit element size
+def : InstRW<[V3Write_9c_1L_4V], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$",
+ "^GLD(FF)?1D_IMM$")>;
+
+// Gather load, 32-bit scaled offset
+def : InstRW<[V3Write_10c_1L_8V],
+ (instregex "^GLD(FF)?1S?H_S_[SU]XTW_SCALED$",
+ "^GLD(FF)?1W_[SU]XTW_SCALED")>;
+
+// Gather load, 64-bit scaled offset
+// NOTE: These instructions are not specified in the SOG.
+def : InstRW<[V3Write_10c_1L_4V],
+ (instregex "^GLD(FF)?1S?[HW]_D_([SU]XTW_)?SCALED$",
+ "^GLD(FF)?1D_([SU]XTW_)?SCALED$")>;
+
+// Gather load, 32-bit unpacked unscaled offset
+def : InstRW<[V3Write_9c_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$",
+ "^GLD(FF)?1W_[SU]XTW$")>;
+
+// Gather load, 64-bit unpacked unscaled offset
+// NOTE: These instructions are not specified in the SOG.
+def : InstRW<[V3Write_9c_1L_2V],
+ (instregex "^GLD(FF)?1S?[BHW]_D(_[SU]XTW)?$",
+ "^GLD(FF)?1D(_[SU]XTW)?$")>;
+
+// §3.29 SVE Store instructions
+// -----------------------------------------------------------------------------
+
+// Store from predicate reg
+def : InstRW<[V3Write_1c_1SA], (instrs STR_PXI)>;
+
+// Store from vector reg
+def : InstRW<[V3Write_2c_1SA_1V01], (instrs STR_ZXI)>;
+
+// Contiguous store, scalar + imm
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^ST1[BHWD]_IMM$",
+ "^ST1B_[HSD]_IMM$",
+ "^ST1H_[SD]_IMM$",
+ "^ST1W_D_IMM$")>;
+
+// Contiguous store, scalar + scalar
+def : InstRW<[V3Write_2c_1SA_1I_1V01], (instregex "^ST1H(_[SD])?$")>;
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^ST1[BWD]$",
+ "^ST1B_[HSD]$",
+ "^ST1W_D$")>;
+
+// Contiguous store two structures from two vectors, scalar + imm
+def : InstRW<[V3Write_4c_1SA_1V01], (instregex "^ST2[BHWD]_IMM$")>;
+
+// Contiguous store two structures from two vectors, scalar + scalar
+def : InstRW<[V3Write_4c_2SA_2I_2V01], (instrs ST2H)>;
+def : InstRW<[V3Write_4c_2SA_2V01], (instregex "^ST2[BWD]$")>;
+
+// Contiguous store three structures from three vectors, scalar + imm
+def : InstRW<[V3Write_7c_9SA_9V01], (instregex "^ST3[BHWD]_IMM$")>;
+
+// Contiguous store three structures from three vectors, scalar + scalar
+def : InstRW<[V3Write_7c_9SA_9I_9V01], (instregex "^ST3[BHWD]$")>;
+
+// Contiguous store four structures from four vectors, scalar + imm
+def : InstRW<[V3Write_11c_18SA_18V01], (instregex "^ST4[BHWD]_IMM$")>;
+
+// Contiguous store four structures from four vectors, scalar + scalar
+def : InstRW<[V3Write_11c_18SA_18I_18V01], (instregex "^ST4[BHWD]$")>;
+
+// Non temporal store, scalar + imm
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STNT1[BHWD]_ZRI$")>;
+
+// Non temporal store, scalar + scalar
+def : InstRW<[V3Write_2c_1SA_1I_1V01], (instrs STNT1H_ZRR)>;
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STNT1[BWD]_ZRR$")>;
+
+// Scatter non temporal store, vector + scalar 32-bit element size
+def : InstRW<[V3Write_4c_6SA_6V01], (instregex "^STNT1[BHW]_ZZR_S")>;
+
+// Scatter non temporal store, vector + scalar 64-bit element size
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^STNT1[BHWD]_ZZR_D")>;
+
+// Scatter store vector + imm 32-bit element size
+def : InstRW<[V3Write_4c_6SA_6V01], (instregex "^SST1[BH]_S_IMM$",
+ "^SST1W_IMM$")>;
+
+// Scatter store vector + imm 64-bit element size
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[BHW]_D_IMM$",
+ "^SST1D_IMM$")>;
+
+// Scatter store, 32-bit scaled offset
+def : InstRW<[V3Write_4c_6SA_6V01],
+ (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>;
+
+// Scatter store, 32-bit unpacked unscaled offset
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[BHW]_D_[SU]XTW$",
+ "^SST1D_[SU]XTW$")>;
+
+// Scatter store, 32-bit unpacked scaled offset
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
+ "^SST1D_[SU]XTW_SCALED$")>;
+
+// Scatter store, 32-bit unscaled offset
+def : InstRW<[V3Write_4c_6SA_6V01], (instregex "^SST1[BH]_S_[SU]XTW$",
+ "^SST1W_[SU]XTW$")>;
+
+// Scatter store, 64-bit scaled offset
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[HW]_D_SCALED$",
+ "^SST1D_SCALED$")>;
+
+// Scatter store, 64-bit unscaled offset
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[BHW]_D$",
+ "^SST1D$")>;
+
+// §3.30 SVE Miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// Read first fault register, unpredicated
+def : InstRW<[V3Write_2c_1M0], (instrs RDFFR_P)>;
+
+// Read first fault register, predicated
+def : InstRW<[V3Write_3or4c_1M0_1M], (instrs RDFFR_PPz)>;
+
+// Read first fault register and set flags
+def : InstRW<[V3Write_3or4c_1M0_1M], (instrs RDFFRS_PPz)>;
+
+// Set first fault register
+// Write to first fault register
+def : InstRW<[V3Write_2c_1M0], (instrs SETFFR, WRFFR)>;
+
+// Prefetch
+// NOTE: This is not specified in the SOG.
+def : InstRW<[V3Write_4c_1L], (instregex "^PRF[BHWD]")>;
+
+// §3.31 SVE Cryptographic instructions
+// -----------------------------------------------------------------------------
+
+// Crypto AES ops
+def : InstRW<[V3Write_2c_1V], (instregex "^AES[DE]_ZZZ_B$",
+ "^AESI?MC_ZZ_B$")>;
+
+// Crypto SHA3 ops
+def : InstRW<[V3Write_2c_1V], (instregex "^(BCAX|EOR3)_ZZZZ$",
+ "^RAX1_ZZZ_D$",
+ "^XAR_ZZZI_[BHSD]$")>;
+
+// Crypto SM4 ops
+def : InstRW<[V3Write_4c_1V0], (instregex "^SM4E(KEY)?_ZZZ_S$")>;
+
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td
new file mode 100644
index 0000000..0f1ec66
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td
@@ -0,0 +1,2705 @@
+//=- AArch64SchedNeoverseV3AE.td - NeoverseV3AE Scheduling Defs --*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for the Arm Neoverse V3AE processors.
+// All information is taken from the V3AE Software Optimisation Guide:
+//
+// https://developer.arm.com/documentation/109703/300/?lang=en
+//
+//===----------------------------------------------------------------------===//
+
+def NeoverseV3AEModel : SchedMachineModel {
+ let IssueWidth = 10; // Expect best value to be slightly higher than V2
+ let MicroOpBufferSize = 320; // Entries in micro-op re-order buffer. NOTE: Copied from Neoverse-V2
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 10; // Extra cycles for mispredicted branch. NOTE: Copied from N2.
+ let LoopMicroOpBufferSize = 16; // NOTE: Copied from Cortex-A57.
+ let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F,
+ [HasSVE2p1, HasSVEB16B16,
+ HasCPA, HasCSSC]);
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Neoverse V3AE.
+// Instructions are first fetched and then decoded into internal macro-ops
+// (MOPs). From there, the MOPs proceed through register renaming and dispatch
+// stages. A MOP can be split into two micro-ops further down the pipeline
+// after the decode stage. Once dispatched, micro-ops wait for their operands
+// and issue out-of-order to one of nineteen issue pipelines. Each issue
+// pipeline can accept one micro-op per cycle.
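+//
+// A MOP that splits into several micro-ops is modelled below by a
+// SchedWriteRes listing one issue pipeline per micro-op, with NumMicroOps set
+// accordingly and Latency giving the overall result latency. Illustrative
+// sketch only (the def name is a placeholder, not taken from the SOG):
+//
+//   def ExampleWrite_5c_1M0_1V : SchedWriteRes<[V3AEUnitM0, V3AEUnitV]> {
+//     let Latency     = 5;  // cycles until the result is available
+//     let NumMicroOps = 2;  // one micro-op on M0, one on V0/V1
+//   }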
+
+let SchedModel = NeoverseV3AEModel in {
+
+// Define the (19) issue ports.
+def V3AEUnitB : ProcResource<3>; // Branch 0/1/2
+def V3AEUnitS0 : ProcResource<1>; // Integer single-cycle 0
+def V3AEUnitS1 : ProcResource<1>; // Integer single-cycle 1
+def V3AEUnitS2 : ProcResource<1>; // Integer single-cycle 2
+def V3AEUnitS3 : ProcResource<1>; // Integer single-cycle 3
+def V3AEUnitS4 : ProcResource<1>; // Integer single-cycle 4
+def V3AEUnitS5 : ProcResource<1>; // Integer single-cycle 5
+def V3AEUnitM0 : ProcResource<1>; // Integer single/multicycle 0
+def V3AEUnitM1 : ProcResource<1>; // Integer single/multicycle 1
+def V3AEUnitV0 : ProcResource<1>; // FP/ASIMD 0
+def V3AEUnitV1 : ProcResource<1>; // FP/ASIMD 1
+def V3AEUnitLS0 : ProcResource<1>; // Load/Store 0
+def V3AEUnitL12 : ProcResource<2>; // Load 1/2
+def V3AEUnitST1 : ProcResource<1>; // Store 1
+def V3AEUnitD : ProcResource<2>; // Store data 0/1
+def V3AEUnitFlg : ProcResource<4>; // Flags
+
+def V3AEUnitS : ProcResGroup<[V3AEUnitS0, V3AEUnitS1, V3AEUnitS2, V3AEUnitS3, V3AEUnitS4, V3AEUnitS5]>; // Integer single-cycle 0/1/2/3/4/5
+def V3AEUnitI : ProcResGroup<[V3AEUnitS0, V3AEUnitS1, V3AEUnitS2, V3AEUnitS3, V3AEUnitS4, V3AEUnitS5, V3AEUnitM0, V3AEUnitM1]>; // Integer single-cycle 0/1/2/3/4/5 and single/multicycle 0/1
+def V3AEUnitM : ProcResGroup<[V3AEUnitM0, V3AEUnitM1]>; // Integer single/multicycle 0/1
+def V3AEUnitLSA : ProcResGroup<[V3AEUnitLS0, V3AEUnitL12, V3AEUnitST1]>; // Supergroup of L+SA
+def V3AEUnitL : ProcResGroup<[V3AEUnitLS0, V3AEUnitL12]>; // Load/Store 0 and Load 1/2
+def V3AEUnitSA : ProcResGroup<[V3AEUnitLS0, V3AEUnitST1]>; // Load/Store 0 and Store 1
+def V3AEUnitV : ProcResGroup<[V3AEUnitV0, V3AEUnitV1]>; // FP/ASIMD 0/1
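+
+// NOTE: a ProcResGroup allows a micro-op to issue on any of its member
+// pipelines, e.g. a micro-op scheduled on V3AEUnitV may execute on either
+// V3AEUnitV0 or V3AEUnitV1.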
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST, 0>;
+def : ReadAdvance<ReadVLD, 0>;
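+
+// NOTE: a ReadAdvance of 0 cycles means the consumer observes the producer's
+// full write latency, i.e. no operand forwarding is modelled for these read
+// types.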
+
+// NOTE: Copied from N2.
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+def : WriteRes<WriteLDHi, []> { let Latency = 4; }
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Neoverse V3AE.
+
+//===----------------------------------------------------------------------===//
+
+// Define generic 0 micro-op types
+def V3AEWrite_0c : SchedWriteRes<[]> { let Latency = 0; }
+
+// Define generic 1 micro-op types
+
+def V3AEWrite_1c_1B : SchedWriteRes<[V3AEUnitB]> { let Latency = 1; }
+def V3AEWrite_1c_1F_1Flg : SchedWriteRes<[V3AEUnitI, V3AEUnitFlg]> { let Latency = 1; }
+def V3AEWrite_1c_1I : SchedWriteRes<[V3AEUnitI]> { let Latency = 1; }
+def V3AEWrite_1c_1M : SchedWriteRes<[V3AEUnitM]> { let Latency = 1; }
+def V3AEWrite_1c_1SA : SchedWriteRes<[V3AEUnitSA]> { let Latency = 1; }
+def V3AEWrite_2c_1M : SchedWriteRes<[V3AEUnitM]> { let Latency = 2; }
+def V3AEWrite_2c_1M_1Flg : SchedWriteRes<[V3AEUnitM, V3AEUnitFlg]> { let Latency = 2; }
+def V3AEWrite_3c_1M : SchedWriteRes<[V3AEUnitM]> { let Latency = 3; }
+def V3AEWrite_2c_1M0 : SchedWriteRes<[V3AEUnitM0]> { let Latency = 2; }
+def V3AEWrite_3c_1M0 : SchedWriteRes<[V3AEUnitM0]> { let Latency = 3; }
+def V3AEWrite_4c_1M0 : SchedWriteRes<[V3AEUnitM0]> { let Latency = 4; }
+def V3AEWrite_12c_1M0 : SchedWriteRes<[V3AEUnitM0]> { let Latency = 12;
+ let ReleaseAtCycles = [12]; }
+def V3AEWrite_20c_1M0 : SchedWriteRes<[V3AEUnitM0]> { let Latency = 20;
+ let ReleaseAtCycles = [20]; }
+def V3AEWrite_4c_1L : SchedWriteRes<[V3AEUnitL]> { let Latency = 4; }
+def V3AEWrite_6c_1L : SchedWriteRes<[V3AEUnitL]> { let Latency = 6; }
+def V3AEWrite_2c_1V : SchedWriteRes<[V3AEUnitV]> { let Latency = 2; }
+def V3AEWrite_2c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 2; }
+def V3AEWrite_3c_1V : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; }
+def V3AEWrite_4c_1V : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AEWrite_5c_1V : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; }
+def V3AEWrite_6c_1V : SchedWriteRes<[V3AEUnitV]> { let Latency = 6; }
+def V3AEWrite_12c_1V : SchedWriteRes<[V3AEUnitV]> { let Latency = 12; }
+def V3AEWrite_3c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 3; }
+def V3AEWrite_4c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AEWrite_9c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 9; }
+def V3AEWrite_10c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 10; }
+def V3AEWrite_8c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 8; }
+def V3AEWrite_12c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 12;
+ let ReleaseAtCycles = [11]; }
+def V3AEWrite_13c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 13; }
+def V3AEWrite_15c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 15; }
+def V3AEWrite_13c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 13;
+ let ReleaseAtCycles = [8]; }
+def V3AEWrite_16c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 16; }
+def V3AEWrite_20c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 20;
+ let ReleaseAtCycles = [20]; }
+def V3AEWrite_2c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 2; }
+def V3AEWrite_3c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 3; }
+def V3AEWrite_4c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 4; }
+def V3AEWrite_6c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 6; }
+def V3AEWrite_10c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 10; }
+def V3AEWrite_6c_1SA : SchedWriteRes<[V3AEUnitSA]> { let Latency = 6; }
+
+//===----------------------------------------------------------------------===//
+// Define generic 2 micro-op types
+
+def V3AEWrite_1c_1B_1S : SchedWriteRes<[V3AEUnitB, V3AEUnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_1M0_1B : SchedWriteRes<[V3AEUnitM0, V3AEUnitB]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_9c_1M0_1L : SchedWriteRes<[V3AEUnitM0, V3AEUnitL]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_3c_1I_1M : SchedWriteRes<[V3AEUnitI, V3AEUnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_1c_2M : SchedWriteRes<[V3AEUnitM, V3AEUnitM]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_3c_2M : SchedWriteRes<[V3AEUnitM, V3AEUnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_4c_2M : SchedWriteRes<[V3AEUnitM, V3AEUnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_5c_1L_1I : SchedWriteRes<[V3AEUnitL, V3AEUnitI]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_1I_1L : SchedWriteRes<[V3AEUnitI, V3AEUnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_7c_1I_1L : SchedWriteRes<[V3AEUnitI, V3AEUnitL]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_1c_1SA_1D : SchedWriteRes<[V3AEUnitSA, V3AEUnitD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_5c_1M0_1V : SchedWriteRes<[V3AEUnitM0, V3AEUnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_2c_1SA_1V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_2c_2V : SchedWriteRes<[V3AEUnitV, V3AEUnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_5c_1V1_1V : SchedWriteRes<[V3AEUnitV1, V3AEUnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_4c_2V0 : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_4c_2V : SchedWriteRes<[V3AEUnitV, V3AEUnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_2V : SchedWriteRes<[V3AEUnitV, V3AEUnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_2L : SchedWriteRes<[V3AEUnitL, V3AEUnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_8c_1L_1V : SchedWriteRes<[V3AEUnitL, V3AEUnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_4c_1SA_1V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_3c_1M0_1M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_4c_1M0_1M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_1c_1M0_1M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_2c_1M0_1M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_2V1 : SchedWriteRes<[V3AEUnitV1, V3AEUnitV1]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_5c_2V0 : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_5c_1V1_1M0 : SchedWriteRes<[V3AEUnitV1, V3AEUnitM0]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_1V1_1M0 : SchedWriteRes<[V3AEUnitV1, V3AEUnitM0]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_7c_1M0_1V0 : SchedWriteRes<[V3AEUnitM0, V3AEUnitV0]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_2c_1V0_1M : SchedWriteRes<[V3AEUnitV0, V3AEUnitM]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_3c_1V0_1M : SchedWriteRes<[V3AEUnitV0, V3AEUnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_1V_1V1 : SchedWriteRes<[V3AEUnitV, V3AEUnitV1]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_1L_1M : SchedWriteRes<[V3AEUnitL, V3AEUnitM]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_1L_1I : SchedWriteRes<[V3AEUnitL, V3AEUnitI]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def V3AEWrite_8c_1M0_1V : SchedWriteRes<[V3AEUnitM0, V3AEUnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 3 micro-op types
+
+def V3AEWrite_1c_1SA_1D_1I : SchedWriteRes<[V3AEUnitSA, V3AEUnitD, V3AEUnitI]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+}
+
+def V3AEWrite_2c_1SA_1V_1I : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def V3AEWrite_2c_1SA_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def V3AEWrite_4c_1SA_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def V3AEWrite_9c_1L_2V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+}
+
+def V3AEWrite_4c_3V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def V3AEWrite_7c_1M_1M0_1V : SchedWriteRes<[V3AEUnitM, V3AEUnitM0, V3AEUnitV]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+def V3AEWrite_2c_1SA_1I_1V : SchedWriteRes<[V3AEUnitSA, V3AEUnitI, V3AEUnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def V3AEWrite_6c_3L : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def V3AEWrite_6c_3V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def V3AEWrite_8c_1L_2V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 4 micro-op types
+
+def V3AEWrite_2c_1SA_2V_1I : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+ V3AEUnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_5c_1I_3L : SchedWriteRes<[V3AEUnitI, V3AEUnitL, V3AEUnitL, V3AEUnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_6c_4V0 : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0, V3AEUnitV0, V3AEUnitV0]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_8c_4V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_6c_2V_2V1 : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV1,
+ V3AEUnitV1]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_6c_4V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_8c_2L_2V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_9c_2L_2V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_2c_2SA_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_4c_2SA_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_8c_2M0_2V0 : SchedWriteRes<[V3AEUnitM0, V3AEUnitM0, V3AEUnitV0,
+ V3AEUnitV0]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_8c_2V_2V1 : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV1,
+ V3AEUnitV1]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_4c_2M0_2M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM0, V3AEUnitM,
+ V3AEUnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_5c_2M0_2M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM0, V3AEUnitM,
+ V3AEUnitM]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_6c_2I_2L : SchedWriteRes<[V3AEUnitI, V3AEUnitI, V3AEUnitL, V3AEUnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_7c_4L : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, V3AEUnitL]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+}
+
+def V3AEWrite_6c_1SA_3V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 5 micro-op types
+
+def V3AEWrite_2c_1SA_2V_2I : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+ V3AEUnitI, V3AEUnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 5;
+}
+
+def V3AEWrite_8c_2L_3V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+
+def V3AEWrite_9c_1L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
+
+def V3AEWrite_10c_1L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 5;
+}
+
+def V3AEWrite_6c_5V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 5;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 6 micro-op types
+
+def V3AEWrite_8c_3L_3V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+}
+
+def V3AEWrite_9c_3L_3V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+
+def V3AEWrite_9c_2L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+
+def V3AEWrite_9c_2L_2V_2I : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV,
+ V3AEUnitV, V3AEUnitI, V3AEUnitI]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+
+def V3AEWrite_9c_2V_4V1 : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV1,
+ V3AEUnitV1, V3AEUnitV1, V3AEUnitV1]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+
+def V3AEWrite_2c_3SA_3V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 6;
+}
+
+def V3AEWrite_4c_2SA_4V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+
+def V3AEWrite_5c_2SA_4V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+}
+
+def V3AEWrite_4c_2SA_2I_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitI,
+ V3AEUnitI, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 7 micro-op types
+
+def V3AEWrite_8c_3L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 7;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 8 micro-op types
+
+def V3AEWrite_2c_4SA_4V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 8;
+}
+
+def V3AEWrite_4c_4SA_4V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 8;
+}
+
+def V3AEWrite_6c_2SA_6V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 8;
+}
+
+def V3AEWrite_8c_4L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, V3AEUnitL,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 9 micro-op types
+
+def V3AEWrite_6c_3SA_6V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 9;
+}
+
+def V3AEWrite_10c_1L_8V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 9;
+}
+
+def V3AEWrite_10c_3V_3L_3I : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitL, V3AEUnitL, V3AEUnitL,
+ V3AEUnitI, V3AEUnitI, V3AEUnitI]> {
+ let Latency = 10;
+ let NumMicroOps = 9;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 10 micro-op types
+
+def V3AEWrite_9c_6L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, V3AEUnitL,
+ V3AEUnitL, V3AEUnitL, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 10;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 12 micro-op types
+
+def V3AEWrite_5c_4SA_8V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 12;
+}
+
+def V3AEWrite_9c_4L_8V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL,
+ V3AEUnitL, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 12;
+}
+
+def V3AEWrite_10c_4L_8V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL,
+ V3AEUnitL, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 12;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 16 micro-op types
+
+def V3AEWrite_7c_4SA_12V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 7;
+ let NumMicroOps = 16;
+}
+
+def V3AEWrite_10c_4L_8V_4I : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL,
+ V3AEUnitL, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitI, V3AEUnitI, V3AEUnitI,
+ V3AEUnitI]> {
+ let Latency = 10;
+ let NumMicroOps = 16;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 18 micro-op types
+
+def V3AEWrite_7c_9SA_9V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 7;
+ let NumMicroOps = 18;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 27 micro-op types
+
+def V3AEWrite_7c_9SA_9I_9V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitI, V3AEUnitI, V3AEUnitI,
+ V3AEUnitI, V3AEUnitI, V3AEUnitI,
+ V3AEUnitI, V3AEUnitI, V3AEUnitI,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 7;
+ let NumMicroOps = 27;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 36 micro-op types
+
+def V3AEWrite_11c_18SA_18V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+ let Latency = 11;
+ let NumMicroOps = 36;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 54 micro-op types
+
+def V3AEWrite_11c_18SA_18I_18V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitSA, V3AEUnitSA,
+ V3AEUnitI, V3AEUnitI, V3AEUnitI,
+ V3AEUnitI, V3AEUnitI, V3AEUnitI,
+ V3AEUnitI, V3AEUnitI, V3AEUnitI,
+ V3AEUnitI, V3AEUnitI, V3AEUnitI,
+ V3AEUnitI, V3AEUnitI, V3AEUnitI,
+ V3AEUnitI, V3AEUnitI, V3AEUnitI,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV, V3AEUnitV,
+ V3AEUnitV, V3AEUnitV,
+ V3AEUnitV]> {
+ let Latency = 11;
+ let NumMicroOps = 54;
+}
+
+//===----------------------------------------------------------------------===//
+// Define predicate-controlled types
+
+def V3AEWrite_ArithI : SchedWriteVariant<[
+ SchedVar<IsCheapLSL, [V3AEWrite_1c_1I]>,
+ SchedVar<NoSchedPred, [V3AEWrite_2c_1M]>]>;
+
+def V3AEWrite_ArithF : SchedWriteVariant<[
+ SchedVar<IsCheapLSL, [V3AEWrite_1c_1F_1Flg]>,
+ SchedVar<NoSchedPred, [V3AEWrite_2c_1M_1Flg]>]>;
+
+def V3AEWrite_Logical : SchedWriteVariant<[
+ SchedVar<NeoverseNoLSL, [V3AEWrite_1c_1F_1Flg]>,
+ SchedVar<NoSchedPred, [V3AEWrite_2c_1M_1Flg]>]>;
+
+def V3AEWrite_Extr : SchedWriteVariant<[
+ SchedVar<IsRORImmIdiomPred, [V3AEWrite_1c_1I]>,
+ SchedVar<NoSchedPred, [V3AEWrite_3c_1I_1M]>]>;
+
+def V3AEWrite_LdrHQ : SchedWriteVariant<[
+ SchedVar<NeoverseHQForm, [V3AEWrite_7c_1I_1L]>,
+ SchedVar<NoSchedPred, [V3AEWrite_6c_1L]>]>;
+
+def V3AEWrite_StrHQ : SchedWriteVariant<[
+ SchedVar<NeoverseHQForm, [V3AEWrite_2c_1SA_1V_1I]>,
+ SchedVar<NoSchedPred, [V3AEWrite_2c_1SA_1V]>]>;
+
+def V3AEWrite_0or1c_1I : SchedWriteVariant<[
+ SchedVar<NeoverseZeroMove, [V3AEWrite_0c]>,
+ SchedVar<NoSchedPred, [V3AEWrite_1c_1I]>]>;
+
+def V3AEWrite_0or2c_1V : SchedWriteVariant<[
+ SchedVar<NeoverseZeroMove, [V3AEWrite_0c]>,
+ SchedVar<NoSchedPred, [V3AEWrite_2c_1V]>]>;
+
+def V3AEWrite_0or3c_1M0 : SchedWriteVariant<[
+ SchedVar<NeoverseZeroMove, [V3AEWrite_0c]>,
+ SchedVar<NoSchedPred, [V3AEWrite_3c_1M0]>]>;
+
+def V3AEWrite_2or3c_1M : SchedWriteVariant<[
+ SchedVar<NeoversePdIsPg, [V3AEWrite_3c_1M]>,
+ SchedVar<NoSchedPred, [V3AEWrite_2c_1M]>]>;
+
+def V3AEWrite_1or2c_1M : SchedWriteVariant<[
+ SchedVar<NeoversePdIsPg, [V3AEWrite_2c_1M]>,
+ SchedVar<NoSchedPred, [V3AEWrite_1c_1M]>]>;
+
+def V3AEWrite_3or4c_1M0_1M : SchedWriteVariant<[
+ SchedVar<NeoversePdIsPg, [V3AEWrite_4c_1M0_1M]>,
+ SchedVar<NoSchedPred, [V3AEWrite_3c_1M0_1M]>]>;
+
+def V3AEWrite_2or3c_1V0 : SchedWriteVariant<[
+ SchedVar<NeoversePdIsPg, [V3AEWrite_3c_1V0]>,
+ SchedVar<NoSchedPred, [V3AEWrite_2c_1V0]>]>;
+
+def V3AEWrite_2or3c_1V0_1M : SchedWriteVariant<[
+ SchedVar<NeoversePdIsPg, [V3AEWrite_3c_1V0_1M]>,
+ SchedVar<NoSchedPred, [V3AEWrite_2c_1V0_1M]>]>;
+
+def V3AEWrite_IncDec : SchedWriteVariant<[
+ SchedVar<NeoverseCheapIncDec, [V3AEWrite_1c_1I]>,
+ SchedVar<NoSchedPred, [V3AEWrite_2c_1M]>]>;
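+// Note: a SchedWriteVariant selects the first SchedVar whose predicate matches
+// the instruction; NoSchedPred always matches and acts as the fallback case.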
+
+//===----------------------------------------------------------------------===//
+// Define forwarded types
+
+// NOTE: SOG, p. 16, n. 2: Accumulator forwarding is not supported for
+// consumers of 64-bit multiply high operations?
+def V3AEWr_IM : SchedWriteRes<[V3AEUnitM]> { let Latency = 2; }
+
+def V3AEWr_FMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_FMA : SchedReadAdvance<2, [WriteFMul, V3AEWr_FMA]>;
+
+def V3AEWr_VA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_VA : SchedReadAdvance<3, [V3AEWr_VA]>;
+
+def V3AEWr_VDOT : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; }
+def V3AERd_VDOT : SchedReadAdvance<2, [V3AEWr_VDOT]>;
+
+def V3AEWr_VMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; }
+def V3AERd_VMMA : SchedReadAdvance<2, [V3AEWr_VMMA]>;
+
+def V3AEWr_VMA : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AERd_VMA : SchedReadAdvance<3, [V3AEWr_VMA]>;
+
+def V3AEWr_VMAH : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 4; }
+def V3AERd_VMAH : SchedReadAdvance<2, [V3AEWr_VMAH]>;
+
+def V3AEWr_VMAL : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AERd_VMAL : SchedReadAdvance<3, [V3AEWr_VMAL]>;
+
+def V3AEWr_VPA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_VPA : SchedReadAdvance<3, [V3AEWr_VPA]>;
+
+def V3AEWr_VSA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_VSA : SchedReadAdvance<3, [V3AEWr_VSA]>;
+
+def V3AEWr_VFCMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_VFCMA : SchedReadAdvance<2, [V3AEWr_VFCMA]>;
+
+def V3AEWr_VFM : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; }
+def V3AEWr_VFMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_VFMA : SchedReadAdvance<2, [V3AEWr_VFM, V3AEWr_VFMA]>;
+
+def V3AEWr_VFMAL : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_VFMAL : SchedReadAdvance<2, [V3AEWr_VFMAL]>;
+
+def V3AEWr_VBFDOT : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; }
+def V3AERd_VBFDOT : SchedReadAdvance<2, [V3AEWr_VBFDOT]>;
+def V3AEWr_VBFMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 6; }
+def V3AERd_VBFMMA : SchedReadAdvance<2, [V3AEWr_VBFMMA]>;
+def V3AEWr_VBFMAL : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; }
+def V3AERd_VBFMAL : SchedReadAdvance<3, [V3AEWr_VBFMAL]>;
+
+def V3AEWr_CRC : SchedWriteRes<[V3AEUnitM0]> { let Latency = 2; }
+def V3AERd_CRC : SchedReadAdvance<1, [V3AEWr_CRC]>;
+
+def V3AEWr_ZA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_ZA : SchedReadAdvance<3, [V3AEWr_ZA]>;
+def V3AEWr_ZPA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_ZPA : SchedReadAdvance<3, [V3AEWr_ZPA]>;
+def V3AEWr_ZSA : SchedWriteRes<[V3AEUnitV1]> { let Latency = 4; }
+def V3AERd_ZSA : SchedReadAdvance<3, [V3AEWr_ZSA]>;
+
+def V3AEWr_ZDOTB : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; }
+def V3AERd_ZDOTB : SchedReadAdvance<2, [V3AEWr_ZDOTB]>;
+def V3AEWr_ZDOTH : SchedWriteRes<[V3AEUnitV0]> { let Latency = 3; }
+def V3AERd_ZDOTH : SchedReadAdvance<2, [V3AEWr_ZDOTH]>;
+
+// NOTE: SOG p. 43: Complex multiply-add B, H, S element size: How to reduce
+// throughput to 1 in case of forwarding?
+def V3AEWr_ZCMABHS : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AERd_ZCMABHS : SchedReadAdvance<3, [V3AEWr_ZCMABHS]>;
+def V3AEWr_ZCMAD : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 5; }
+def V3AERd_ZCMAD : SchedReadAdvance<2, [V3AEWr_ZCMAD]>;
+
+def V3AEWr_ZMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; }
+def V3AERd_ZMMA : SchedReadAdvance<2, [V3AEWr_ZMMA]>;
+
+def V3AEWr_ZMABHS : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AERd_ZMABHS : SchedReadAdvance<3, [V3AEWr_ZMABHS]>;
+def V3AEWr_ZMAD : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 5; }
+def V3AERd_ZMAD : SchedReadAdvance<2, [V3AEWr_ZMAD]>;
+
+def V3AEWr_ZMAL : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AERd_ZMAL : SchedReadAdvance<3, [V3AEWr_ZMAL]>;
+
+def V3AEWr_ZMASQL : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AEWr_ZMASQBHS : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AEWr_ZMASQD : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 5; }
+def V3AERd_ZMASQ : SchedReadAdvance<2, [V3AEWr_ZMASQL, V3AEWr_ZMASQBHS,
+ V3AEWr_ZMASQD]>;
+
+def V3AEWr_ZFCMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; }
+def V3AERd_ZFCMA : SchedReadAdvance<3, [V3AEWr_ZFCMA]>;
+
+def V3AEWr_ZFMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_ZFMA : SchedReadAdvance<2, [V3AEWr_ZFMA]>;
+
+def V3AEWr_ZFMAL : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_ZFMAL : SchedReadAdvance<2, [V3AEWr_ZFMAL]>;
+
+def V3AEWr_ZBFDOT : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; }
+def V3AERd_ZBFDOT : SchedReadAdvance<2, [V3AEWr_ZBFDOT]>;
+def V3AEWr_ZBFMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 6; }
+def V3AERd_ZBFMMA : SchedReadAdvance<2, [V3AEWr_ZBFMMA]>;
+def V3AEWr_ZBFMAL : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; }
+def V3AERd_ZBFMAL : SchedReadAdvance<3, [V3AEWr_ZBFMAL]>;
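+// Note: each V3AERd_* read advance lets a consumer read its accumulator
+// operand N cycles early when the producer is one of the listed V3AEWr_*
+// writes, modelling the late (forwarded) accumulator input of
+// multiply-accumulate operations.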
+
+//===----------------------------------------------------------------------===//
+// Define types with long resource cycles (rc)
+
+def V3AEWrite_6c_1V1_5rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 6; let ReleaseAtCycles = [ 5]; }
+def V3AEWrite_9c_1V1_2rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 9; let ReleaseAtCycles = [ 2]; }
+def V3AEWrite_9c_1V1_4rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 9; let ReleaseAtCycles = [ 4]; }
+def V3AEWrite_10c_1V1_9rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 10; let ReleaseAtCycles = [ 9]; }
+def V3AEWrite_11c_1V1_4rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 11; let ReleaseAtCycles = [ 4]; }
+def V3AEWrite_13c_1V1_8rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 13; let ReleaseAtCycles = [8]; }
+def V3AEWrite_14c_1V1_2rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 14; let ReleaseAtCycles = [2]; }
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
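+// Note: in the sections below, SchedAlias remaps a generic SchedWrite class
+// for the whole model, whereas InstRW entries override the write sequence only
+// for the instructions they match and take precedence over the generic mapping.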
+
+// §3.3 Branch instructions
+// -----------------------------------------------------------------------------
+
+// Branch, immed
+// Compare and branch
+def : SchedAlias<WriteBr, V3AEWrite_1c_1B>;
+
+// Branch, register
+def : SchedAlias<WriteBrReg, V3AEWrite_1c_1B>;
+
+// Branch and link, immed
+// Branch and link, register
+def : InstRW<[V3AEWrite_1c_1B_1S], (instrs BL, BLR)>;
+
+// §3.4 Arithmetic and Logical Instructions
+// -----------------------------------------------------------------------------
+
+// ALU, basic
+def : SchedAlias<WriteI, V3AEWrite_1c_1I>;
+
+// ALU, basic, flagset
+def : InstRW<[V3AEWrite_1c_1F_1Flg],
+ (instregex "^(ADD|SUB)S[WX]r[ir]$",
+ "^(ADC|SBC)S[WX]r$",
+ "^ANDS[WX]ri$",
+ "^(AND|BIC)S[WX]rr$")>;
+def : InstRW<[V3AEWrite_0or1c_1I], (instregex "^MOVZ[WX]i$")>;
+
+// ALU, extend and shift
+def : SchedAlias<WriteIEReg, V3AEWrite_2c_1M>;
+
+// Arithmetic, LSL shift, shift <= 4
+// Arithmetic, flagset, LSL shift, shift <= 4
+// Arithmetic, LSR/ASR/ROR shift or LSL shift > 4
+def : SchedAlias<WriteISReg, V3AEWrite_ArithI>;
+def : InstRW<[V3AEWrite_ArithF],
+ (instregex "^(ADD|SUB)S[WX]rs$")>;
+
+// Arithmetic, immediate to logical address tag
+def : InstRW<[V3AEWrite_2c_1M], (instrs ADDG, SUBG)>;
+
+// Conditional compare
+def : InstRW<[V3AEWrite_1c_1F_1Flg], (instregex "^CCM[NP][WX][ir]")>;
+
+// Convert floating-point condition flags
+// Flag manipulation instructions
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+
+// Insert Random Tags
+def : InstRW<[V3AEWrite_2c_1M], (instrs IRG, IRGstack)>;
+
+// Insert Tag Mask
+// Subtract Pointer
+def : InstRW<[V3AEWrite_1c_1I], (instrs GMI, SUBP)>;
+
+// Subtract Pointer, flagset
+def : InstRW<[V3AEWrite_1c_1F_1Flg], (instrs SUBPS)>;
+
+// Logical, shift, no flagset
+def : InstRW<[V3AEWrite_1c_1I], (instregex "^(AND|BIC|EON|EOR|ORN)[WX]rs$")>;
+def : InstRW<[V3AEWrite_0or1c_1I], (instregex "^ORR[WX]rs$")>;
+
+// Logical, shift, flagset
+def : InstRW<[V3AEWrite_Logical], (instregex "^(AND|BIC)S[WX]rs$")>;
+
+// Move and shift instructions
+// -----------------------------------------------------------------------------
+
+def : SchedAlias<WriteImm, V3AEWrite_1c_1I>;
+
+// §3.5 Divide and multiply instructions
+// -----------------------------------------------------------------------------
+
+// SDIV, UDIV
+def : SchedAlias<WriteID32, V3AEWrite_12c_1M0>;
+def : SchedAlias<WriteID64, V3AEWrite_20c_1M0>;
+
+def : SchedAlias<WriteIM32, V3AEWrite_2c_1M>;
+def : SchedAlias<WriteIM64, V3AEWrite_2c_1M>;
+
+// Multiply
+// Multiply accumulate, W-form
+// Multiply accumulate, X-form
+def : InstRW<[V3AEWr_IM], (instregex "^M(ADD|SUB)[WX]rrr$")>;
+
+// Multiply accumulate long
+// Multiply long
+def : InstRW<[V3AEWr_IM], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+
+// Multiply high
+def : InstRW<[V3AEWrite_3c_1M], (instrs SMULHrr, UMULHrr)>;
+
+// §3.6 Pointer Authentication Instructions (v8.3 PAC)
+// -----------------------------------------------------------------------------
+
+// Authenticate data address
+// Authenticate instruction address
+// Compute pointer authentication code for data address
+// Compute pointer authentication code, using generic key
+// Compute pointer authentication code for instruction address
+def : InstRW<[V3AEWrite_4c_1M0], (instregex "^AUT", "^PAC")>;
+
+// Branch and link, register, with pointer authentication
+// Branch, register, with pointer authentication
+// Branch, return, with pointer authentication
+def : InstRW<[V3AEWrite_6c_1M0_1B], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
+ BRAAZ, BRAB, BRABZ, RETAA, RETAB,
+ ERETAA, ERETAB)>;
+
+// Load register, with pointer authentication
+def : InstRW<[V3AEWrite_9c_1M0_1L], (instregex "^LDRA[AB](indexed|writeback)")>;
+
+// Strip pointer authentication code
+def : InstRW<[V3AEWrite_2c_1M0], (instrs XPACD, XPACI, XPACLRI)>;
+
+// §3.7 Miscellaneous data-processing instructions
+// -----------------------------------------------------------------------------
+
+// Address generation
+def : InstRW<[V3AEWrite_1c_1I], (instrs ADR, ADRP)>;
+
+// Bitfield extract, one reg
+// Bitfield extract, two regs
+def : SchedAlias<WriteExtr, V3AEWrite_Extr>;
+def : InstRW<[V3AEWrite_Extr], (instrs EXTRWrri, EXTRXrri)>;
+
+// Bitfield move, basic
+def : SchedAlias<WriteIS, V3AEWrite_1c_1I>;
+
+// Bitfield move, insert
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^BFM[WX]ri$")>;
+
+// §3.8 Load instructions
+// -----------------------------------------------------------------------------
+
+// NOTE: SOG p. 19: Throughput of LDN?P X-form should be 2, but reported as 3.
+
+def : SchedAlias<WriteLD, V3AEWrite_4c_1L>;
+def : SchedAlias<WriteLDIdx, V3AEWrite_4c_1L>;
+
+// Load register, literal
+def : InstRW<[V3AEWrite_5c_1L_1I], (instrs LDRWl, LDRXl, LDRSWl, PRFMl)>;
+
+// Load pair, signed immed offset, signed words
+def : InstRW<[V3AEWrite_5c_1I_3L, WriteLDHi], (instrs LDPSWi)>;
+
+// Load pair, immed post-index or immed pre-index, signed words
+def : InstRW<[WriteAdr, V3AEWrite_5c_1I_3L, WriteLDHi],
+ (instregex "^LDPSW(post|pre)$")>;
+
+// §3.9 Store instructions
+// -----------------------------------------------------------------------------
+
+// NOTE: SOG, p. 20: Unsure if STRH uses pipeline I.
+
+def : SchedAlias<WriteST, V3AEWrite_1c_1SA_1D>;
+def : SchedAlias<WriteSTIdx, V3AEWrite_1c_1SA_1D>;
+def : SchedAlias<WriteSTP, V3AEWrite_1c_1SA_1D>;
+def : SchedAlias<WriteAdr, V3AEWrite_1c_1I>;
+
+// §3.10 Tag load instructions
+// -----------------------------------------------------------------------------
+
+// Load allocation tag
+// Load multiple allocation tags
+def : InstRW<[V3AEWrite_4c_1L], (instrs LDG, LDGM)>;
+
+// §3.11 Tag store instructions
+// -----------------------------------------------------------------------------
+
+// Store allocation tags to one or two granules, post-index
+// Store allocation tags to one or two granules, pre-index
+// Store allocation tag to one or two granules, zeroing, post-index
+// Store allocation tag to one or two granules, zeroing, pre-index
+// Store allocation tag and reg pair to memory, post-Index
+// Store allocation tag and reg pair to memory, pre-Index
+def : InstRW<[V3AEWrite_1c_1SA_1D_1I], (instrs STGPreIndex, STGPostIndex,
+ ST2GPreIndex, ST2GPostIndex,
+ STZGPreIndex, STZGPostIndex,
+ STZ2GPreIndex, STZ2GPostIndex,
+ STGPpre, STGPpost)>;
+
+// Store allocation tags to one or two granules, signed offset
+// Store allocation tag to two granules, zeroing, signed offset
+// Store allocation tag and reg pair to memory, signed offset
+// Store multiple allocation tags
+def : InstRW<[V3AEWrite_1c_1SA_1D], (instrs STGi, ST2Gi, STZGi,
+ STZ2Gi, STGPi, STGM, STZGM)>;
+
+// §3.12 FP data processing instructions
+// -----------------------------------------------------------------------------
+
+// FP absolute value
+// FP arithmetic
+// FP min/max
+// FP negate
+// FP select
+def : SchedAlias<WriteF, V3AEWrite_2c_1V>;
+
+// FP compare
+def : SchedAlias<WriteFCmp, V3AEWrite_2c_1V0>;
+
+// FP divide, square root
+def : SchedAlias<WriteFDiv, V3AEWrite_6c_1V1>;
+
+// FP divide, H-form
+def : InstRW<[V3AEWrite_6c_1V1], (instrs FDIVHrr)>;
+// FP divide, S-form
+def : InstRW<[V3AEWrite_8c_1V1], (instrs FDIVSrr)>;
+// FP divide, D-form
+def : InstRW<[V3AEWrite_13c_1V1], (instrs FDIVDrr)>;
+
+// FP square root, H-form
+def : InstRW<[V3AEWrite_6c_1V1], (instrs FSQRTHr)>;
+// FP square root, S-form
+def : InstRW<[V3AEWrite_8c_1V1], (instrs FSQRTSr)>;
+// FP square root, D-form
+def : InstRW<[V3AEWrite_13c_1V1], (instrs FSQRTDr)>;
+
+// FP multiply
+def : WriteRes<WriteFMul, [V3AEUnitV]> { let Latency = 3; }
+
+// FP multiply accumulate
+def : InstRW<[V3AEWr_FMA, ReadDefault, ReadDefault, V3AERd_FMA],
+ (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+
+// FP round to integral
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FRINT[AIMNPXZ][HSD]r$",
+ "^FRINT(32|64)[XZ][SD]r$")>;
+
+// §3.13 FP miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// FP convert, from gen to vec reg
+def : InstRW<[V3AEWrite_3c_1M0], (instregex "^[SU]CVTF[SU][WX][HSD]ri$")>;
+
+// FP convert, from vec to gen reg
+def : InstRW<[V3AEWrite_3c_1V0],
+ (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]ri?$")>;
+
+// FP convert, Javascript from vec to gen reg
+def : SchedAlias<WriteFCvt, V3AEWrite_3c_1V0>;
+
+// FP convert, from vec to vec reg
+def : InstRW<[V3AEWrite_3c_1V], (instrs FCVTSHr, FCVTDHr, FCVTHSr, FCVTDSr,
+ FCVTHDr, FCVTSDr, FCVTXNv1i64)>;
+
+// FP move, immed
+// FP move, register
+def : SchedAlias<WriteFImm, V3AEWrite_2c_1V>;
+
+// FP transfer, from gen to low half of vec reg
+def : InstRW<[V3AEWrite_0or3c_1M0],
+ (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr)>;
+
+// FP transfer, from gen to high half of vec reg
+def : InstRW<[V3AEWrite_5c_1M0_1V], (instrs FMOVXDHighr)>;
+
+// FP transfer, from vec to gen reg
+def : SchedAlias<WriteFCopy, V3AEWrite_2c_2V>;
+
+// §3.14 FP load instructions
+// -----------------------------------------------------------------------------
+
+// Load vector reg, literal, S/D/Q forms
+def : InstRW<[V3AEWrite_7c_1I_1L], (instregex "^LDR[SDQ]l$")>;
+
+// Load vector reg, unscaled immed
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LDUR[BHSDQ]i$")>;
+
+// Load vector reg, immed post-index
+// Load vector reg, immed pre-index
+def : InstRW<[WriteAdr, V3AEWrite_6c_1I_1L],
+ (instregex "^LDR[BHSDQ](pre|post)$")>;
+
+// Load vector reg, unsigned immed
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LDR[BHSDQ]ui$")>;
+
+// Load vector reg, register offset, basic
+// Load vector reg, register offset, scale, S/D-form
+// Load vector reg, register offset, scale, H/Q-form
+// Load vector reg, register offset, extend
+// Load vector reg, register offset, extend, scale, S/D-form
+// Load vector reg, register offset, extend, scale, H/Q-form
+def : InstRW<[V3AEWrite_LdrHQ, ReadAdrBase], (instregex "^LDR[BHSDQ]ro[WX]$")>;
+
+// Load vector pair, immed offset, S/D-form
+def : InstRW<[V3AEWrite_6c_1L, WriteLDHi], (instregex "^LDN?P[SD]i$")>;
+
+// Load vector pair, immed offset, Q-form
+def : InstRW<[V3AEWrite_6c_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>;
+
+// Load vector pair, immed post-index, S/D-form
+// Load vector pair, immed pre-index, S/D-form
+def : InstRW<[WriteAdr, V3AEWrite_6c_1I_1L, WriteLDHi],
+ (instregex "^LDP[SD](pre|post)$")>;
+
+// Load vector pair, immed post-index, Q-form
+// Load vector pair, immed pre-index, Q-form
+def : InstRW<[WriteAdr, V3AEWrite_6c_2I_2L, WriteLDHi], (instrs LDPQpost,
+ LDPQpre)>;
+
+// §3.15 FP store instructions
+// -----------------------------------------------------------------------------
+
+// Store vector reg, unscaled immed, B/H/S/D-form
+// Store vector reg, unscaled immed, Q-form
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STUR[BHSDQ]i$")>;
+
+// Store vector reg, immed post-index, B/H/S/D-form
+// Store vector reg, immed post-index, Q-form
+// Store vector reg, immed pre-index, B/H/S/D-form
+// Store vector reg, immed pre-index, Q-form
+def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V_1I],
+ (instregex "^STR[BHSDQ](pre|post)$")>;
+
+// Store vector reg, unsigned immed, B/H/S/D-form
+// Store vector reg, unsigned immed, Q-form
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STR[BHSDQ]ui$")>;
+
+// Store vector reg, register offset, basic, B/H/S/D-form
+// Store vector reg, register offset, basic, Q-form
+// Store vector reg, register offset, scale, H-form
+// Store vector reg, register offset, scale, S/D-form
+// Store vector reg, register offset, scale, Q-form
+// Store vector reg, register offset, extend, B/H/S/D-form
+// Store vector reg, register offset, extend, Q-form
+// Store vector reg, register offset, extend, scale, H-form
+// Store vector reg, register offset, extend, scale, S/D-form
+// Store vector reg, register offset, extend, scale, Q-form
+def : InstRW<[V3AEWrite_StrHQ, ReadAdrBase],
+ (instregex "^STR[BHSDQ]ro[WX]$")>;
+
+// Store vector pair, immed offset, S-form
+// Store vector pair, immed offset, D-form
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STN?P[SD]i$")>;
+
+// Store vector pair, immed offset, Q-form
+def : InstRW<[V3AEWrite_2c_1SA_2V], (instrs STPQi, STNPQi)>;
+
+// Store vector pair, immed post-index, S-form
+// Store vector pair, immed post-index, D-form
+// Store vector pair, immed pre-index, S-form
+// Store vector pair, immed pre-index, D-form
+def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V_1I],
+ (instregex "^STP[SD](pre|post)$")>;
+
+// Store vector pair, immed post-index, Q-form
+def : InstRW<[V3AEWrite_2c_1SA_2V_1I], (instrs STPQpost)>;
+
+// Store vector pair, immed pre-index, Q-form
+def : InstRW<[V3AEWrite_2c_1SA_2V_2I], (instrs STPQpre)>;
+
+// §3.16 ASIMD integer instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD absolute diff
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD arith, pair-wise
+// ASIMD compare
+// ASIMD logical
+// ASIMD max/min, basic and pair-wise
+def : SchedAlias<WriteVd, V3AEWrite_2c_1V>;
+def : SchedAlias<WriteVq, V3AEWrite_2c_1V>;
+
+// ASIMD absolute diff accum
+// ASIMD absolute diff accum long
+def : InstRW<[V3AEWr_VA, V3AERd_VA], (instregex "^[SU]ABAL?v")>;
+
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[V3AEWrite_3c_1V1], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
+
+// ASIMD arith, reduce, 8B/8H
+def : InstRW<[V3AEWrite_5c_1V1_1V],
+ (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>;
+
+// ASIMD arith, reduce, 16B
+def : InstRW<[V3AEWrite_6c_2V1], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>;
+
+// ASIMD dot product
+// ASIMD dot product using signed and unsigned integers
+def : InstRW<[V3AEWr_VDOT, V3AERd_VDOT],
+ (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
+
+// ASIMD matrix multiply-accumulate
+def : InstRW<[V3AEWr_VMMA, V3AERd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
+
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[V3AEWrite_3c_1V1], (instregex "^[SU](MAX|MIN)Vv4i16v$",
+ "^[SU](MAX|MIN)Vv4i32v$")>;
+
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[V3AEWrite_5c_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
+ "^[SU](MAX|MIN)Vv8i16v$")>;
+
+// ASIMD max/min, reduce, 16B
+def : InstRW<[V3AEWrite_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
+
+// ASIMD multiply
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>;
+
+// ASIMD multiply accumulate
+def : InstRW<[V3AEWr_VMA, V3AERd_VMA], (instregex "^MLAv", "^MLSv")>;
+
+// ASIMD multiply accumulate high
+def : InstRW<[V3AEWr_VMAH, V3AERd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
+
+// ASIMD multiply accumulate long
+def : InstRW<[V3AEWr_VMAL, V3AERd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
+
+// ASIMD multiply accumulate saturating long
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SQDML[AS]L[iv]")>;
+
+// ASIMD multiply/multiply long (8x8) polynomial, D-form
+// ASIMD multiply/multiply long (8x8) polynomial, Q-form
+def : InstRW<[V3AEWrite_3c_1V], (instregex "^PMULL?(v8i8|v16i8)$")>;
+
+// ASIMD multiply long
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^[SU]MULLv", "^SQDMULL[iv]")>;
+
+// ASIMD pairwise add and accumulate long
+def : InstRW<[V3AEWr_VPA, V3AERd_VPA], (instregex "^[SU]ADALPv")>;
+
+// ASIMD shift accumulate
+def : InstRW<[V3AEWr_VSA, V3AERd_VSA], (instregex "^[SU]SRA[dv]", "^[SU]RSRA[dv]")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^SHL[dv]", "^SHLLv", "^SHRNv",
+ "^SSHLLv", "^SSHR[dv]", "^USHLLv",
+ "^USHR[dv]")>;
+
+// ASIMD shift by immed and insert, basic
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^SLI[dv]", "^SRI[dv]")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[V3AEWrite_4c_1V],
+ (instregex "^RSHRNv", "^SQRSHRU?N[bhsv]", "^(SQSHLU?|UQSHL)[bhsd]$",
+ "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
+ "^SQSHRU?N[bhsv]", "^SRSHR[dv]", "^UQRSHRN[bhsv]",
+ "^UQSHRN[bhsv]", "^URSHR[dv]")>;
+
+// ASIMD shift by register, basic
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]SHLv")>;
+
+// ASIMD shift by register, complex
+def : InstRW<[V3AEWrite_4c_1V],
+ (instregex "^[SU]RSHLv", "^[SU]QRSHLv",
+ "^[SU]QSHL(v1i8|v1i16|v1i32|v1i64|v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)$")>;
+
+// §3.17 ASIMD floating-point instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD FP absolute value/difference
+// ASIMD FP arith, normal
+// ASIMD FP compare
+// ASIMD FP complex add
+// ASIMD FP max/min, normal
+// ASIMD FP max/min, pairwise
+// ASIMD FP negate
+// Handled by SchedAlias<WriteV[dq], ...>
+
+// ASIMD FP complex multiply add
+def : InstRW<[V3AEWr_VFCMA, V3AERd_VFCMA], (instregex "^FCMLAv")>;
+
+// ASIMD FP convert, long (F16 to F32)
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVTL(v4|v8)i16")>;
+
+// ASIMD FP convert, long (F32 to F64)
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FCVTL(v2|v4)i32")>;
+
+// ASIMD FP convert, narrow (F32 to F16)
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVTN(v4|v8)i16")>;
+
+// ASIMD FP convert, narrow (F64 to F32)
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FCVTN(v2|v4)i32",
+ "^FCVTXN(v2|v4)f32")>;
+
+// ASIMD FP convert, other, D-form F32 and Q-form F64
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FCVT[AMNPZ][SU]v2f(32|64)$",
+ "^FCVT[AMNPZ][SU]v2i(32|64)_shift$",
+ "^FCVT[AMNPZ][SU]v1i64$",
+ "^FCVTZ[SU]d$",
+ "^[SU]CVTFv2f(32|64)$",
+ "^[SU]CVTFv2i(32|64)_shift$",
+ "^[SU]CVTFv1i64$",
+ "^[SU]CVTFd$")>;
+
+// ASIMD FP convert, other, D-form F16 and Q-form F32
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVT[AMNPZ][SU]v4f(16|32)$",
+ "^FCVT[AMNPZ][SU]v4i(16|32)_shift$",
+ "^FCVT[AMNPZ][SU]v1i32$",
+ "^FCVTZ[SU]s$",
+ "^[SU]CVTFv4f(16|32)$",
+ "^[SU]CVTFv4i(16|32)_shift$",
+ "^[SU]CVTFv1i32$",
+ "^[SU]CVTFs$")>;
+
+// ASIMD FP convert, other, Q-form F16
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FCVT[AMNPZ][SU]v8f16$",
+ "^FCVT[AMNPZ][SU]v8i16_shift$",
+ "^FCVT[AMNPZ][SU]v1f16$",
+ "^FCVTZ[SU]h$",
+ "^[SU]CVTFv8f16$",
+ "^[SU]CVTFv8i16_shift$",
+ "^[SU]CVTFv1i16$",
+ "^[SU]CVTFh$")>;
+
+// ASIMD FP divide, D-form, F16
+def : InstRW<[V3AEWrite_9c_1V1_4rc], (instrs FDIVv4f16)>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[V3AEWrite_9c_1V1_2rc], (instrs FDIVv2f32)>;
+
+// ASIMD FP divide, Q-form, F16
+def : InstRW<[V3AEWrite_13c_1V1_8rc], (instrs FDIVv8f16)>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[V3AEWrite_11c_1V1_4rc], (instrs FDIVv4f32)>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[V3AEWrite_14c_1V1_2rc], (instrs FDIVv2f64)>;
+
+// ASIMD FP max/min, reduce, F32 and D-form F16
+def : InstRW<[V3AEWrite_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
+
+// ASIMD FP max/min, reduce, Q-form F16
+def : InstRW<[V3AEWrite_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
+
+// ASIMD FP multiply
+def : InstRW<[V3AEWr_VFM], (instregex "^FMULv", "^FMULXv")>;
+
+// ASIMD FP multiply accumulate
+def : InstRW<[V3AEWr_VFMA, V3AERd_VFMA], (instregex "^FMLAv", "^FMLSv")>;
+
+// ASIMD FP multiply accumulate long
+def : InstRW<[V3AEWr_VFMAL, V3AERd_VFMAL], (instregex "^FML[AS]L2?(lane)?v")>;
+
+// ASIMD FP round, D-form F32 and Q-form F64
+def : InstRW<[V3AEWrite_3c_1V0],
+ (instregex "^FRINT[AIMNPXZ]v2f(32|64)$",
+ "^FRINT(32|64)[XZ]v2f(32|64)$")>;
+
+// ASIMD FP round, D-form F16 and Q-form F32
+def : InstRW<[V3AEWrite_4c_2V0],
+ (instregex "^FRINT[AIMNPXZ]v4f(16|32)$",
+ "^FRINT(32|64)[XZ]v4f32$")>;
+
+// ASIMD FP round, Q-form F16
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
+
+// ASIMD FP square root, D-form, F16
+def : InstRW<[V3AEWrite_9c_1V1_4rc], (instrs FSQRTv4f16)>;
+
+// ASIMD FP square root, D-form, F32
+def : InstRW<[V3AEWrite_9c_1V1_2rc], (instrs FSQRTv2f32)>;
+
+// ASIMD FP square root, Q-form, F16
+def : InstRW<[V3AEWrite_13c_1V1_8rc], (instrs FSQRTv8f16)>;
+
+// ASIMD FP square root, Q-form, F32
+def : InstRW<[V3AEWrite_11c_1V1_4rc], (instrs FSQRTv4f32)>;
+
+// ASIMD FP square root, Q-form, F64
+def : InstRW<[V3AEWrite_14c_1V1_2rc], (instrs FSQRTv2f64)>;
+
+// §3.18 ASIMD BFloat16 (BF16) instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD convert, F32 to BF16
+def : InstRW<[V3AEWrite_4c_2V0], (instrs BFCVTN, BFCVTN2)>;
+
+// ASIMD dot product
+def : InstRW<[V3AEWr_VBFDOT, V3AERd_VBFDOT], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
+
+// ASIMD matrix multiply accumulate
+def : InstRW<[V3AEWr_VBFMMA, V3AERd_VBFMMA], (instrs BFMMLA)>;
+
+// ASIMD multiply accumulate long
+def : InstRW<[V3AEWr_VBFMAL, V3AERd_VBFMAL], (instrs BFMLALB, BFMLALBIdx, BFMLALT,
+ BFMLALTIdx)>;
+
+// Scalar convert, F32 to BF16
+def : InstRW<[V3AEWrite_3c_1V0], (instrs BFCVT)>;
+
+// §3.19 ASIMD miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD bit reverse
+// ASIMD bitwise insert
+// ASIMD count
+// ASIMD duplicate, element
+// ASIMD extract
+// ASIMD extract narrow
+// ASIMD insert, element to element
+// ASIMD move, FP immed
+// ASIMD move, integer immed
+// ASIMD reverse
+// ASIMD table lookup extension, 1 table reg
+// ASIMD transpose
+// ASIMD unzip/zip
+// Handled by SchedAlias<WriteV[dq], ...>
+def : InstRW<[V3AEWrite_0or2c_1V], (instrs MOVID, MOVIv2d_ns)>;
+
+// ASIMD duplicate, gen reg
+def : InstRW<[V3AEWrite_3c_1M0], (instregex "^DUPv.+gpr")>;
+
+// ASIMD extract narrow, saturating
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^[SU]QXTNv", "^SQXTUNv")>;
+
+// ASIMD reciprocal and square root estimate, D-form U32
+def : InstRW<[V3AEWrite_3c_1V0], (instrs URECPEv2i32, URSQRTEv2i32)>;
+
+// ASIMD reciprocal and square root estimate, Q-form U32
+def : InstRW<[V3AEWrite_4c_2V0], (instrs URECPEv4i32, URSQRTEv4i32)>;
+
+// ASIMD reciprocal and square root estimate, D-form F32 and scalar forms
+def : InstRW<[V3AEWrite_3c_1V0], (instrs FRECPEv1f16, FRECPEv1i32,
+ FRECPEv1i64, FRECPEv2f32,
+ FRSQRTEv1f16, FRSQRTEv1i32,
+ FRSQRTEv1i64, FRSQRTEv2f32)>;
+
+// ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32
+def : InstRW<[V3AEWrite_4c_2V0], (instrs FRECPEv4f16, FRECPEv4f32,
+ FRSQRTEv4f16, FRSQRTEv4f32)>;
+
+// ASIMD reciprocal and square root estimate, Q-form F16
+def : InstRW<[V3AEWrite_6c_4V0], (instrs FRECPEv8f16, FRSQRTEv8f16)>;
+
+// ASIMD reciprocal exponent
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FRECPXv")>;
+
+// ASIMD reciprocal step
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^FRECPS(32|64|v)",
+ "^FRSQRTS(32|64|v)")>;
+
+// ASIMD table lookup, 1 or 2 table regs
+def : InstRW<[V3AEWrite_2c_1V], (instrs TBLv8i8One, TBLv16i8One,
+ TBLv8i8Two, TBLv16i8Two)>;
+
+// ASIMD table lookup, 3 table regs
+def : InstRW<[V3AEWrite_4c_2V], (instrs TBLv8i8Three, TBLv16i8Three)>;
+
+// ASIMD table lookup, 4 table regs
+def : InstRW<[V3AEWrite_4c_3V], (instrs TBLv8i8Four, TBLv16i8Four)>;
+
+// ASIMD table lookup extension, 2 table reg
+def : InstRW<[V3AEWrite_4c_2V], (instrs TBXv8i8Two, TBXv16i8Two)>;
+
+// ASIMD table lookup extension, 3 table reg
+def : InstRW<[V3AEWrite_6c_3V], (instrs TBXv8i8Three, TBXv16i8Three)>;
+
+// ASIMD table lookup extension, 4 table reg
+def : InstRW<[V3AEWrite_6c_5V], (instrs TBXv8i8Four, TBXv16i8Four)>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[V3AEWrite_2c_2V], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, gen reg to element
+def : InstRW<[V3AEWrite_5c_1M0_1V], (instregex "^INSvi(8|16|32|64)gpr$")>;
+
+// §3.20 ASIMD load instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_1L],
+ (instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_1L],
+ (instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+def : InstRW<[V3AEWrite_6c_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_2L],
+ (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[V3AEWrite_6c_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_2L],
+ (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+def : InstRW<[V3AEWrite_6c_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_3L],
+ (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[V3AEWrite_6c_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_3L],
+ (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+def : InstRW<[V3AEWrite_7c_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_7c_4L],
+ (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[V3AEWrite_7c_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_7c_4L],
+ (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[V3AEWrite_8c_1L_1V], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_1V], (instregex "LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+def : InstRW<[V3AEWrite_8c_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[V3AEWrite_8c_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+def : InstRW<[V3AEWrite_8c_1L_2V], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_2V], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 2 element, multiple, Q-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[V3AEWrite_8c_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[V3AEWrite_8c_1L_2V], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_2V], (instregex "LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+def : InstRW<[V3AEWrite_8c_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[V3AEWrite_8c_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+def : InstRW<[V3AEWrite_8c_2L_3V], (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_2L_3V], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[V3AEWrite_8c_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[V3AEWrite_8c_2L_3V], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_2L_3V], (instregex "LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+def : InstRW<[V3AEWrite_8c_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[V3AEWrite_8c_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+def : InstRW<[V3AEWrite_8c_3L_4V], (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_3L_4V], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[V3AEWrite_9c_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_9c_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[V3AEWrite_8c_3L_4V], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_3L_4V], (instregex "LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+def : InstRW<[V3AEWrite_8c_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[V3AEWrite_8c_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>;
+
+// §3.21 ASIMD store instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_2SA_2V], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_2SA_2V], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[V3AEWrite_2c_3SA_3V], (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_3SA_3V], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_2SA_2V], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[V3AEWrite_2c_4SA_4V], (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_4SA_4V], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[V3AEWrite_4c_1SA_2V], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_4c_1SA_2V], (instregex "ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+def : InstRW<[V3AEWrite_4c_1SA_2V], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_4c_1SA_2V], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[V3AEWrite_4c_2SA_4V], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_4c_2SA_4V], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[V3AEWrite_4c_1SA_2V], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_4c_1SA_2V], (instregex "ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+def : InstRW<[V3AEWrite_5c_2SA_4V], (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_5c_2SA_4V], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[V3AEWrite_6c_3SA_6V], (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_3SA_6V], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[V3AEWrite_5c_2SA_4V], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_5c_2SA_4V], (instregex "ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+def : InstRW<[V3AEWrite_6c_2SA_6V], (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_2SA_6V], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+def : InstRW<[V3AEWrite_7c_4SA_12V], (instregex "ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_7c_4SA_12V], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[V3AEWrite_5c_4SA_8V], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_5c_4SA_8V], (instregex "ST4Fourv(2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H/S
+def : InstRW<[V3AEWrite_6c_1SA_3V], (instregex "ST4i(8|16|32)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_1SA_3V], (instregex "ST4i(8|16|32)_POST$")>;
+
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[V3AEWrite_4c_2SA_4V], (instregex "ST4i(64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_4c_2SA_4V], (instregex "ST4i(64)_POST$")>;
+
+// §3.22 Cryptography extensions
+// -----------------------------------------------------------------------------
+
+// Crypto AES ops
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^AES[DE]rr$", "^AESI?MCrr")>;
+
+// Crypto polynomial (64x64) multiply long
+def : InstRW<[V3AEWrite_2c_1V], (instrs PMULLv1i64, PMULLv2i64)>;
+
+// Crypto SHA1 hash acceleration op
+// Crypto SHA1 schedule acceleration ops
+def : InstRW<[V3AEWrite_2c_1V0], (instregex "^SHA1(H|SU0|SU1)")>;
+
+// Crypto SHA1 hash acceleration ops
+// Crypto SHA256 hash acceleration ops
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SHA1[CMP]", "^SHA256H2?")>;
+
+// Crypto SHA256 schedule acceleration ops
+def : InstRW<[V3AEWrite_2c_1V0], (instregex "^SHA256SU[01]")>;
+
+// Crypto SHA512 hash acceleration ops
+def : InstRW<[V3AEWrite_2c_1V0], (instregex "^SHA512(H|H2|SU0|SU1)")>;
+
+// Crypto SHA3 ops
+def : InstRW<[V3AEWrite_2c_1V], (instrs BCAX, EOR3, RAX1, XAR)>;
+
+// Crypto SM3 ops
+def : InstRW<[V3AEWrite_2c_1V0], (instregex "^SM3PARTW[12]$", "^SM3SS1$",
+ "^SM3TT[12][AB]$")>;
+
+// Crypto SM4 ops
+def : InstRW<[V3AEWrite_4c_1V0], (instrs SM4E, SM4ENCKEY)>;
+
+// §3.23 CRC
+// -----------------------------------------------------------------------------
+
+def : InstRW<[V3AEWr_CRC, V3AERd_CRC], (instregex "^CRC32")>;
+
+// §3.24 SVE Predicate instructions
+// -----------------------------------------------------------------------------
+
+// Loop control, based on predicate
+def : InstRW<[V3AEWrite_2or3c_1M], (instrs BRKA_PPmP, BRKA_PPzP,
+ BRKB_PPmP, BRKB_PPzP)>;
+
+// Loop control, based on predicate and flag setting
+def : InstRW<[V3AEWrite_2or3c_1M], (instrs BRKAS_PPzP, BRKBS_PPzP)>;
+
+// Loop control, propagating
+def : InstRW<[V3AEWrite_2or3c_1M], (instrs BRKN_PPzP, BRKPA_PPzPP,
+ BRKPB_PPzPP)>;
+
+// Loop control, propagating and flag setting
+def : InstRW<[V3AEWrite_2or3c_1M], (instrs BRKNS_PPzP, BRKPAS_PPzPP,
+ BRKPBS_PPzPP)>;
+
+// Loop control, based on GPR
+def : InstRW<[V3AEWrite_3c_2M],
+ (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>;
+def : InstRW<[V3AEWrite_3c_2M], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>;
+
+// Loop terminate
+def : InstRW<[V3AEWrite_1c_2M], (instregex "^CTERM(EQ|NE)_(WW|XX)")>;
+
+// Predicate counting scalar
+def : InstRW<[V3AEWrite_2c_1M], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>;
+def : InstRW<[V3AEWrite_2c_1M],
+ (instregex "^(CNT|SQDEC|SQINC|UQDEC|UQINC)[BHWD]_XPiI",
+ "^SQ(DEC|INC)[BHWD]_XPiWdI",
+ "^UQ(DEC|INC)[BHWD]_WPiI")>;
+
+// Predicate counting scalar, ALL, {1,2,4}
+def : InstRW<[V3AEWrite_IncDec], (instregex "^(DEC|INC)[BHWD]_XPiI")>;
+
+// Predicate counting scalar, active predicate
+def : InstRW<[V3AEWrite_2c_1M],
+ (instregex "^CNTP_XPP_[BHSD]",
+ "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]",
+ "^(UQDEC|UQINC)P_WP_[BHSD]",
+ "^(SQDEC|SQINC)P_XPWd_[BHSD]")>;
+
+// Predicate counting vector, active predicate
+def : InstRW<[V3AEWrite_7c_1M_1M0_1V],
+ (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>;
+
+// Predicate logical
+def : InstRW<[V3AEWrite_1or2c_1M],
+ (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>;
+
+// Predicate logical, flag setting
+def : InstRW<[V3AEWrite_1or2c_1M],
+ (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>;
+
+// Predicate reverse
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^REV_PP_[BHSD]")>;
+
+// Predicate select
+def : InstRW<[V3AEWrite_1c_1M], (instrs SEL_PPPP)>;
+
+// Predicate set
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^PFALSE", "^PTRUE_[BHSD]")>;
+
+// Predicate set/initialize, set flags
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^PTRUES_[BHSD]")>;
+
+// Predicate find first/next
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>;
+
+// Predicate test
+def : InstRW<[V3AEWrite_1c_1M], (instrs PTEST_PP)>;
+
+// Predicate transpose
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^TRN[12]_PPP_[BHSD]")>;
+
+// Predicate unpack and widen
+def : InstRW<[V3AEWrite_2c_1M], (instrs PUNPKHI_PP, PUNPKLO_PP)>;
+
+// Predicate zip/unzip
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^(ZIP|UZP)[12]_PPP_[BHSD]")>;
+
+// §3.25 SVE integer instructions
+// -----------------------------------------------------------------------------
+
+// Arithmetic, absolute diff
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]",
+ "^[SU]ABD_ZPZZ_[BHSD]")>;
+
+// Arithmetic, absolute diff accum
+def : InstRW<[V3AEWr_ZA, V3AERd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]")>;
+
+// Arithmetic, absolute diff accum long
+def : InstRW<[V3AEWr_ZA, V3AERd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>;
+
+// Arithmetic, absolute diff long
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>;
+
+// Arithmetic, basic
+def : InstRW<[V3AEWrite_2c_1V],
+ (instregex "^(ABS|ADD|CNOT|NEG|SUB|SUBR)_ZPmZ_[BHSD]",
+ "^(ADD|SUB)_ZZZ_[BHSD]",
+ "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]",
+ "^(ADD|SUB|SUBR)_ZI_[BHSD]",
+ "^ADR_[SU]XTW_ZZZ_D_[0123]",
+ "^ADR_LSL_ZZZ_[SD]_[0123]",
+ "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
+ "^SADDLBT_ZZZ_[HSD]",
+ "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
+ "^SSUBL(BT|TB)_ZZZ_[HSD]")>;
+
+// Arithmetic, complex
+def : InstRW<[V3AEWrite_2c_1V],
+ (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]",
+ "^SQ(ABS|ADD|NEG|SUB|SUBR)_ZPmZ_[BHSD]",
+ "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]",
+ "^[SU]Q(ADD|SUB)_ZI_[BHSD]",
+ "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]",
+ "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>;
+
+// Arithmetic, large integer
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>;
+
+// Arithmetic, pairwise add
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^ADDP_ZPmZ_[BHSD]")>;
+
+// Arithmetic, pairwise add and accum long
+def : InstRW<[V3AEWr_ZPA, ReadDefault, V3AERd_ZPA],
+ (instregex "^[SU]ADALP_ZPmZ_[HSD]")>;
+
+// Arithmetic, shift
+def : InstRW<[V3AEWrite_2c_1V1],
+ (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]",
+ "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]",
+ "^(ASR|LSL|LSR)_ZPmI_[BHSD]",
+ "^(ASR|LSL|LSR)_ZPmZ_[BHSD]",
+ "^(ASR|LSL|LSR)_ZZI_[BHSD]",
+ "^(ASR|LSL|LSR)_ZPZ[IZ]_[BHSD]",
+ "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
+
+// Arithmetic, shift and accumulate
+def : InstRW<[V3AEWr_ZSA, V3AERd_ZSA], (instregex "^[SU]R?SRA_ZZI_[BHSD]")>;
+
+// Arithmetic, shift by immediate
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^SHRN[BT]_ZZI_[BHS]",
+ "^[SU]SHLL[BT]_ZZI_[HSD]")>;
+
+// Arithmetic, shift by immediate and insert
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(SLI|SRI)_ZZI_[BHSD]")>;
+
+// Arithmetic, shift complex
+def : InstRW<[V3AEWrite_4c_1V],
+ (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]",
+ "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_ZPmZ_[BHSD]",
+ "^[SU]QR?SHL_ZPZZ_[BHSD]",
+ "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]",
+ "^SQSHRU?N[BT]_ZZI_[BHS]",
+ "^UQR?SHRN[BT]_ZZI_[BHS]")>;
+
+// Arithmetic, shift right for divide
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^ASRD_(ZPmI|ZPZI)_[BHSD]")>;
+
+// Arithmetic, shift rounding
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^[SU]RSHLR?_ZPmZ_[BHSD]",
+ "^[SU]RSHL_ZPZZ_[BHSD]",
+ "^[SU]RSHR_(ZPmI|ZPZI)_[BHSD]")>;
+
+// Bit manipulation
+def : InstRW<[V3AEWrite_6c_2V1], (instregex "^(BDEP|BEXT|BGRP)_ZZZ_[BHSD]")>;
+
+// Bitwise select
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>;
+
+// Count/reverse bits
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(CLS|CLZ|CNT|RBIT)_ZPmZ_[BHSD]")>;
+
+// Broadcast logical bitmask immediate to vector
+def : InstRW<[V3AEWrite_2c_1V], (instrs DUPM_ZI)>;
+
+// Compare and set flags
+def : InstRW<[V3AEWrite_2or3c_1V0],
+ (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]",
+ "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>;
+
+// Complex add
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]")>;
+
+// Complex dot product 8-bit element
+def : InstRW<[V3AEWr_ZDOTB, V3AERd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
+
+// Complex dot product 16-bit element
+def : InstRW<[V3AEWr_ZDOTH, V3AERd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
+
+// Complex multiply-add B, H, S element size
+def : InstRW<[V3AEWr_ZCMABHS, V3AERd_ZCMABHS], (instregex "^CMLA_ZZZ_[BHS]",
+ "^CMLA_ZZZI_[HS]")>;
+
+// Complex multiply-add D element size
+def : InstRW<[V3AEWr_ZCMAD, V3AERd_ZCMAD], (instrs CMLA_ZZZ_D)>;
+
+// Conditional extract operations, scalar form
+def : InstRW<[V3AEWrite_8c_1M0_1V], (instregex "^CLAST[AB]_RPZ_[BHSD]")>;
+
+// Conditional extract operations, SIMD&FP scalar and vector forms
+def : InstRW<[V3AEWrite_3c_1V1], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]",
+ "^COMPACT_ZPZ_[SD]",
+ "^SPLICE_ZPZZ?_[BHSD]")>;
+
+// Convert to floating point, 64b to float or convert to double
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^[SU]CVTF_ZPmZ_Dto[HSD]",
+ "^[SU]CVTF_ZPmZ_StoD")>;
+
+// Convert to floating point, 32b to single or half
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>;
+
+// Convert to floating point, 16b to half
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^[SU]CVTF_ZPmZ_HtoH")>;
+
+// Copy, scalar
+def : InstRW<[V3AEWrite_5c_1M0_1V], (instregex "^CPY_ZPmR_[BHSD]")>;
+
+// Copy, scalar SIMD&FP or imm
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^CPY_ZPm[IV]_[BHSD]",
+ "^CPY_ZPzI_[BHSD]")>;
+
+// Divides, 32 bit
+def : InstRW<[V3AEWrite_12c_1V0], (instregex "^[SU]DIVR?_ZPmZ_S",
+ "^[SU]DIV_ZPZZ_S")>;
+
+// Divides, 64 bit
+def : InstRW<[V3AEWrite_20c_1V0], (instregex "^[SU]DIVR?_ZPmZ_D",
+ "^[SU]DIV_ZPZZ_D")>;
+
+// Dot product, 8 bit
+def : InstRW<[V3AEWr_ZDOTB, V3AERd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS")>;
+
+// Dot product, 8 bit, using signed and unsigned integers
+def : InstRW<[V3AEWr_ZDOTB, V3AERd_ZDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
+
+// Dot product, 16 bit
+def : InstRW<[V3AEWr_ZDOTH, V3AERd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD")>;
+
+// Duplicate, immediate and indexed form
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^DUP_ZI_[BHSD]",
+ "^DUP_ZZI_[BHSDQ]")>;
+
+// Duplicate, scalar form
+def : InstRW<[V3AEWrite_3c_1M0], (instregex "^DUP_ZR_[BHSD]")>;
+
+// Extend, sign or zero
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]XTB_ZPmZ_[HSD]",
+ "^[SU]XTH_ZPmZ_[SD]",
+ "^[SU]XTW_ZPmZ_[D]")>;
+
+// Extract
+def : InstRW<[V3AEWrite_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>;
+
+// Extract narrow saturating
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
+ "^SQXTUN[BT]_ZZ_[BHS]")>;
+
+// Extract operation, SIMD and FP scalar form
+def : InstRW<[V3AEWrite_3c_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]")>;
+
+// Extract operation, scalar
+def : InstRW<[V3AEWrite_6c_1V1_1M0], (instregex "^LAST[AB]_RPZ_[BHSD]")>;
+
+// Histogram operations
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^HISTCNT_ZPzZZ_[SD]",
+ "^HISTSEG_ZZZ")>;
+
+// Horizontal operations, B, H, S form, immediate operands only
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^INDEX_II_[BHS]")>;
+
+// Horizontal operations, B, H, S form, scalar, immediate operands/ scalar
+// operands only / immediate, scalar operands
+def : InstRW<[V3AEWrite_7c_1M0_1V0], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>;
+
+// Horizontal operations, D form, immediate operands only
+def : InstRW<[V3AEWrite_5c_2V0], (instrs INDEX_II_D)>;
+
+// Horizontal operations, D form, scalar, immediate operands/ scalar operands
+// only / immediate, scalar operands
+def : InstRW<[V3AEWrite_8c_2M0_2V0], (instregex "^INDEX_(IR|RI|RR)_D")>;
+
+// Insert operation, SIMD and FP scalar form
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^INSR_ZV_[BHSD]")>;
+
+// Insert operation, scalar
+def : InstRW<[V3AEWrite_5c_1V1_1M0], (instregex "^INSR_ZR_[BHSD]")>;
+
+// Logical
+def : InstRW<[V3AEWrite_2c_1V],
+ (instregex "^(AND|EOR|ORR)_ZI",
+ "^(AND|BIC|EOR|ORR)_ZZZ",
+ "^EOR(BT|TB)_ZZZ_[BHSD]",
+ "^(AND|BIC|EOR|NOT|ORR)_(ZPmZ|ZPZZ)_[BHSD]",
+ "^NOT_ZPmZ_[BHSD]")>;
+
+// Max/min, basic and pairwise
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
+ "^[SU](MAX|MIN)P?_ZPmZ_[BHSD]",
+ "^[SU](MAX|MIN)_ZPZZ_[BHSD]")>;
+
+// Matching operations
+// FIXME: SOG p. 44, n. 5: If the consuming instruction has a flag source, the
+// latency for this instruction is 4 cycles.
+def : InstRW<[V3AEWrite_2or3c_1V0_1M], (instregex "^N?MATCH_PPzZZ_[BH]")>;
+
+// Matrix multiply-accumulate
+def : InstRW<[V3AEWr_ZMMA, V3AERd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+
+// Move prefix
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]",
+ "^MOVPRFX_ZZ")>;
+
+// Multiply, B, H, S element size
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
+ "^MUL_ZPZZ_[BHS]",
+ "^[SU]MULH_(ZPmZ|ZZZ)_[BHS]",
+ "^[SU]MULH_ZPZZ_[BHS]")>;
+
+// Multiply, D element size
+def : InstRW<[V3AEWrite_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D",
+ "^MUL_ZPZZ_D",
+ "^[SU]MULH_(ZPmZ|ZZZ)_D",
+ "^[SU]MULH_ZPZZ_D")>;
+
+// Multiply long
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^[SU]MULL[BT]_ZZZI_[SD]",
+ "^[SU]MULL[BT]_ZZZ_[HSD]")>;
+
+// Multiply accumulate, B, H, S element size
+def : InstRW<[V3AEWr_ZMABHS, V3AERd_ZMABHS],
+ (instregex "^ML[AS]_ZZZI_[HS]", "^ML[AS]_ZPZZZ_[BHS]")>;
+def : InstRW<[V3AEWr_ZMABHS, ReadDefault, V3AERd_ZMABHS],
+ (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>;
+
+// Multiply accumulate, D element size
+def : InstRW<[V3AEWr_ZMAD, V3AERd_ZMAD],
+ (instregex "^ML[AS]_ZZZI_D", "^ML[AS]_ZPZZZ_D")>;
+def : InstRW<[V3AEWr_ZMAD, ReadDefault, V3AERd_ZMAD],
+ (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
+
+// Multiply accumulate long
+def : InstRW<[V3AEWr_ZMAL, V3AERd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]",
+ "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>;
+
+// Multiply accumulate saturating doubling long regular
+def : InstRW<[V3AEWr_ZMASQL, V3AERd_ZMASQ],
+ (instregex "^SQDML[AS]L(B|T|BT)_ZZZ_[HSD]",
+ "^SQDML[AS]L[BT]_ZZZI_[SD]")>;
+
+// Multiply saturating doubling high, B, H, S element size
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SQDMULH_ZZZ_[BHS]",
+ "^SQDMULH_ZZZI_[HS]")>;
+
+// Multiply saturating doubling high, D element size
+def : InstRW<[V3AEWrite_5c_2V0], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>;
+
+// Multiply saturating doubling long
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SQDMULL[BT]_ZZZ_[HSD]",
+ "^SQDMULL[BT]_ZZZI_[SD]")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, B, H, S
+// element size
+def : InstRW<[V3AEWr_ZMASQBHS, V3AERd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]",
+ "^SQRDCMLAH_ZZZ_[BHS]",
+ "^SQRDML[AS]H_ZZZI_[HS]",
+ "^SQRDCMLAH_ZZZI_[HS]")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, D element
+// size
+def : InstRW<[V3AEWr_ZMASQD, V3AERd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D",
+ "^SQRDCMLAH_ZZZ_D")>;
+
+// Multiply saturating rounding doubling regular/complex, B, H, S element size
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SQRDMULH_ZZZ_[BHS]",
+ "^SQRDMULH_ZZZI_[HS]")>;
+
+// Multiply saturating rounding doubling regular/complex, D element size
+def : InstRW<[V3AEWrite_5c_2V0], (instregex "^SQRDMULH_ZZZI?_D")>;
+
+// Multiply/multiply long, (8x8) polynomial
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^PMUL_ZZZ_B",
+ "^PMULL[BT]_ZZZ_[HDQ]")>;
+
+// Predicate counting vector
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^([SU]Q)?(DEC|INC)[HWD]_ZPiI")>;
+
+// Reciprocal estimate
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>;
+
+// Reduction, arithmetic, B form
+def : InstRW<[V3AEWrite_9c_2V_4V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>;
+
+// Reduction, arithmetic, H form
+def : InstRW<[V3AEWrite_8c_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>;
+
+// Reduction, arithmetic, S form
+def : InstRW<[V3AEWrite_6c_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>;
+
+// Reduction, arithmetic, D form
+def : InstRW<[V3AEWrite_4c_2V], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>;
+
+// Reduction, logical
+def : InstRW<[V3AEWrite_6c_1V_1V1], (instregex "^(AND|EOR|OR)V_VPZ_[BHSD]")>;
+
+// Reverse, vector
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^REV_ZZ_[BHSD]",
+ "^REVB_ZPmZ_[HSD]",
+ "^REVH_ZPmZ_[SD]",
+ "^REVW_ZPmZ_D")>;
+
+// Select, vector form
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^SEL_ZPZZ_[BHSD]")>;
+
+// Table lookup
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^TBL_ZZZZ?_[BHSD]")>;
+
+// Table lookup extension
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^TBX_ZZZ_[BHSD]")>;
+
+// Transpose, vector form
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>;
+
+// Unpack and extend
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>;
+
+// Zip/unzip
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>;
+
+// §3.26 SVE floating-point instructions
+// -----------------------------------------------------------------------------
+
+// Floating point absolute value/difference
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^FAB[SD]_ZPmZ_[HSD]",
+ "^FABD_ZPZZ_[HSD]",
+ "^FABS_ZPmZ_[HSD]")>;
+
+// Floating point arithmetic
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]",
+ "^F(ADD|SUB)_ZPZ[IZ]_[HSD]",
+ "^FADDP_ZPmZZ_[HSD]",
+ "^FNEG_ZPmZ_[HSD]",
+ "^FSUBR_ZPm[IZ]_[HSD]",
+ "^FSUBR_(ZPZI|ZPZZ)_[HSD]")>;
+
+// Floating point associative add, F16
+def : InstRW<[V3AEWrite_10c_1V1_9rc], (instrs FADDA_VPZ_H)>;
+
+// Floating point associative add, F32
+def : InstRW<[V3AEWrite_6c_1V1_5rc], (instrs FADDA_VPZ_S)>;
+
+// Floating point associative add, F64
+def : InstRW<[V3AEWrite_4c_1V], (instrs FADDA_VPZ_D)>;
+
+// Floating point compare
+def : InstRW<[V3AEWrite_2c_1V0], (instregex "^FACG[ET]_PPzZZ_[HSD]",
+ "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
+ "^FCM(LE|LT)_PPzZ0_[HSD]",
+ "^FCMUO_PPzZZ_[HSD]")>;
+
+// Floating point complex add
+def : InstRW<[V3AEWrite_3c_1V], (instregex "^FCADD_ZPmZ_[HSD]")>;
+
+// Floating point complex multiply add
+def : InstRW<[V3AEWr_ZFCMA, ReadDefault, V3AERd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>;
+def : InstRW<[V3AEWr_ZFCMA, V3AERd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>;
+
+// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
+ "^FCVTLT_ZPmZ_HtoS",
+ "^FCVTNT_ZPmZ_StoH")>;
+
+// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32
+// or F64 to F16)
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)",
+ "^FCVTLT_ZPmZ_StoD",
+ "^FCVTNT_ZPmZ_DtoS")>;
+
+// Floating point convert, round to odd
+def : InstRW<[V3AEWrite_3c_1V0], (instrs FCVTX_ZPmZ_DtoS, FCVTXNT_ZPmZ_DtoS)>;
+
+// Floating point base2 log, F16
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>;
+
+// Floating point base2 log, F32
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>;
+
+// Floating point base2 log, F64
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>;
+
+// Floating point convert to integer, F16
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>;
+
+// Floating point convert to integer, F32
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>;
+
+// Floating point convert to integer, F64
+def : InstRW<[V3AEWrite_3c_1V0],
+ (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>;
+
+// Floating point copy
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^FCPY_ZPmI_[HSD]",
+ "^FDUP_ZI_[HSD]")>;
+
+// Floating point divide, F16
+def : InstRW<[V3AEWrite_13c_1V1_8rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>;
+
+// Floating point divide, F32
+def : InstRW<[V3AEWrite_11c_1V1_4rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>;
+
+// Floating point divide, F64
+def : InstRW<[V3AEWrite_14c_1V1_2rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>;
+
+// Floating point min/max pairwise
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>;
+
+// Floating point min/max
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]",
+ "^F(MAX|MIN)(NM)?_ZPZ[IZ]_[HSD]")>;
+
+// Floating point multiply
+def : InstRW<[V3AEWrite_3c_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
+ "^FMULX_ZPZZ_[HSD]",
+ "^FMUL_(ZPm[IZ]|ZZZI?)_[HSD]",
+ "^FMUL_ZPZ[IZ]_[HSD]")>;
+
+// Floating point multiply accumulate
+def : InstRW<[V3AEWr_ZFMA, ReadDefault, V3AERd_ZFMA],
+ (instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
+ "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
+def : InstRW<[V3AEWr_ZFMA, V3AERd_ZFMA],
+ (instregex "^FML[AS]_ZZZI_[HSD]",
+ "^FN?ML[AS]_ZPZZZ_[HSD]")>;
+
+// Floating point multiply add/sub accumulate long
+def : InstRW<[V3AEWr_ZFMAL, V3AERd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>;
+
+// Floating point reciprocal estimate, F16
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>;
+
+// Floating point reciprocal estimate, F32
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FR(ECP|SQRT)E_ZZ_S", "^FRECPX_ZPmZ_S")>;
+
+// Floating point reciprocal estimate, F64
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FR(ECP|SQRT)E_ZZ_D", "^FRECPX_ZPmZ_D")>;
+
+// Floating point reciprocal step
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
+
+// Floating point reduction, F16
+def : InstRW<[V3AEWrite_8c_4V],
+ (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_H")>;
+
+// Floating point reduction, F32
+def : InstRW<[V3AEWrite_6c_3V],
+ (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_S")>;
+
+// Floating point reduction, F64
+def : InstRW<[V3AEWrite_4c_2V],
+ (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_D")>;
+
+// Floating point round to integral, F16
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>;
+
+// Floating point round to integral, F32
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>;
+
+// Floating point round to integral, F64
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>;
+
+// Floating point square root, F16
+def : InstRW<[V3AEWrite_13c_1V1_8rc], (instregex "^FSQRT_ZPmZ_H")>;
+
+// Floating point square root, F32
+def : InstRW<[V3AEWrite_11c_1V1_4rc], (instregex "^FSQRT_ZPmZ_S")>;
+
+// Floating point square root, F64
+def : InstRW<[V3AEWrite_14c_1V1_2rc], (instregex "^FSQRT_ZPmZ_D")>;
+
+// Floating point trigonometric exponentiation
+def : InstRW<[V3AEWrite_3c_1V1], (instregex "^FEXPA_ZZ_[HSD]")>;
+
+// Floating point trigonometric multiply add
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^FTMAD_ZZI_[HSD]")>;
+
+// Floating point trigonometric, miscellaneous
+def : InstRW<[V3AEWrite_3c_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]")>;
+
+// §3.27 SVE BFloat16 (BF16) instructions
+// -----------------------------------------------------------------------------
+
+// Convert, F32 to BF16
+def : InstRW<[V3AEWrite_4c_1V], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
+
+// Dot product
+def : InstRW<[V3AEWr_ZBFDOT, V3AERd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
+
+// Matrix multiply accumulate
+def : InstRW<[V3AEWr_ZBFMMA, V3AERd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>;
+
+// Multiply accumulate long
+def : InstRW<[V3AEWr_ZBFMAL, V3AERd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZI?")>;
+
+// §3.28 SVE Load instructions
+// -----------------------------------------------------------------------------
+
+// Load vector
+def : InstRW<[V3AEWrite_6c_1L], (instrs LDR_ZXI)>;
+
+// Load predicate
+def : InstRW<[V3AEWrite_6c_1L_1M], (instrs LDR_PXI)>;
+
+// Contiguous load, scalar + imm
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1[BHWD]_IMM$",
+ "^LD1S?B_[HSD]_IMM$",
+ "^LD1S?H_[SD]_IMM$",
+ "^LD1S?W_D_IMM$" )>;
+// Contiguous load, scalar + scalar
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1[BHWD]$",
+ "^LD1S?B_[HSD]$",
+ "^LD1S?H_[SD]$",
+ "^LD1S?W_D$" )>;
+
+// Contiguous load broadcast, scalar + imm
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1R[BHWD]_IMM$",
+ "^LD1RS?B_[HSD]_IMM$",
+ "^LD1RS?H_[SD]_IMM$",
+ "^LD1RW_D_IMM$",
+ "^LD1RSW_IMM$",
+ "^LD1RQ_[BHWD]_IMM$")>;
+
+// Contiguous load broadcast, scalar + scalar
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1RQ_[BHWD]$")>;
+
+// Non temporal load, scalar + imm
+// Non temporal load, scalar + scalar
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LDNT1[BHWD]_ZR[IR]$")>;
+
+// Non temporal gather load, vector + scalar 32-bit element size
+def : InstRW<[V3AEWrite_9c_2L_4V], (instregex "^LDNT1[BHW]_ZZR_S$",
+ "^LDNT1S[BH]_ZZR_S$")>;
+
+// Non temporal gather load, vector + scalar 64-bit element size
+def : InstRW<[V3AEWrite_9c_2L_2V], (instregex "^LDNT1S?[BHW]_ZZR_D$")>;
+def : InstRW<[V3AEWrite_9c_2L_2V], (instrs LDNT1D_ZZR_D)>;
+
+// Contiguous first faulting load, scalar + scalar
+def : InstRW<[V3AEWrite_6c_1L_1I], (instregex "^LDFF1[BHWD]$",
+ "^LDFF1S?B_[HSD]$",
+ "^LDFF1S?H_[SD]$",
+ "^LDFF1S?W_D$")>;
+
+// Contiguous non faulting load, scalar + imm
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LDNF1[BHWD]_IMM$",
+ "^LDNF1S?B_[HSD]_IMM$",
+ "^LDNF1S?H_[SD]_IMM$",
+ "^LDNF1S?W_D_IMM$")>;
+
+// Contiguous Load two structures to two vectors, scalar + imm
+def : InstRW<[V3AEWrite_8c_2L_2V], (instregex "^LD2[BHWD]_IMM$")>;
+
+// Contiguous Load two structures to two vectors, scalar + scalar
+def : InstRW<[V3AEWrite_9c_2L_2V_2I], (instregex "^LD2[BHWD]$")>;
+
+// Contiguous Load three structures to three vectors, scalar + imm
+def : InstRW<[V3AEWrite_9c_3L_3V], (instregex "^LD3[BHWD]_IMM$")>;
+
+// Contiguous Load three structures to three vectors, scalar + scalar
+def : InstRW<[V3AEWrite_10c_3V_3L_3I], (instregex "^LD3[BHWD]$")>;
+
+// Contiguous Load four structures to four vectors, scalar + imm
+def : InstRW<[V3AEWrite_9c_4L_8V], (instregex "^LD4[BHWD]_IMM$")>;
+
+// Contiguous Load four structures to four vectors, scalar + scalar
+def : InstRW<[V3AEWrite_10c_4L_8V_4I], (instregex "^LD4[BHWD]$")>;
+
+// Gather load, vector + imm, 32-bit element size
+def : InstRW<[V3AEWrite_9c_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_IMM$",
+ "^GLD(FF)?1W_IMM$")>;
+
+// Gather load, vector + imm, 64-bit element size
+def : InstRW<[V3AEWrite_9c_1L_4V], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$",
+ "^GLD(FF)?1D_IMM$")>;
+
+// Gather load, 32-bit scaled offset
+def : InstRW<[V3AEWrite_10c_1L_8V],
+ (instregex "^GLD(FF)?1S?H_S_[SU]XTW_SCALED$",
+ "^GLD(FF)?1W_[SU]XTW_SCALED")>;
+
+// Gather load, 64-bit scaled offset
+// NOTE: These instructions are not specified in the SOG.
+def : InstRW<[V3AEWrite_10c_1L_4V],
+ (instregex "^GLD(FF)?1S?[HW]_D_([SU]XTW_)?SCALED$",
+ "^GLD(FF)?1D_([SU]XTW_)?SCALED$")>;
+
+// Gather load, 32-bit unpacked unscaled offset
+def : InstRW<[V3AEWrite_9c_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$",
+ "^GLD(FF)?1W_[SU]XTW$")>;
+
+// Gather load, 64-bit unpacked unscaled offset
+// NOTE: These instructions are not specified in the SOG.
+def : InstRW<[V3AEWrite_9c_1L_2V],
+ (instregex "^GLD(FF)?1S?[BHW]_D(_[SU]XTW)?$",
+ "^GLD(FF)?1D(_[SU]XTW)?$")>;
+
+// §3.29 SVE Store instructions
+// -----------------------------------------------------------------------------
+
+// Store from predicate reg
+def : InstRW<[V3AEWrite_1c_1SA], (instrs STR_PXI)>;
+
+// Store from vector reg
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instrs STR_ZXI)>;
+
+// Contiguous store, scalar + imm
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^ST1[BHWD]_IMM$",
+ "^ST1B_[HSD]_IMM$",
+ "^ST1H_[SD]_IMM$",
+ "^ST1W_D_IMM$")>;
+
+// Contiguous store, scalar + scalar
+def : InstRW<[V3AEWrite_2c_1SA_1I_1V], (instregex "^ST1H(_[SD])?$")>;
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^ST1[BWD]$",
+ "^ST1B_[HSD]$",
+ "^ST1W_D$")>;
+
+// Contiguous store two structures from two vectors, scalar + imm
+def : InstRW<[V3AEWrite_4c_1SA_1V], (instregex "^ST2[BHWD]_IMM$")>;
+
+// Contiguous store two structures from two vectors, scalar + scalar
+def : InstRW<[V3AEWrite_4c_2SA_2I_2V], (instrs ST2H)>;
+def : InstRW<[V3AEWrite_4c_2SA_2V], (instregex "^ST2[BWD]$")>;
+
+// Contiguous store three structures from three vectors, scalar + imm
+def : InstRW<[V3AEWrite_7c_9SA_9V], (instregex "^ST3[BHWD]_IMM$")>;
+
+// Contiguous store three structures from three vectors, scalar + scalar
+def : InstRW<[V3AEWrite_7c_9SA_9I_9V], (instregex "^ST3[BHWD]$")>;
+
+// Contiguous store four structures from four vectors, scalar + imm
+def : InstRW<[V3AEWrite_11c_18SA_18V], (instregex "^ST4[BHWD]_IMM$")>;
+
+// Contiguous store four structures from four vectors, scalar + scalar
+def : InstRW<[V3AEWrite_11c_18SA_18I_18V], (instregex "^ST4[BHWD]$")>;
+
+// Non temporal store, scalar + imm
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STNT1[BHWD]_ZRI$")>;
+
+// Non temporal store, scalar + scalar
+def : InstRW<[V3AEWrite_2c_1SA_1I_1V], (instrs STNT1H_ZRR)>;
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STNT1[BWD]_ZRR$")>;
+
+// Scatter non temporal store, vector + scalar 32-bit element size
+def : InstRW<[V3AEWrite_4c_4SA_4V], (instregex "^STNT1[BHW]_ZZR_S")>;
+
+// Scatter non temporal store, vector + scalar 64-bit element size
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^STNT1[BHWD]_ZZR_D")>;
+
+// Scatter store vector + imm 32-bit element size
+def : InstRW<[V3AEWrite_4c_4SA_4V], (instregex "^SST1[BH]_S_IMM$",
+ "^SST1W_IMM$")>;
+
+// Scatter store vector + imm 64-bit element size
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[BHW]_D_IMM$",
+ "^SST1D_IMM$")>;
+
+// Scatter store, 32-bit scaled offset
+def : InstRW<[V3AEWrite_4c_4SA_4V],
+ (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>;
+
+// Scatter store, 32-bit unpacked unscaled offset
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[BHW]_D_[SU]XTW$",
+ "^SST1D_[SU]XTW$")>;
+
+// Scatter store, 32-bit unpacked scaled offset
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
+ "^SST1D_[SU]XTW_SCALED$")>;
+
+// Scatter store, 32-bit unscaled offset
+def : InstRW<[V3AEWrite_4c_4SA_4V], (instregex "^SST1[BH]_S_[SU]XTW$",
+ "^SST1W_[SU]XTW$")>;
+
+// Scatter store, 64-bit scaled offset
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[HW]_D_SCALED$",
+ "^SST1D_SCALED$")>;
+
+// Scatter store, 64-bit unscaled offset
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[BHW]_D$",
+ "^SST1D$")>;
+
+// §3.30 SVE Miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// Read first fault register, unpredicated
+def : InstRW<[V3AEWrite_2c_1M0], (instrs RDFFR_P)>;
+
+// Read first fault register, predicated
+def : InstRW<[V3AEWrite_3or4c_1M0_1M], (instrs RDFFR_PPz)>;
+
+// Read first fault register and set flags
+def : InstRW<[V3AEWrite_3or4c_1M0_1M], (instrs RDFFRS_PPz)>;
+
+// Set first fault register
+// Write to first fault register
+def : InstRW<[V3AEWrite_2c_1M0], (instrs SETFFR, WRFFR)>;
+
+// Prefetch
+// NOTE: This is not specified in the SOG.
+def : InstRW<[V3AEWrite_4c_1L], (instregex "^PRF[BHWD]")>;
+
+// §3.31 SVE Cryptographic instructions
+// -----------------------------------------------------------------------------
+
+// Crypto AES ops
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^AES[DE]_ZZZ_B$",
+ "^AESI?MC_ZZ_B$")>;
+
+// Crypto SHA3 ops
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(BCAX|EOR3)_ZZZZ$",
+ "^RAX1_ZZZ_D$",
+ "^XAR_ZZZI_[BHSD]$")>;
+
+// Crypto SM4 ops
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SM4E(KEY)?_ZZZ_S$")>;
+
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 5b80b08..068954f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -764,8 +764,8 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
}
void AArch64PassConfig::addMachineSSAOptimization() {
- if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None)
- addPass(createMachineSMEABIPass());
+ if (TM->getOptLevel() != CodeGenOptLevel::None && EnableNewSMEABILowering)
+ addPass(createMachineSMEABIPass(TM->getOptLevel()));
if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt)
addPass(createSMEPeepholeOptPass());
@@ -798,7 +798,7 @@ bool AArch64PassConfig::addILPOpts() {
void AArch64PassConfig::addPreRegAlloc() {
if (TM->getOptLevel() == CodeGenOptLevel::None && EnableNewSMEABILowering)
- addPass(createMachineSMEABIPass());
+ addPass(createMachineSMEABIPass(CodeGenOptLevel::None));
// Change dead register definitions to refer to the zero register.
if (TM->getOptLevel() != CodeGenOptLevel::None &&
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 434ea67..7cb5003 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -121,8 +121,10 @@ struct InstInfo {
/// Contains the needed ZA state for each instruction in a block. Instructions
/// that do not require a ZA state are not recorded.
struct BlockInfo {
- ZAState FixedEntryState{ZAState::ANY};
SmallVector<InstInfo> Insts;
+ ZAState FixedEntryState{ZAState::ANY};
+ ZAState DesiredIncomingState{ZAState::ANY};
+ ZAState DesiredOutgoingState{ZAState::ANY};
LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
};
@@ -175,10 +177,15 @@ private:
Register AgnosticZABufferPtr = AArch64::NoRegister;
};
+/// Checks if \p State is a legal edge bundle state. For a state to be a legal
+/// bundle state, it must be possible to transition from it to any other bundle
+/// state without losing any ZA state. This is the case for ACTIVE/LOCAL_SAVED,
+/// as you can transition between those states by saving/restoring ZA. The OFF
+/// state would not be legal, as transitioning to it drops the content of ZA.
static bool isLegalEdgeBundleZAState(ZAState State) {
switch (State) {
- case ZAState::ACTIVE:
- case ZAState::LOCAL_SAVED:
+ case ZAState::ACTIVE: // ZA state within the accumulator/ZT0.
+ case ZAState::LOCAL_SAVED: // ZA state is saved on the stack.
return true;
default:
return false;
@@ -238,7 +245,8 @@ getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
struct MachineSMEABI : public MachineFunctionPass {
inline static char ID = 0;
- MachineSMEABI() : MachineFunctionPass(ID) {}
+ MachineSMEABI(CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
+ : MachineFunctionPass(ID), OptLevel(OptLevel) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -267,6 +275,11 @@ struct MachineSMEABI : public MachineFunctionPass {
const EdgeBundles &Bundles,
ArrayRef<ZAState> BundleStates);
+ /// Propagates desired states forwards (from predecessors -> successors) if
+  /// \p Forwards is true; otherwise, propagates backwards (from successors ->
+ /// predecessors).
+ void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true);
+
// Emission routines for private and shared ZA functions (using lazy saves).
void emitNewZAPrologue(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
@@ -335,12 +348,15 @@ struct MachineSMEABI : public MachineFunctionPass {
MachineBasicBlock::iterator MBBI, DebugLoc DL);
private:
+ CodeGenOptLevel OptLevel = CodeGenOptLevel::Default;
+
MachineFunction *MF = nullptr;
const AArch64Subtarget *Subtarget = nullptr;
const AArch64RegisterInfo *TRI = nullptr;
const AArch64FunctionInfo *AFI = nullptr;
const TargetInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ MachineLoopInfo *MLI = nullptr;
};
static LiveRegs getPhysLiveRegs(LiveRegUnits const &LiveUnits) {
@@ -422,12 +438,69 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
// Reverse vector (as we had to iterate backwards for liveness).
std::reverse(Block.Insts.begin(), Block.Insts.end());
+
+ // Record the desired states on entry/exit of this block. These are the
+ // states that would not incur a state transition.
+ if (!Block.Insts.empty()) {
+ Block.DesiredIncomingState = Block.Insts.front().NeededState;
+ Block.DesiredOutgoingState = Block.Insts.back().NeededState;
+ }
}
return FunctionInfo{std::move(Blocks), AfterSMEProloguePt,
PhysLiveRegsAfterSMEPrologue};
}
+void MachineSMEABI::propagateDesiredStates(FunctionInfo &FnInfo,
+ bool Forwards) {
+ // If `Forwards`, this propagates desired states from predecessors to
+ // successors, otherwise, this propagates states from successors to
+ // predecessors.
+ auto GetBlockState = [](BlockInfo &Block, bool Incoming) -> ZAState & {
+ return Incoming ? Block.DesiredIncomingState : Block.DesiredOutgoingState;
+ };
+
+ SmallVector<MachineBasicBlock *> Worklist;
+ for (auto [BlockID, BlockInfo] : enumerate(FnInfo.Blocks)) {
+ if (!isLegalEdgeBundleZAState(GetBlockState(BlockInfo, Forwards)))
+ Worklist.push_back(MF->getBlockNumbered(BlockID));
+ }
+
+ while (!Worklist.empty()) {
+ MachineBasicBlock *MBB = Worklist.pop_back_val();
+ BlockInfo &Block = FnInfo.Blocks[MBB->getNumber()];
+
+ // Pick a legal edge bundle state that matches the majority of
+ // predecessors/successors.
+ int StateCounts[ZAState::NUM_ZA_STATE] = {0};
+ for (MachineBasicBlock *PredOrSucc :
+ Forwards ? predecessors(MBB) : successors(MBB)) {
+ BlockInfo &PredOrSuccBlock = FnInfo.Blocks[PredOrSucc->getNumber()];
+ ZAState ZAState = GetBlockState(PredOrSuccBlock, !Forwards);
+ if (isLegalEdgeBundleZAState(ZAState))
+ StateCounts[ZAState]++;
+ }
+
+ ZAState PropagatedState = ZAState(max_element(StateCounts) - StateCounts);
+ ZAState &CurrentState = GetBlockState(Block, Forwards);
+ if (PropagatedState != CurrentState) {
+ CurrentState = PropagatedState;
+ ZAState &OtherState = GetBlockState(Block, !Forwards);
+ // Propagate to the incoming/outgoing state if that is also "ANY".
+ if (OtherState == ZAState::ANY)
+ OtherState = PropagatedState;
+ // Push any successors/predecessors that may need updating to the
+ // worklist.
+ for (MachineBasicBlock *SuccOrPred :
+ Forwards ? successors(MBB) : predecessors(MBB)) {
+ BlockInfo &SuccOrPredBlock = FnInfo.Blocks[SuccOrPred->getNumber()];
+ if (!isLegalEdgeBundleZAState(GetBlockState(SuccOrPredBlock, Forwards)))
+ Worklist.push_back(SuccOrPred);
+ }
+ }
+ }
+}
+
/// Assigns each edge bundle a ZA state based on the needed states of blocks
/// that have incoming or outgoing edges in that bundle.
SmallVector<ZAState>
@@ -440,40 +513,36 @@ MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles,
// Attempt to assign a ZA state for this bundle that minimizes state
// transitions. Edges within loops are given a higher weight as we assume
// they will be executed more than once.
- // TODO: We should propagate desired incoming/outgoing states through blocks
- // that have the "ANY" state first to make better global decisions.
int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
for (unsigned BlockID : Bundles.getBlocks(I)) {
LLVM_DEBUG(dbgs() << "- bb." << BlockID);
const BlockInfo &Block = FnInfo.Blocks[BlockID];
- if (Block.Insts.empty()) {
- LLVM_DEBUG(dbgs() << " (no state preference)\n");
- continue;
- }
bool InEdge = Bundles.getBundle(BlockID, /*Out=*/false) == I;
bool OutEdge = Bundles.getBundle(BlockID, /*Out=*/true) == I;
- ZAState DesiredIncomingState = Block.Insts.front().NeededState;
- if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
- EdgeStateCounts[DesiredIncomingState]++;
+ bool LegalInEdge =
+ InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState);
+      bool LegalOutEdge =
+ OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState);
+ if (LegalInEdge) {
LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
- << getZAStateString(DesiredIncomingState));
+ << getZAStateString(Block.DesiredIncomingState));
+ EdgeStateCounts[Block.DesiredIncomingState]++;
}
- ZAState DesiredOutgoingState = Block.Insts.back().NeededState;
- if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
- EdgeStateCounts[DesiredOutgoingState]++;
+      if (LegalOutEdge) {
LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
- << getZAStateString(DesiredOutgoingState));
+ << getZAStateString(Block.DesiredOutgoingState));
+ EdgeStateCounts[Block.DesiredOutgoingState]++;
}
+      if (!LegalInEdge && !LegalOutEdge)
+ LLVM_DEBUG(dbgs() << " (no state preference)");
LLVM_DEBUG(dbgs() << '\n');
}
ZAState BundleState =
ZAState(max_element(EdgeStateCounts) - EdgeStateCounts);
- // Force ZA to be active in bundles that don't have a preferred state.
- // TODO: Something better here (to avoid extra mode switches).
if (BundleState == ZAState::ANY)
BundleState = ZAState::ACTIVE;
@@ -918,6 +987,43 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
FunctionInfo FnInfo = collectNeededZAStates(SMEFnAttrs);
+
+ if (OptLevel != CodeGenOptLevel::None) {
+ // Propagate desired states forward, then backwards. Most of the propagation
+ // should be done in the forward step, and backwards propagation is then
+ // used to fill in the gaps. Note: Doing both in one step can give poor
+ // results. For example, consider this subgraph:
+ //
+ // ┌─────┐
+ // ┌─┤ BB0 ◄───┐
+ // │ └─┬───┘ │
+ // │ ┌─▼───◄──┐│
+ // │ │ BB1 │ ││
+ // │ └─┬┬──┘ ││
+ // │ │└─────┘│
+ // │ ┌─▼───┐ │
+ // │ │ BB2 ├───┘
+ // │ └─┬───┘
+ // │ ┌─▼───┐
+ // └─► BB3 │
+ // └─────┘
+ //
+ // If:
+ // - "BB0" and "BB2" (outer loop) has no state preference
+ // - "BB1" (inner loop) desires the ACTIVE state on entry/exit
+ // - "BB3" desires the LOCAL_SAVED state on entry
+ //
+ // If we propagate forwards first, ACTIVE is propagated from BB1 to BB2,
+    // then from BB2 to BB0, which results in the inner and outer loops having
+ // the "ACTIVE" state. This avoids any state changes in the loops.
+ //
+ // If we propagate backwards first, we _could_ propagate LOCAL_SAVED from
+ // BB3 to BB0, which would result in a transition from ACTIVE -> LOCAL_SAVED
+ // in the outer loop.
+ for (bool Forwards : {true, false})
+ propagateDesiredStates(FnInfo, Forwards);
+ }
+
SmallVector<ZAState> BundleStates = assignBundleZAStates(Bundles, FnInfo);
EmitContext Context;
@@ -941,4 +1047,6 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
return true;
}
-FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); }
+FunctionPass *llvm::createMachineSMEABIPass(CodeGenOptLevel OptLevel) {
+ return new MachineSMEABI(OptLevel);
+}
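
A minimal standalone sketch (not part of the patch) of the desired-state propagation that MachineSMEABIPass.cpp gains above, run on the toy CFG from the in-source comment. The Block/State types, the adjacency-list encoding, and the block numbering are assumptions made for illustration; the real pass operates on MachineBasicBlocks and the ZAState enum.

// propagate_sketch.cpp -- worklist propagation of desired ZA-like states,
// forwards (preds -> succs) then backwards (succs -> preds), picking the
// majority legal state among the neighbours. Requires C++17.
#include <algorithm>
#include <array>
#include <cstdio>
#include <initializer_list>
#include <vector>

enum State { ANY, ACTIVE, LOCAL_SAVED, OFF, NUM_STATES };
static const char *Names[] = {"ANY", "ACTIVE", "LOCAL_SAVED", "OFF"};

struct Block {
  std::vector<int> Preds, Succs;
  State DesiredIn = ANY, DesiredOut = ANY;
};

// Only ACTIVE and LOCAL_SAVED can be assigned to edge bundles without losing
// the contents of ZA.
static bool isLegal(State S) { return S == ACTIVE || S == LOCAL_SAVED; }

static void propagate(std::vector<Block> &Blocks, bool Forwards) {
  auto StateOf = [](Block &B, bool Incoming) -> State & {
    return Incoming ? B.DesiredIn : B.DesiredOut;
  };
  // Seed the worklist with blocks whose relevant state is not yet legal.
  std::vector<int> Worklist;
  for (int I = 0, E = (int)Blocks.size(); I != E; ++I)
    if (!isLegal(StateOf(Blocks[I], Forwards)))
      Worklist.push_back(I);
  while (!Worklist.empty()) {
    Block &B = Blocks[Worklist.back()];
    Worklist.pop_back();
    // Count legal states on the relevant side of the neighbouring blocks.
    std::array<int, NUM_STATES> Counts{};
    for (int N : Forwards ? B.Preds : B.Succs) {
      State S = StateOf(Blocks[N], !Forwards);
      if (isLegal(S))
        ++Counts[S];
    }
    State Propagated =
        State(std::max_element(Counts.begin(), Counts.end()) - Counts.begin());
    State &Cur = StateOf(B, Forwards);
    if (Propagated == Cur)
      continue;
    Cur = Propagated;
    if (StateOf(B, !Forwards) == ANY) // fill in the other side if undecided
      StateOf(B, !Forwards) = Propagated;
    for (int N : Forwards ? B.Succs : B.Preds)
      if (!isLegal(StateOf(Blocks[N], Forwards)))
        Worklist.push_back(N);
  }
}

int main() {
  // bb.0 <-> bb.2 form the outer loop, bb.1 is the inner loop, bb.3 the exit.
  std::vector<Block> B(4);
  B[0] = {{2}, {1, 3}};                          // no preference
  B[1] = {{0, 1}, {1, 2}, ACTIVE, ACTIVE};       // inner loop wants ACTIVE
  B[2] = {{1}, {0, 3}};                          // no preference
  B[3] = {{0, 2}, {}, LOCAL_SAVED, LOCAL_SAVED}; // exit wants LOCAL_SAVED
  for (bool Forwards : {true, false})
    propagate(B, Forwards);
  for (int I = 0; I != 4; ++I)
    std::printf("bb.%d in=%s out=%s\n", I, Names[B[I].DesiredIn],
                Names[B[I].DesiredOut]);
}

With this input the sketch ends with bb.0-bb.2 in ACTIVE and bb.3 in LOCAL_SAVED, i.e. no state change inside either loop, matching the forwards-then-backwards reasoning in the comment above.
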
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ce2b4a5..cd8b249 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -562,9 +562,13 @@ public:
void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
extern char &AMDGPURewriteAGPRCopyMFMALegacyID;
+void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &);
+extern char &AMDGPUUniformIntrinsicCombineLegacyPassID;
+FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass();
+
struct AMDGPUUniformIntrinsicCombinePass
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
- PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 0eb00cb..529da8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr);
struct ImageDimIntrinsicInfo {
unsigned Intr;
unsigned BaseOpcode;
+ unsigned AtomicNoRetBaseOpcode;
MIMGDim Dim;
uint8_t NumOffsetArgs;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 97c2c9c..9ce1224 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2006,19 +2006,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
+ unsigned IntrOpcode = Intr->BaseOpcode;
+
+  // For image atomics, use the no-return opcode if the result is unused.
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
+ Register ResultDef = MI.getOperand(0).getReg();
+ if (MRI->use_nodbg_empty(ResultDef))
+ IntrOpcode = Intr->AtomicNoRetBaseOpcode;
+ }
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
- AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+ AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
- unsigned IntrOpcode = Intr->BaseOpcode;
const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
- Register VDataIn, VDataOut;
+ Register VDataIn = AMDGPU::NoRegister;
+ Register VDataOut = AMDGPU::NoRegister;
LLT VDataTy;
int NumVDataDwords = -1;
bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
@@ -2049,7 +2057,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
unsigned DMaskLanes = 0;
if (BaseOpcode->Atomic) {
- VDataOut = MI.getOperand(0).getReg();
+ if (!BaseOpcode->NoReturn)
+ VDataOut = MI.getOperand(0).getReg();
VDataIn = MI.getOperand(2).getReg();
LLT Ty = MRI->getType(VDataIn);
@@ -2099,8 +2108,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
- if (BaseOpcode->Atomic)
- CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+ // Keep GLC only when the atomic's result is actually used.
+ if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
+ CPol |= AMDGPU::CPol::GLC;
if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
AMDGPU::CPol::VOLATILE))
return false;
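
A small standalone sketch (not part of the patch) of the decision the selector change above makes for image atomics: pick the no-return opcode when the destination has no non-debug uses, and only request the old value back (GLC) when it is actually consumed. The opcode and cache-policy constants below are made-up stand-ins; the real code uses Intr->AtomicNoRetBaseOpcode and AMDGPU::CPol::GLC.

#include <cstdint>
#include <cstdio>

// Made-up stand-ins for the real MIMG opcode and cache-policy constants.
constexpr unsigned IMAGE_ATOMIC_ADD = 0;       // returning variant
constexpr unsigned IMAGE_ATOMIC_ADD_NORTN = 1; // no-return variant
constexpr uint32_t CPOL_GLC = 1u << 0;         // "return the old value" bit

struct Selection {
  unsigned Opcode;
  uint32_t CPol;
};

// Mirrors the decision in selectImageIntrinsic: use the no-return variant for
// a dead result, and only set GLC when the pre-op value is consumed.
Selection selectImageAtomic(bool ResultHasUses, uint32_t CPol) {
  Selection S;
  S.Opcode = ResultHasUses ? IMAGE_ATOMIC_ADD : IMAGE_ATOMIC_ADD_NORTN;
  S.CPol = ResultHasUses ? (CPol | CPOL_GLC) : CPol;
  return S;
}

int main() {
  Selection Used = selectImageAtomic(/*ResultHasUses=*/true, /*CPol=*/0);
  Selection Dead = selectImageAtomic(/*ResultHasUses=*/false, /*CPol=*/0);
  std::printf("result used: opcode=%u cpol=%u\n", Used.Opcode, Used.CPol);
  std::printf("result dead: opcode=%u cpol=%u\n", Dead.Opcode, Dead.CPol);
  return 0;
}

Skipping GLC for dead results means the memory pipeline is never asked to return the pre-op value that nothing would read.
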
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index a6074ea..bf6f1a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -30,7 +30,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
-MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
@@ -69,6 +68,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef FUNCTION_PASS
#ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6214f4d..75a94ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -619,6 +619,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
+ initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -887,9 +888,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());
-
- if (EnableUniformIntrinsicCombine)
- PM.addPass(AMDGPUUniformIntrinsicCombinePass());
});
PB.registerPeepholeEPCallback(
@@ -900,6 +898,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
FPM.addPass(AMDGPUUseNativeCallsPass());
if (EnableLibCallSimplify)
FPM.addPass(AMDGPUSimplifyLibCallsPass());
+
+ if (EnableUniformIntrinsicCombine)
+ FPM.addPass(AMDGPUUniformIntrinsicCombinePass());
});
PB.registerCGSCCOptimizerLateEPCallback(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 50c78d8..65e6ed9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -16,12 +16,6 @@
/// uniformity. And every instruction that's downstream and cares about dynamic
/// uniformity must be convergent (and isel will introduce v_readfirstlane for
/// them if their operands can't be proven statically uniform).
-///
-/// This pass is implemented as a ModulePass because intrinsic declarations
-/// exist at the module scope, allowing us to skip processing entirely if no
-/// declarations are present and to traverse their user lists directly when
-/// they are. A FunctionPass would instead require scanning every instruction
-/// in every function to find relevant intrinsics, which is far less efficient.
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
@@ -97,14 +91,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
Tracker[NotOp] = true; // NOT preserves uniformity
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
ICmp->replaceAllUsesWith(NotOp);
- ICmp->eraseFromParent();
Changed = true;
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
// Case: (icmp ne %ballot, 0) -> %ballot_arg
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
<< *Src << '\n');
ICmp->replaceAllUsesWith(Src);
- ICmp->eraseFromParent();
Changed = true;
}
}
@@ -120,15 +112,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
return false;
}
-/// Iterates over intrinsic declarations in the module to optimize their uses.
-static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
+/// Iterates over the intrinsic calls in the function and optimizes their uses.
+static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
bool IsChanged = false;
ValueMap<const Value *, bool> Tracker;
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- for (Function &F : M) {
- switch (F.getIntrinsicID()) {
+ for (Instruction &I : make_early_inc_range(instructions(F))) {
+ auto *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ continue;
+
+ switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane:
@@ -137,23 +131,61 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
default:
continue;
}
-
- for (User *U : make_early_inc_range(F.users())) {
- auto *II = cast<IntrinsicInst>(U);
- Function *ParentF = II->getFunction();
- const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
- IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
- }
+ IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
}
return IsChanged;
}
PreservedAnalyses
-AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!runUniformIntrinsicCombine(M, AM))
+AMDGPUUniformIntrinsicCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const auto &UI = AM.getResult<UniformityInfoAnalysis>(F);
+ if (!runUniformIntrinsicCombine(F, UI))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<UniformityInfoAnalysis>();
return PA;
}
+
+namespace {
+class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
+public:
+ static char ID;
+ AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {
+ initializeAMDGPUUniformIntrinsicCombineLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+private:
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<UniformityInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ }
+};
+} // namespace
+
+char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;
+char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
+ AMDGPUUniformIntrinsicCombineLegacy::ID;
+
+bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ const UniformityInfo &UI =
+ getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+ return runUniformIntrinsicCombine(F, UI);
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
+ "AMDGPU Uniform Intrinsic Combine", false, false)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
+ "AMDGPU Uniform Intrinsic Combine", false, false)
+
+FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
+ return new AMDGPUUniformIntrinsicCombineLegacy();
+}
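
A minimal standalone sketch (not part of the patch) of why the ballot folds performed by optimizeUniformIntrinsic above are sound when the ballot source is uniform. The wave model is an assumption for illustration: it fixes a wave64 wavefront with all lanes active, whereas the pass relies on UniformityInfo rather than simulating lanes.

#include <cassert>
#include <cstdint>
#include <initializer_list>

constexpr int WaveSize = 64; // assume wave64 with all lanes active

// Toy model of amdgcn.ballot: each active lane contributes one bit. With a
// uniform condition, every lane contributes the same value.
uint64_t ballot(bool UniformCond) {
  uint64_t Mask = 0;
  for (int Lane = 0; Lane < WaveSize; ++Lane)
    if (UniformCond)
      Mask |= uint64_t(1) << Lane;
  return Mask;
}

int main() {
  for (bool X : {false, true}) {
    uint64_t B = ballot(X);
    assert((B == 0) == !X); // icmp eq (ballot(x), 0) -> !x
    assert((B != 0) == X);  // icmp ne (ballot(x), 0) -> x
  }
  return 0;
}

This is the same reasoning that lets the pass replace the compare with the ballot argument (or its negation) once UniformityInfo proves the argument uniform.
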
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a911e7e..52cc4ca 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -3267,29 +3267,103 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
return false;
assert(!ST.hasExtendedWaitCounts());
- if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
+ if (!ST.isWave64())
+ return false;
+
+ const bool IsSALU = SIInstrInfo::isSALU(*MI);
+ const bool IsVALU = SIInstrInfo::isVALU(*MI);
+ if (!IsSALU && !IsVALU)
return false;
// The hazard sequence is three instructions:
// 1. VALU reads SGPR as mask
- // 2. SALU writes SGPR
- // 3. SALU reads SGPR
- // The hazard can expire if the distance between 2 and 3 is sufficient.
- // In practice this happens <10% of the time, hence this always assumes
- // the hazard exists if 1 and 2 are present to avoid searching.
+ // 2. VALU/SALU writes SGPR
+ // 3. VALU/SALU reads SGPR
+  // The hazard can expire if the distance between 2 and 3 is sufficient,
+  // or if (2) is a VALU and (3) is a SALU.
+ // In practice this happens <10% of the time, hence always assume the hazard
+ // exists if (1) and (2) are present to avoid searching all SGPR reads.
- const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
- if (!SDSTOp || !SDSTOp->isReg())
- return false;
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ auto IgnoreableSGPR = [](const Register Reg) {
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::M0:
+ case AMDGPU::SGPR_NULL:
+ case AMDGPU::SGPR_NULL64:
+ case AMDGPU::SCC:
+ return true;
+ default:
+ return false;
+ }
+ };
+ auto IsVCC = [](const Register Reg) {
+ return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
+ };
+
+ struct StateType {
+ SmallSet<Register, 2> HazardSGPRs;
+
+ static unsigned getHashValue(const StateType &State) {
+ return hash_combine_range(State.HazardSGPRs);
+ }
+ static bool isEqual(const StateType &LHS, const StateType &RHS) {
+ return LHS.HazardSGPRs == RHS.HazardSGPRs;
+ }
+ };
+
+ SmallVector<const MachineInstr *> WaitInstrs;
+ bool HasSGPRRead = false;
+ StateType InitialState;
+
+ // Look for SGPR write.
+ MachineOperand *HazardDef = nullptr;
+ for (MachineOperand &Op : MI->operands()) {
+ if (!Op.isReg())
+ continue;
+ if (Op.isDef() && HazardDef)
+ continue;
+
+ Register Reg = Op.getReg();
+ if (IgnoreableSGPR(Reg))
+ continue;
+ if (!IsVCC(Reg)) {
+ if (Op.isImplicit())
+ continue;
+ if (!TRI->isSGPRReg(MRI, Reg))
+ continue;
+ }
+ // Also check for SGPR reads.
+ if (Op.isUse()) {
+ HasSGPRRead = true;
+ continue;
+ }
+
+ assert(!HazardDef);
+ HazardDef = &Op;
+ }
- const Register HazardReg = SDSTOp->getReg();
- if (HazardReg == AMDGPU::EXEC ||
- HazardReg == AMDGPU::EXEC_LO ||
- HazardReg == AMDGPU::EXEC_HI ||
- HazardReg == AMDGPU::M0)
+ if (!HazardDef)
return false;
- auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
+  // Set up tracking of writes to individual SGPRs
+ const Register HazardReg = HazardDef->getReg();
+ if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
+ InitialState.HazardSGPRs.insert(HazardReg);
+ } else {
+ assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
+ InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
+ InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
+ }
+
+ auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
+ if (State.HazardSGPRs.empty())
+ return HazardExpired;
+
switch (I.getOpcode()) {
case AMDGPU::V_ADDC_U32_e32:
case AMDGPU::V_ADDC_U32_dpp:
@@ -3304,11 +3378,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
case AMDGPU::V_SUBB_U32_e32:
case AMDGPU::V_SUBB_U32_dpp:
case AMDGPU::V_SUBBREV_U32_e32:
- case AMDGPU::V_SUBBREV_U32_dpp:
+ case AMDGPU::V_SUBBREV_U32_dpp: {
// These implicitly read VCC as mask source.
- return HazardReg == AMDGPU::VCC ||
- HazardReg == AMDGPU::VCC_LO ||
- HazardReg == AMDGPU::VCC_HI;
+ return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
+ }
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_ADDC_U32_e64_dpp:
case AMDGPU::V_CNDMASK_B16_t16_e64:
@@ -3324,68 +3397,109 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
// Only check mask register overlaps.
const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
assert(SSRCOp);
- return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
+ bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
+ return Result ? HazardFound : NoHazardFound;
}
default:
- return false;
+ return NoHazardFound;
}
};
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
- // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
- if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
- return true;
-
- // VALU access to any SGPR or literal constant other than HazardReg
- // mitigates hazard. No need to check HazardReg here as this will
- // only be called when !IsHazardFn.
- if (!SIInstrInfo::isVALU(I))
- return false;
- for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
- const MachineOperand &Op = I.getOperand(OpNo);
- if (Op.isReg()) {
- Register OpReg = Op.getReg();
- // Only consider uses
- if (!Op.isUse())
+ const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
+ AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0),
+ 0);
+ auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
+ switch (I.getOpcode()) {
+ case AMDGPU::S_WAITCNT_DEPCTR:
+      // Record mergeable waits within a region of instructions free of SGPR reads.
+ if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
+ (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
+ WaitInstrs.push_back(&I);
+ break;
+ default:
+ // Update tracking of SGPR reads and writes.
+ for (auto &Op : I.operands()) {
+ if (!Op.isReg())
continue;
- // Ignore EXEC
- if (OpReg == AMDGPU::EXEC ||
- OpReg == AMDGPU::EXEC_LO ||
- OpReg == AMDGPU::EXEC_HI)
+
+ Register Reg = Op.getReg();
+ if (IgnoreableSGPR(Reg))
continue;
- // Ignore all implicit uses except VCC
- if (Op.isImplicit()) {
- if (OpReg == AMDGPU::VCC ||
- OpReg == AMDGPU::VCC_LO ||
- OpReg == AMDGPU::VCC_HI)
- return true;
+ if (!IsVCC(Reg)) {
+ if (Op.isImplicit())
+ continue;
+ if (!TRI->isSGPRReg(MRI, Reg))
+ continue;
+ }
+ if (Op.isUse()) {
+ HasSGPRRead = true;
continue;
}
- if (TRI.isSGPRReg(MRI, OpReg))
- return true;
- } else {
- const MCInstrDesc &InstDesc = I.getDesc();
- const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
- if (!TII.isInlineConstant(Op, OpInfo))
- return true;
+
+ // Stop tracking any SGPRs with writes on the basis that they will
+ // already have an appropriate wait inserted afterwards.
+ SmallVector<Register, 2> Found;
+ for (Register SGPR : State.HazardSGPRs) {
+ if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
+ Found.push_back(SGPR);
+ }
+ for (Register SGPR : Found)
+ State.HazardSGPRs.erase(SGPR);
}
+ break;
}
- return false;
};
// Check for hazard
- if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
- std::numeric_limits<int>::max())
+ if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
+ MI->getParent(),
+ std::next(MI->getReverseIterator())))
return false;
- auto NextMI = std::next(MI->getIterator());
+ // Compute counter mask
+ unsigned DepCtr =
+ IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
+ : AMDGPU::DepCtr::encodeFieldVaSdst(0))
+ : AMDGPU::DepCtr::encodeFieldSaSdst(0);
+
+ // Try to merge previous waits into this one for regions with no SGPR reads.
+ if (!WaitInstrs.empty()) {
+ // Note: WaitInstrs contains const pointers, so walk backward from MI to
+ // obtain a mutable pointer to each instruction to be merged.
+ // This is expected to be a very short walk within the same block.
+ SmallVector<MachineInstr *> ToErase;
+ unsigned Found = 0;
+ for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
+ End = MI->getParent()->rend();
+ Found < WaitInstrs.size() && It != End; ++It) {
+ MachineInstr *WaitMI = &*It;
+ // Find next wait instruction.
+ if (std::as_const(WaitMI) != WaitInstrs[Found])
+ continue;
+ Found++;
+ unsigned WaitMask = WaitMI->getOperand(0).getImm();
+ assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
+ DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
+ DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
+ AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
+ DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
+ DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
+ AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
+ DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
+ DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
+ AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
+ ToErase.push_back(WaitMI);
+ }
+ assert(Found == WaitInstrs.size());
+ for (MachineInstr *WaitMI : ToErase)
+ WaitMI->eraseFromParent();
+ }
- // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
+ // Add s_waitcnt_depctr after SGPR write.
+ auto NextMI = std::next(MI->getIterator());
auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ .addImm(DepCtr);
// SALU write may be s_getpc in a bundle.
updateGetPCBundle(NewMI);
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 5f6d742..d950131 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -877,69 +877,69 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> {
}
class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc,
- RegisterClass addr_rc, string dns="">
- : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> {
- let Constraints = "$vdst = $vdata";
-
+ RegisterClass addr_rc, bit noRtn, string dns="">
+ : MIMG_gfx6789 <op, !if(noRtn, (outs), (outs data_rc:$vdst)), dns> {
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
- let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da";
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da";
}
class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc,
- RegisterClass addr_rc, string dns="">
- : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> {
- let Constraints = "$vdst = $vdata";
-
+ RegisterClass addr_rc, bit noRtn, string dns="">
+ : MIMG_gfx90a <op, !if(noRtn, (outs), (outs getAlign2RegOp<data_rc>.ret:$vdst)), dns> {
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata,
addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, LWE:$lwe, DA:$da);
- let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da";
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da";
}
class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc,
- RegisterClass addr_rc, bit enableDasm = 0>
- : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc,
+ RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0>
+ : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, noRtn,
!if(enableDasm, "GFX6GFX7", "")> {
let AssemblerPredicate = isGFX6GFX7;
}
class MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc,
- RegisterClass addr_rc, bit enableDasm = 0>
- : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> {
+ RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0>
+ : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX8", "")> {
let AssemblerPredicate = isGFX8GFX9NotGFX90A;
let MIMGEncoding = MIMGEncGfx8;
}
class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc,
- RegisterClass addr_rc, bit enableDasm = 0>
- : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> {
+ RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0>
+ : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX90A", "")> {
let AssemblerPredicate = isGFX90APlus;
let MIMGEncoding = MIMGEncGfx90a;
}
class MIMG_Atomic_gfx10<mimgopc op, string opcode,
RegisterOperand DataRC, RegisterClass AddrRC,
- bit enableDisasm = 0>
- : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst),
+ bit noRtn = 0, bit enableDisasm = 0>
+ : MIMG_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)),
!if(enableDisasm, "GFX10", "")> {
- let Constraints = "$vdst = $vdata";
-
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe);
- let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
}
class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
RegisterOperand DataRC, int num_addrs,
- bit enableDisasm = 0>
- : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs,
+ bit noRtn = 0, bit enableDisasm = 0>
+ : MIMG_nsa_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs,
!if(enableDisasm, "GFX10", "")> {
- let Constraints = "$vdst = $vdata";
-
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
(ins SReg_256_XNULL:$srsrc, DMask:$dmask,
@@ -950,24 +950,24 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
class MIMG_Atomic_gfx11<mimgopc op, string opcode,
RegisterOperand DataRC, RegisterClass AddrRC,
- bit enableDisasm = 0>
- : MIMG_gfx11<op.GFX11, (outs DataRC:$vdst),
+ bit noRtn = 0, bit enableDisasm = 0>
+ : MIMG_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)),
!if(enableDisasm, "GFX11", "")> {
- let Constraints = "$vdst = $vdata";
-
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe);
- let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
}
class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
RegisterOperand DataRC, int num_addrs,
- bit enableDisasm = 0>
- : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs,
+ bit noRtn = 0, bit enableDisasm = 0>
+ : MIMG_nsa_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs,
!if(enableDisasm, "GFX11", "")> {
- let Constraints = "$vdst = $vdata";
-
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
(ins SReg_256_XNULL:$srsrc, DMask:$dmask,
@@ -977,11 +977,11 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
}
class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC,
- int num_addrs, string renamed, bit enableDisasm = 0>
- : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs,
+ int num_addrs, string renamed, bit noRtn = 0, bit enableDisasm = 0>
+ : VIMAGE_gfx12<op.GFX12, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs,
!if(enableDisasm, "GFX12", "")> {
- let Constraints = "$vdst = $vdata";
-
+ let Constraints = !if(noRtn, "", "$vdst = $vdata");
+ let isCodeGenOnly = noRtn;
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
(ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim,
@@ -994,95 +994,96 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
RegisterOperand data_rc,
bit enableDasm = 0,
bit isFP = 0,
+ bit noRtn = 0,
string renamed = ""> {
let hasSideEffects = 1, // FIXME: remove this
mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1,
- FPAtomic = isFP in {
+ FPAtomic = isFP, IsAtomicNoRet = noRtn in {
let VAddrDwords = 1 in {
let ssamp = 0 in {
if op.HAS_SI then {
- def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
}
if op.HAS_VI then {
- def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
let hasPostISelHook = 1 in
- def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
}
if op.HAS_GFX10M then {
- def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
}
if op.HAS_GFX11 then {
- def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
}
}
if op.HAS_GFX12 then {
- def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed>;
+ def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed, noRtn>;
}
}
let VAddrDwords = 2 in {
let ssamp = 0 in {
if op.HAS_SI then {
- def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>;
+ def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, noRtn, 0>;
}
if op.HAS_VI then {
- def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
- def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>;
+ def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, noRtn, 0>;
+ def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, noRtn, 0>;
}
if op.HAS_GFX10M then {
- def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
- def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>;
+ def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, noRtn, 0>;
+ def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, noRtn, 0>;
}
if op.HAS_GFX11 then {
- def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>;
- def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>;
+ def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, noRtn, 0>;
+ def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, noRtn, 0>;
}
}
if op.HAS_GFX12 then {
- def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed>;
+ def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed, noRtn>;
}
}
let VAddrDwords = 3 in {
let ssamp = 0 in {
if op.HAS_SI then {
- def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>;
+ def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, noRtn, 0>;
}
if op.HAS_VI then {
- def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
- def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>;
+ def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, noRtn, 0>;
+ def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, noRtn, 0>;
}
if op.HAS_GFX10M then {
- def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
- def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>;
+ def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, noRtn, 0>;
+ def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, noRtn, 0>;
}
if op.HAS_GFX11 then {
- def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>;
- def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>;
+ def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, noRtn, 0>;
+ def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, noRtn, 0>;
}
}
if op.HAS_GFX12 then {
- def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed>;
+ def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed, noRtn>;
}
}
let VAddrDwords = 4 in {
let ssamp = 0 in {
if op.HAS_SI then {
- def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>;
+ def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, noRtn, 0>;
}
if op.HAS_VI then {
- def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
- def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>;
+ def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, noRtn, 0>;
+ def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, noRtn, 0>;
}
if op.HAS_GFX10M then {
- def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
- def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
+ def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, noRtn, 0>;
+ def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, noRtn, enableDasm>;
}
if op.HAS_GFX11 then {
- def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>;
- def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>;
+ def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, noRtn, 0>;
+ def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, noRtn, enableDasm>;
}
}
if op.HAS_GFX12 then {
- def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, enableDasm>;
+ def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, noRtn, enableDasm>;
}
}
}
@@ -1095,12 +1096,13 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
}
}
-multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
- string renamed = ""> { // 64-bit atomics
- let IsAtomicRet = 1 in {
+multiclass MIMG_Atomic_Base <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
+ bit noRtn = 0, string renamed = ""> { // 64-bit atomics
+ let IsAtomicRet = !not(noRtn) in {
def "" : MIMGBaseOpcode {
let Atomic = 1;
let AtomicX2 = isCmpSwap;
+ let NoReturn = noRtn;
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
@@ -1109,22 +1111,28 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
// Other variants are reconstructed by disassembler using dmask and tfe.
if !not(isCmpSwap) then {
let VDataDwords = 1 in
- defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>;
+ defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, noRtn, renamed>;
}
let VDataDwords = 2 in
- defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>;
+ defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, noRtn, renamed>;
let VDataDwords = 3 in
- defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>;
+ defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, noRtn, renamed>;
if isCmpSwap then {
let VDataDwords = 4 in
- defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>;
+ defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, noRtn, renamed>;
let VDataDwords = 5 in
- defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>;
+ defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, noRtn, renamed>;
}
}
- } // End IsAtomicRet = 1
+ }
+}
+
+multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
+ string renamed = ""> {
+ defm "" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/0, renamed>;
+ defm "_NORTN" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/1, renamed>;
}
multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed,
@@ -1820,6 +1828,7 @@ let SubtargetPredicate = isGFX12Plus in {
class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
Intrinsic Intr = I;
MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod));
+ MIMGBaseOpcode AtomicNoRetBaseOpcode = BaseOpcode;
AMDGPUDimProps Dim = I.P.Dim;
AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>;
@@ -1855,13 +1864,20 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0));
}
+class ImageDimAtomicIntrinsicInfo<AMDGPUImageDimIntrinsic I>
+ : ImageDimIntrinsicInfo<I> {
+ MIMGBaseOpcode AtomicNoRetBaseOpcode =
+ !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod, "_NORTN"));
+}
+
def ImageDimIntrinsicTable : GenericTable {
let FilterClass = "ImageDimIntrinsicInfo";
- let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs",
- "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd",
- "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex",
+ let Fields = ["Intr", "BaseOpcode", "AtomicNoRetBaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData",
+ "NumVAddrs", "NumArgs", "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex",
+ "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex",
"BiasTyArg", "GradientTyArg", "CoordTyArg"];
string TypeOf_BaseOpcode = "MIMGBaseOpcode";
+ string TypeOf_AtomicNoRetBaseOpcode = "MIMGBaseOpcode";
string TypeOf_Dim = "MIMGDim";
let PrimaryKey = ["Intr"];
@@ -1874,11 +1890,14 @@ def getImageDimIntrinsicByBaseOpcode : SearchIndex {
let Key = ["BaseOpcode", "Dim"];
}
-foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
- AMDGPUImageDimAtomicIntrinsics) in {
+foreach intr = AMDGPUImageDimIntrinsics in {
def : ImageDimIntrinsicInfo<intr>;
}
+foreach intr = AMDGPUImageDimAtomicIntrinsics in {
+ def : ImageDimAtomicIntrinsicInfo<intr>;
+}
+
// L to LZ Optimization Mapping
def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>;
def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index be42291..b34ab2a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9134,16 +9134,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SDLoc DL(Op);
MachineFunction &MF = DAG.getMachineFunction();
const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+ unsigned IntrOpcode = Intr->BaseOpcode;
+  // For image atomics, use the no-return opcode if the result is unused.
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
+ !Op.getNode()->hasAnyUseOfValue(0))
+ IntrOpcode = Intr->AtomicNoRetBaseOpcode;
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
- AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+ AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
- unsigned IntrOpcode = Intr->BaseOpcode;
bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
SmallVector<EVT, 3> ResultTypes(Op->values());
SmallVector<EVT, 3> OrigResultTypes(Op->values());
+ if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
+ ResultTypes.erase(&ResultTypes[0]);
+
bool IsD16 = false;
bool IsG16 = false;
bool IsA16 = false;
@@ -9162,8 +9169,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
VData = Op.getOperand(2);
IsAtomicPacked16Bit =
- (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
- Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
+ (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
+ IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
+ IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
+ IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
bool Is64Bit = VData.getValueSizeInBits() == 64;
if (BaseOpcode->AtomicX2) {
@@ -9173,7 +9182,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
if (Is64Bit)
VData = DAG.getBitcast(MVT::v4i32, VData);
- ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+ if (!BaseOpcode->NoReturn)
+ ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+
DMask = Is64Bit ? 0xf : 0x3;
NumVDataDwords = Is64Bit ? 4 : 2;
} else {
@@ -9399,8 +9410,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
- if (BaseOpcode->Atomic)
- CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+ // Keep GLC only when the atomic's result is actually used.
+ if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
+ CPol |= AMDGPU::CPol::GLC;
if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
AMDGPU::CPol::VOLATILE))
return Op;
@@ -9512,13 +9524,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
DAG.setNodeMemRefs(NewNode, {MemRef});
}
+ if (BaseOpcode->NoReturn) {
+ if (BaseOpcode->Atomic)
+ return DAG.getMergeValues(
+ {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
+
+ return SDValue(NewNode, 0);
+ }
+
if (BaseOpcode->AtomicX2) {
SmallVector<SDValue, 1> Elt;
DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
}
- if (BaseOpcode->NoReturn)
- return SDValue(NewNode, 0);
+
return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
NumVDataDwords, IsAtomicPacked16Bit, DL);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d80a6f3..a6c1af2 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1823,6 +1823,16 @@ void SIRegisterInfo::buildSpillLoadStore(
}
}
+ Register FinalValueReg = ValueReg;
+ if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) {
+    // If we are loading a 16-bit value with SRAMECC enabled, we need a temp
+    // 32-bit VGPR to load into and extract the 16 bits into the final register.
+ ValueReg =
+ RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
+ SubReg = ValueReg;
+ IsKill = false;
+ }
+
MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
MachineMemOperand *NewMMO =
MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
@@ -1863,6 +1873,17 @@ void SIRegisterInfo::buildSpillLoadStore(
MIB.addImm(0); // swz
MIB.addMemOperand(NewMMO);
+ if (FinalValueReg != ValueReg) {
+    // Extract the 16-bit value from the loaded 32-bit value.
+ ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
+ MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64))
+ .addReg(FinalValueReg, getDefRegState(true))
+ .addImm(0)
+ .addReg(ValueReg, getKillRegState(true))
+ .addImm(0);
+ ValueReg = FinalValueReg;
+ }
+
if (!IsAGPR && NeedSuperRegDef)
MIB.addReg(ValueReg, RegState::ImplicitDefine);
@@ -2505,7 +2526,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned Opc;
if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
- Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16;
+ Opc = ST.d16PreservesUnusedBits()
+ ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
+ : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;
} else {
Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 313ae3d..a4d3d62 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -601,10 +601,20 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
- if (!Subtarget->hasVFP2Base())
+ if (!Subtarget->hasVFP2Base()) {
setAllExpand(MVT::f32);
- if (!Subtarget->hasFP64())
+ } else {
+ for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
+ ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
+ setOperationAction(Op, MVT::f32, Legal);
+ }
+ if (!Subtarget->hasFP64()) {
setAllExpand(MVT::f64);
+ } else {
+ for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
+ ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
+ setOperationAction(Op, MVT::f64, Legal);
+ }
}
if (Subtarget->hasFullFP16()) {
@@ -1281,12 +1291,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+ setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, LibCall);
+ setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, LibCall);
}
// fp16 is a special v7 extension that adds f16 <-> f32 conversions.
if (!Subtarget->hasFP16()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, LibCall);
+ setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, LibCall);
}
// Strict floating-point comparisons need custom lowering.
@@ -1298,12 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
}
- // Use __sincos_stret if available.
- if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
- getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
- setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
- }
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
// FP-ARMv8 implements a lot of rounding-like FP operations.
if (Subtarget->hasFPARMv8Base()) {
@@ -1337,31 +1347,42 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
}
  // FP16 ops often need to be promoted to call lib functions
+ // clang-format off
if (Subtarget->hasFullFP16()) {
- setOperationAction(ISD::FREM, MVT::f16, Promote);
- setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
- setOperationAction(ISD::FSIN, MVT::f16, Promote);
- setOperationAction(ISD::FCOS, MVT::f16, Promote);
- setOperationAction(ISD::FTAN, MVT::f16, Promote);
- setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
- setOperationAction(ISD::FPOWI, MVT::f16, Promote);
- setOperationAction(ISD::FPOW, MVT::f16, Promote);
- setOperationAction(ISD::FEXP, MVT::f16, Promote);
- setOperationAction(ISD::FEXP2, MVT::f16, Promote);
- setOperationAction(ISD::FEXP10, MVT::f16, Promote);
- setOperationAction(ISD::FLOG, MVT::f16, Promote);
- setOperationAction(ISD::FLOG10, MVT::f16, Promote);
- setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::LRINT, MVT::f16, Expand);
setOperationAction(ISD::LROUND, MVT::f16, Expand);
-
- setOperationAction(ISD::FROUND, MVT::f16, Legal);
- setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
- setOperationAction(ISD::FRINT, MVT::f16, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
- setOperationAction(ISD::FCEIL, MVT::f16, Legal);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+
+ for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
+ ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
+ ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS,
+ ISD::FASIN, ISD::FATAN, ISD::FATAN2,
+ ISD::FCOSH, ISD::FSINH, ISD::FTANH,
+ ISD::FTAN, ISD::FEXP, ISD::FEXP2,
+ ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
+ ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW,
+ ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN,
+ ISD::STRICT_FACOS, ISD::STRICT_FASIN, ISD::STRICT_FATAN,
+ ISD::STRICT_FATAN2, ISD::STRICT_FCOSH, ISD::STRICT_FSINH,
+ ISD::STRICT_FTANH, ISD::STRICT_FEXP, ISD::STRICT_FEXP2,
+ ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10,
+ ISD::STRICT_FTAN}) {
+ setOperationAction(Op, MVT::f16, Promote);
+ }
+
+    // Round-to-integer operations need custom lowering for fp16, as Promote
+    // doesn't work because the result type is integer.
+ for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT})
+ setOperationAction(Op, MVT::f16, Custom);
+
+ for (auto Op : {ISD::FROUND, ISD::FROUNDEVEN, ISD::FTRUNC,
+ ISD::FNEARBYINT, ISD::FRINT, ISD::FFLOOR,
+ ISD::FCEIL, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN,
+ ISD::STRICT_FTRUNC, ISD::STRICT_FNEARBYINT, ISD::STRICT_FRINT,
+ ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL}) {
+ setOperationAction(Op, MVT::f16, Legal);
+ }
+ // clang-format on
}
if (Subtarget->hasNEON()) {
@@ -9835,13 +9856,18 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
}
SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
- assert(Subtarget->isTargetDarwin());
-
  // For iOS, we want to call an alternative entry point: __sincos_stret;
  // return values are passed via sret.
SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
+ RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT);
+ RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC);
+ if (SincosStret == RTLIB::Unsupported)
+ return SDValue();
+
+ assert(Subtarget->isTargetDarwin());
+
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
auto PtrVT = getPointerTy(DAG.getDataLayout());
@@ -9871,11 +9897,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
Args.emplace_back(Arg, ArgTy);
- RTLIB::Libcall LC =
- (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
- const char *LibcallName = getLibcallName(LC);
- CallingConv::ID CC = getLibcallCallingConv(LC);
- SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
+ StringRef LibcallName = getLibcallImplName(SincosStret);
+ CallingConv::ID CC = getLibcallImplCallingConv(SincosStret);
+ SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL));
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
@@ -10726,6 +10750,19 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerCMP(Op, DAG);
case ISD::ABS:
return LowerABS(Op, DAG);
+ case ISD::STRICT_LROUND:
+ case ISD::STRICT_LLROUND:
+ case ISD::STRICT_LRINT:
+ case ISD::STRICT_LLRINT: {
+ assert((Op.getOperand(1).getValueType() == MVT::f16 ||
+ Op.getOperand(1).getValueType() == MVT::bf16) &&
+ "Expected custom lowering of rounding operations only for f16");
+ SDLoc DL(Op);
+ SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
+ {Op.getOperand(0), Op.getOperand(1)});
+ return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
+ {Ext.getValue(1), Ext.getValue(0)});
+ }
}
}
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 10d4cd5..f7176a6 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -473,15 +473,15 @@ def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs)>;
// An 'fmul' node with a single use.
let HasOneUse = 1 in
-def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs)>;
+def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (any_fmul node:$lhs, node:$rhs)>;
// An 'fadd' node which checks for single non-hazardous use.
-def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
+def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(any_fadd node:$lhs, node:$rhs),[{
return hasNoVMLxHazardUse(N);
}]>;
// An 'fsub' node which checks for single non-hazardous use.
-def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
+def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(any_fsub node:$lhs, node:$rhs),[{
return hasNoVMLxHazardUse(N);
}]>;
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 6771106..e2cc97b 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -439,14 +439,14 @@ let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FP
def VADDD : ADbI<0b11100, 0b11, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>,
+ [(set DPR:$Dd, (any_fadd DPR:$Dn, (f64 DPR:$Dm)))]>,
Sched<[WriteFPALU64]>;
let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VADDS : ASbIn<0b11100, 0b11, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]>,
+ [(set SPR:$Sd, (any_fadd SPR:$Sn, SPR:$Sm))]>,
Sched<[WriteFPALU32]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
@@ -457,21 +457,21 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP
def VADDH : AHbI<0b11100, 0b11, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
- [(set (f16 HPR:$Sd), (fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
+ [(set (f16 HPR:$Sd), (any_fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VSUBD : ADbI<0b11100, 0b11, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>,
+ [(set DPR:$Dd, (any_fsub DPR:$Dn, (f64 DPR:$Dm)))]>,
Sched<[WriteFPALU64]>;
let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]>,
+ [(set SPR:$Sd, (any_fsub SPR:$Sn, SPR:$Sm))]>,
Sched<[WriteFPALU32]>{
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
@@ -482,42 +482,42 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP
def VSUBH : AHbI<0b11100, 0b11, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
- [(set (f16 HPR:$Sd), (fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
+ [(set (f16 HPR:$Sd), (any_fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VDIVD : ADbI<0b11101, 0b00, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>,
+ [(set DPR:$Dd, (any_fdiv DPR:$Dn, (f64 DPR:$Dm)))]>,
Sched<[WriteFPDIV64]>;
let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VDIVS : ASbI<0b11101, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>,
+ [(set SPR:$Sd, (any_fdiv SPR:$Sn, SPR:$Sm))]>,
Sched<[WriteFPDIV32]>;
let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VDIVH : AHbI<0b11101, 0b00, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
- [(set (f16 HPR:$Sd), (fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
+ [(set (f16 HPR:$Sd), (any_fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPDIV32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VMULD : ADbI<0b11100, 0b10, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>,
+ [(set DPR:$Dd, (any_fmul DPR:$Dn, (f64 DPR:$Dm)))]>,
Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>;
let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VMULS : ASbIn<0b11100, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]>,
+ [(set SPR:$Sd, (any_fmul SPR:$Sn, SPR:$Sm))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
@@ -528,21 +528,21 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP
def VMULH : AHbI<0b11100, 0b10, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
- [(set (f16 HPR:$Sd), (fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
+ [(set (f16 HPR:$Sd), (any_fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VNMULD : ADbI<0b11100, 0b10, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>,
+ [(set DPR:$Dd, (fneg (any_fmul DPR:$Dn, (f64 DPR:$Dm))))]>,
Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>;
let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VNMULS : ASbI<0b11100, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]>,
+ [(set SPR:$Sd, (fneg (any_fmul SPR:$Sn, SPR:$Sm)))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
@@ -553,7 +553,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP
def VNMULH : AHbI<0b11100, 0b10, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
- [(set (f16 HPR:$Sd), (fneg (fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>,
+ [(set (f16 HPR:$Sd), (fneg (any_fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
multiclass vsel_inst<string op, bits<2> opc, int CC> {
@@ -587,7 +587,7 @@ defm VSELGE : vsel_inst<"ge", 0b10, 10>;
defm VSELEQ : vsel_inst<"eq", 0b00, 0>;
defm VSELVS : vsel_inst<"vs", 0b01, 6>;
-multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
+multiclass vmaxmin_inst<string op, bit opc, PatFrags SD> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
isUnpredicable = 1, mayRaiseFPException = 1 in {
def H : AHbInp<0b11101, 0b00, opc,
@@ -610,8 +610,8 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
}
}
-defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>;
-defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>;
+defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, any_fmaxnum>;
+defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, any_fminnum>;
// Match reassociated forms only if not sign dependent rounding.
def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
@@ -746,7 +746,7 @@ let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", "",
- [(set DPR:$Dd, (fpextend SPR:$Sm))]>,
+ [(set DPR:$Dd, (any_fpextend SPR:$Sm))]>,
Sched<[WriteFPCVT]> {
// Instruction operands.
bits<5> Dd;
@@ -766,7 +766,7 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", "",
- [(set SPR:$Sd, (fpround DPR:$Dm))]>,
+ [(set SPR:$Sd, (any_fpround DPR:$Dm))]>,
Sched<[WriteFPCVT]> {
// Instruction operands.
bits<5> Sd;
@@ -796,7 +796,7 @@ def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
-def : FP16Pat<(f32 (fpextend (f16 HPR:$Sm))),
+def : FP16Pat<(f32 (any_fpextend (f16 HPR:$Sm))),
(VCVTBHS (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>;
def : FP16Pat<(f16_to_fp GPR:$a),
(VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
@@ -808,16 +808,16 @@ def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
-def : FP16Pat<(f16 (fpround SPR:$Sm)),
+def : FP16Pat<(f16 (any_fpround SPR:$Sm)),
(COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$Sm), HPR)>;
def : FP16Pat<(fp_to_f16 SPR:$a),
(i32 (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$a), GPR))>;
-def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane),
+def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane),
(v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1),
(VCVTBSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)),
SPR:$src2),
(SSubReg_f16_reg imm:$lane)))>;
-def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane),
+def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane),
(v4f16 (INSERT_SUBREG (v4f16 DPR:$src1),
(VCVTBSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)),
SPR:$src2),
@@ -830,9 +830,9 @@ def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
-def : FP16Pat<(f32 (fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))),
+def : FP16Pat<(f32 (any_fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))),
(VCVTTHS (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane)))>;
-def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))),
+def : FP16Pat<(f32 (any_fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))),
(VCVTTHS (EXTRACT_SUBREG
(v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
(SSubReg_f16_reg imm_odd:$lane)))>;
@@ -844,12 +844,12 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sda,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
-def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
+def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_odd:$lane),
(v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1),
(VCVTTSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)),
SPR:$src2),
(SSubReg_f16_reg imm:$lane)))>;
-def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
+def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_odd:$lane),
(v4f16 (INSERT_SUBREG (v4f16 DPR:$src1),
(VCVTTSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)),
SPR:$src2),
@@ -872,7 +872,7 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
let hasSideEffects = 0;
}
-def : FullFP16Pat<(f64 (fpextend (f16 HPR:$Sm))),
+def : FullFP16Pat<(f64 (any_fpextend (f16 HPR:$Sm))),
(VCVTBHD (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>,
Requires<[HasFPARMv8, HasDPVFP]>;
def : FP16Pat<(f64 (f16_to_fp GPR:$a)),
@@ -898,7 +898,7 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
let hasSideEffects = 0;
}
-def : FullFP16Pat<(f16 (fpround DPR:$Dm)),
+def : FullFP16Pat<(f16 (any_fpround DPR:$Dm)),
(COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$Dm), HPR)>,
Requires<[HasFPARMv8, HasDPVFP]>;
def : FP16Pat<(fp_to_f16 (f64 DPR:$a)),
@@ -1007,41 +1007,41 @@ multiclass vcvt_inst<string opc, bits<2> rm,
let Predicates = [HasFPARMv8] in {
let Predicates = [HasFullFP16] in {
- def : Pat<(i32 (fp_to_sint (node (f16 HPR:$a)))),
+ def : Pat<(i32 (any_fp_to_sint (node (f16 HPR:$a)))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"SH") (f16 HPR:$a)),
GPR)>;
- def : Pat<(i32 (fp_to_uint (node (f16 HPR:$a)))),
+ def : Pat<(i32 (any_fp_to_uint (node (f16 HPR:$a)))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"UH") (f16 HPR:$a)),
GPR)>;
}
- def : Pat<(i32 (fp_to_sint (node SPR:$a))),
+ def : Pat<(i32 (any_fp_to_sint (node SPR:$a))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"SS") SPR:$a),
GPR)>;
- def : Pat<(i32 (fp_to_uint (node SPR:$a))),
+ def : Pat<(i32 (any_fp_to_uint (node SPR:$a))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"US") SPR:$a),
GPR)>;
}
let Predicates = [HasFPARMv8, HasDPVFP] in {
- def : Pat<(i32 (fp_to_sint (node (f64 DPR:$a)))),
+ def : Pat<(i32 (any_fp_to_sint (node (f64 DPR:$a)))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"SD") DPR:$a),
GPR)>;
- def : Pat<(i32 (fp_to_uint (node (f64 DPR:$a)))),
+ def : Pat<(i32 (any_fp_to_uint (node (f64 DPR:$a)))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"UD") DPR:$a),
GPR)>;
}
}
-defm VCVTA : vcvt_inst<"a", 0b00, fround>;
+defm VCVTA : vcvt_inst<"a", 0b00, any_fround>;
defm VCVTN : vcvt_inst<"n", 0b01>;
-defm VCVTP : vcvt_inst<"p", 0b10, fceil>;
-defm VCVTM : vcvt_inst<"m", 0b11, ffloor>;
+defm VCVTP : vcvt_inst<"p", 0b10, any_fceil>;
+defm VCVTM : vcvt_inst<"m", 0b11, any_ffloor>;
def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
(outs DPR:$Dd), (ins DPR:$Dm),
@@ -1103,9 +1103,9 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node,
Requires<[HasFPARMv8,HasDPVFP]>;
}
-defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc, [], 0>;
-defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint, [FPSCR_RM], 0>;
-defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint, [FPSCR_RM], 1>;
+defm VRINTZ : vrint_inst_zrx<"z", 0, 1, any_ftrunc, [], 0>;
+defm VRINTR : vrint_inst_zrx<"r", 0, 0, any_fnearbyint, [FPSCR_RM], 0>;
+defm VRINTX : vrint_inst_zrx<"x", 1, 0, any_frint, [FPSCR_RM], 1>;
multiclass vrint_inst_anpm<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
@@ -1145,30 +1145,31 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm,
Requires<[HasFPARMv8,HasDPVFP]>;
}
-defm VRINTA : vrint_inst_anpm<"a", 0b00, fround>;
-defm VRINTN : vrint_inst_anpm<"n", 0b01, froundeven>;
-defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>;
-defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>;
+defm VRINTA : vrint_inst_anpm<"a", 0b00, any_fround>;
+defm VRINTN : vrint_inst_anpm<"n", 0b01, any_froundeven>;
+defm VRINTP : vrint_inst_anpm<"p", 0b10, any_fceil>;
+defm VRINTM : vrint_inst_anpm<"m", 0b11, any_ffloor>;
+
let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs DPR:$Dd), (ins DPR:$Dm),
IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", "",
- [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>,
+ [(set DPR:$Dd, (any_fsqrt (f64 DPR:$Dm)))]>,
Sched<[WriteFPSQRT64]>;
let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", "",
- [(set SPR:$Sd, (fsqrt SPR:$Sm))]>,
+ [(set SPR:$Sd, (any_fsqrt SPR:$Sm))]>,
Sched<[WriteFPSQRT32]>;
let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm",
- [(set (f16 HPR:$Sd), (fsqrt (f16 HPR:$Sm)))]>;
+ [(set (f16 HPR:$Sd), (any_fsqrt (f16 HPR:$Sm)))]>;
let hasSideEffects = 0 in {
let isMoveReg = 1 in {
@@ -1509,10 +1510,10 @@ def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
}
let Predicates=[HasVFP2, HasDPVFP] in {
- def : VFPPat<(f64 (sint_to_fp GPR:$a)),
+ def : VFPPat<(f64 (any_sint_to_fp GPR:$a)),
(VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>;
- def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+ def : VFPPat<(f64 (any_sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VSITOD (VLDRS addrmode5:$a))>;
}
@@ -1529,10 +1530,10 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
let D = VFPNeonA8Domain;
}
-def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)),
+def : VFPNoNEONPat<(f32 (any_sint_to_fp GPR:$a)),
(VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>;
-def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+def : VFPNoNEONPat<(f32 (any_sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VSITOS (VLDRS addrmode5:$a))>;
let mayRaiseFPException = 1 in
@@ -1545,7 +1546,7 @@ def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
let isUnpredicable = 1;
}
-def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)),
+def : VFPNoNEONPat<(f16 (any_sint_to_fp GPR:$a)),
(VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;
let mayRaiseFPException = 1 in
@@ -1558,10 +1559,10 @@ def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
}
let Predicates=[HasVFP2, HasDPVFP] in {
- def : VFPPat<(f64 (uint_to_fp GPR:$a)),
+ def : VFPPat<(f64 (any_uint_to_fp GPR:$a)),
(VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>;
- def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+ def : VFPPat<(f64 (any_uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VUITOD (VLDRS addrmode5:$a))>;
}
@@ -1578,10 +1579,10 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
let D = VFPNeonA8Domain;
}
-def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)),
+def : VFPNoNEONPat<(f32 (any_uint_to_fp GPR:$a)),
(VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>;
-def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+def : VFPNoNEONPat<(f32 (any_uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VUITOS (VLDRS addrmode5:$a))>;
let mayRaiseFPException = 1 in
@@ -1594,7 +1595,7 @@ def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
let isUnpredicable = 1;
}
-def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)),
+def : VFPNoNEONPat<(f16 (any_uint_to_fp GPR:$a)),
(VUITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;
// FP -> Int:
@@ -1669,12 +1670,12 @@ def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
}
let Predicates=[HasVFP2, HasDPVFP] in {
- def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))),
+ def : VFPPat<(i32 (any_fp_to_sint (f64 DPR:$a))),
(COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>;
def : VFPPat<(i32 (fp_to_sint_sat (f64 DPR:$a), i32)),
(COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>;
- def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr),
+ def : VFPPat<(alignedstore32 (i32 (any_fp_to_sint (f64 DPR:$a))), addrmode5:$ptr),
(VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>;
def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f64 DPR:$a), i32)), addrmode5:$ptr),
(VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>;
@@ -1693,12 +1694,12 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
let D = VFPNeonA8Domain;
}
-def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)),
+def : VFPNoNEONPat<(i32 (any_fp_to_sint SPR:$a)),
(COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>;
def : VFPPat<(i32 (fp_to_sint_sat SPR:$a, i32)),
(COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>;
-def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
+def : VFPNoNEONPat<(alignedstore32 (i32 (any_fp_to_sint (f32 SPR:$a))),
addrmode5:$ptr),
(VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f32 SPR:$a), i32)),
@@ -1715,7 +1716,7 @@ def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
let isUnpredicable = 1;
}
-def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))),
+def : VFPNoNEONPat<(i32 (any_fp_to_sint (f16 HPR:$a))),
(COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>;
def : VFPPat<(i32 (fp_to_sint_sat (f16 HPR:$a), i32)),
(COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>;
@@ -1730,12 +1731,12 @@ def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
}
let Predicates=[HasVFP2, HasDPVFP] in {
- def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))),
+ def : VFPPat<(i32 (any_fp_to_uint (f64 DPR:$a))),
(COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>;
def : VFPPat<(i32 (fp_to_uint_sat (f64 DPR:$a), i32)),
(COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>;
- def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr),
+ def : VFPPat<(alignedstore32 (i32 (any_fp_to_uint (f64 DPR:$a))), addrmode5:$ptr),
(VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>;
def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f64 DPR:$a), i32)), addrmode5:$ptr),
(VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>;
@@ -1754,12 +1755,12 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
let D = VFPNeonA8Domain;
}
-def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)),
+def : VFPNoNEONPat<(i32 (any_fp_to_uint SPR:$a)),
(COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>;
def : VFPPat<(i32 (fp_to_uint_sat SPR:$a, i32)),
(COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>;
-def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
+def : VFPNoNEONPat<(alignedstore32 (i32 (any_fp_to_uint (f32 SPR:$a))),
addrmode5:$ptr),
(VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f32 SPR:$a), i32)),
@@ -1776,7 +1777,7 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
let isUnpredicable = 1;
}
-def : VFPNoNEONPat<(i32 (fp_to_uint (f16 HPR:$a))),
+def : VFPNoNEONPat<(i32 (any_fp_to_uint (f16 HPR:$a))),
(COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>;
def : VFPPat<(i32 (fp_to_uint_sat (f16 HPR:$a), i32)),
(COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>;
@@ -2320,13 +2321,13 @@ def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
// Match @llvm.fma.* intrinsics
// (fma x, y, z) -> (vfms z, x, y)
-def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
+def : Pat<(f64 (any_fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
(VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
+def : Pat<(f32 (any_fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
(VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))),
+def : Pat<(f16 (any_fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))),
(VFMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
@@ -2375,13 +2376,13 @@ def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
// Match @llvm.fma.* intrinsics
// (fma (fneg x), y, z) -> (vfms z, x, y)
-def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
+def : Pat<(f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
(VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
+def : Pat<(f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
(VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))),
+def : Pat<(f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))),
(VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
@@ -2427,23 +2428,23 @@ def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
// Match @llvm.fma.* intrinsics
// (fneg (fma x, y, z)) -> (vfnma z, x, y)
-def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
+def : Pat<(fneg (any_fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
(VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
+def : Pat<(fneg (any_fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
(VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))),
+def : Pat<(fneg (any_fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))),
(VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
-def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
+def : Pat<(f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
(VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
+def : Pat<(f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
(VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))),
+def : Pat<(f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))),
(VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
@@ -2488,23 +2489,23 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
// Match @llvm.fma.* intrinsics
// (fma x, y, (fneg z)) -> (vfnms z, x, y))
-def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
+def : Pat<(f64 (any_fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
(VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
+def : Pat<(f32 (any_fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))),
+def : Pat<(f16 (any_fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))),
(VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
-def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
+def : Pat<(fneg (f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
(VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
+def : Pat<(fneg (f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(fneg (f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))),
+def : Pat<(fneg (f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))),
(VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
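
The ARM VFP patterns above now use the any_* PatFrags, which match both the default DAG nodes and their strict (constrained FP) counterparts, so strict-FP code selects to the same VSQRT/VFMA/VCVT encodings. A minimal C++ sketch of source that exercises that path, assuming it is built with strict FP semantics (for example clang's -ffp-model=strict) so the operations are lowered through llvm.experimental.constrained.* intrinsics:

    #include <cmath>

    // With strict FP semantics, each of these becomes a constrained intrinsic
    // (sitofp, fma, fsqrt variants) that the any_* patterns can still match.
    double convert_fma_sqrt(int i, double x, double y, double z) {
      double d = static_cast<double>(i); // sint_to_fp (strict: constrained sitofp)
      double r = std::fma(x, y, z);      // fma        (strict: constrained fma)
      return std::sqrt(r) + d;           // fsqrt      (strict: constrained sqrt)
    }
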
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 44c4830..7ae500a 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -1058,6 +1058,16 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> {
IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Max>,
IntrinArgI8<SignedOpKind_Unsigned>
]>,
+ IntrinSelect<int_dx_wave_reduce_min,
+ [
+ IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Min>,
+ IntrinArgI8<SignedOpKind_Signed>
+ ]>,
+ IntrinSelect<int_dx_wave_reduce_umin,
+ [
+ IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Min>,
+ IntrinArgI8<SignedOpKind_Unsigned>
+ ]>,
];
let arguments = [OverloadTy, Int8Ty, Int8Ty];
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index e46a393..8720460 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -904,6 +904,8 @@ public:
case Intrinsic::dx_resource_casthandle:
// NOTE: llvm.dbg.value is supported as is in DXIL.
case Intrinsic::dbg_value:
+ // NOTE: llvm.assume is supported as is in DXIL.
+ case Intrinsic::assume:
case Intrinsic::not_intrinsic:
if (F.use_empty())
F.eraseFromParent();
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
index e7e7f2c..ce6e812 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
@@ -94,6 +94,8 @@ static bool checkWaveOps(Intrinsic::ID IID) {
case Intrinsic::dx_wave_reduce_usum:
case Intrinsic::dx_wave_reduce_max:
case Intrinsic::dx_wave_reduce_umax:
+ case Intrinsic::dx_wave_reduce_min:
+ case Intrinsic::dx_wave_reduce_umin:
return true;
}
}
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index 1e4797b..cf8b833 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -36,9 +36,10 @@ using namespace llvm;
using namespace llvm::dxil;
namespace {
-/// A simple Wrapper DiagnosticInfo that generates Module-level diagnostic
-/// for TranslateMetadata pass
-class DiagnosticInfoTranslateMD : public DiagnosticInfo {
+
+/// A simple wrapper of DiagnosticInfo that generates a module-level diagnostic
+/// for the DXILValidateMetadata pass
+class DiagnosticInfoValidateMD : public DiagnosticInfo {
private:
const Twine &Msg;
const Module &Mod;
@@ -47,9 +48,9 @@ public:
/// \p M is the module for which the diagnostic is being emitted. \p Msg is
/// the message to show. Note that this class does not copy this message, so
/// this reference must be valid for the whole life time of the diagnostic.
- DiagnosticInfoTranslateMD(const Module &M,
- const Twine &Msg LLVM_LIFETIME_BOUND,
- DiagnosticSeverity Severity = DS_Error)
+ DiagnosticInfoValidateMD(const Module &M,
+ const Twine &Msg LLVM_LIFETIME_BOUND,
+ DiagnosticSeverity Severity = DS_Error)
: DiagnosticInfo(DK_Unsupported, Severity), Msg(Msg), Mod(M) {}
void print(DiagnosticPrinter &DP) const override {
@@ -57,6 +58,16 @@ public:
}
};
+static void reportError(Module &M, Twine Message,
+ DiagnosticSeverity Severity = DS_Error) {
+ M.getContext().diagnose(DiagnosticInfoValidateMD(M, Message, Severity));
+}
+
+static void reportLoopError(Module &M, Twine Message,
+ DiagnosticSeverity Severity = DS_Error) {
+ reportError(M, Twine("Invalid \"llvm.loop\" metadata: ") + Message, Severity);
+}
+
enum class EntryPropsTag {
ShaderFlags = 0,
GSState,
@@ -314,25 +325,122 @@ static void translateBranchMetadata(Module &M, Instruction *BBTerminatorInst) {
BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr);
}
-static std::array<unsigned, 6> getCompatibleInstructionMDs(llvm::Module &M) {
+// Determines if the metadata node will be compatible with DXIL's loop metadata
+// representation.
+//
+// Reports an error for recognized loop hints that are ill-formed.
+static bool isLoopMDCompatible(Module &M, Metadata *MD) {
+ // DXIL only accepts the following loop hints:
+ std::array<StringLiteral, 3> ValidHintNames = {"llvm.loop.unroll.count",
+ "llvm.loop.unroll.disable",
+ "llvm.loop.unroll.full"};
+
+ MDNode *HintMD = dyn_cast<MDNode>(MD);
+ if (!HintMD || HintMD->getNumOperands() == 0)
+ return false;
+
+ auto *HintStr = dyn_cast<MDString>(HintMD->getOperand(0));
+ if (!HintStr)
+ return false;
+
+ if (!llvm::is_contained(ValidHintNames, HintStr->getString()))
+ return false;
+
+ auto ValidCountNode = [](MDNode *CountMD) -> bool {
+ if (CountMD->getNumOperands() == 2)
+ if (auto *Count = dyn_cast<ConstantAsMetadata>(CountMD->getOperand(1)))
+ if (isa<ConstantInt>(Count->getValue()))
+ return true;
+ return false;
+ };
+
+ if (HintStr->getString() == "llvm.loop.unroll.count") {
+ if (!ValidCountNode(HintMD)) {
+ reportLoopError(M, "\"llvm.loop.unroll.count\" must have 2 operands and "
+ "the second must be a constant integer");
+ return false;
+ }
+ } else if (HintMD->getNumOperands() != 1) {
+ reportLoopError(
+ M, "\"llvm.loop.unroll.disable\" and \"llvm.loop.unroll.full\" "
+ "must be provided as a single operand");
+ return false;
+ }
+
+ return true;
+}
+
+static void translateLoopMetadata(Module &M, Instruction *I, MDNode *BaseMD) {
+ // A distinct node has the self-referential form: !0 = !{ !0, ... }
+ auto IsDistinctNode = [](MDNode *Node) -> bool {
+ return Node && Node->getNumOperands() != 0 && Node == Node->getOperand(0);
+ };
+
+ // Set metadata to null to remove empty/ill-formed metadata from instruction
+ if (BaseMD->getNumOperands() == 0 || !IsDistinctNode(BaseMD))
+ return I->setMetadata("llvm.loop", nullptr);
+
+  // It is valid to have a chain of self-referential loop metadata nodes, as
+ // below. We will collapse these into just one when we reconstruct the
+ // metadata.
+ //
+ // Eg:
+ // !0 = !{!0, !1}
+ // !1 = !{!1, !2}
+ // !2 = !{!"llvm.loop.unroll.disable"}
+ //
+ // So, traverse down a potential self-referential chain
+ while (1 < BaseMD->getNumOperands() &&
+ IsDistinctNode(dyn_cast<MDNode>(BaseMD->getOperand(1))))
+ BaseMD = dyn_cast<MDNode>(BaseMD->getOperand(1));
+
+ // To reconstruct a distinct node we create a temporary node that we will
+ // then update to create a self-reference.
+ llvm::TempMDTuple TempNode = llvm::MDNode::getTemporary(M.getContext(), {});
+ SmallVector<Metadata *> CompatibleOperands = {TempNode.get()};
+
+  // Iterate and reconstruct the metadata node, keeping any hints and
+ // stripping any unrecognized metadata.
+ ArrayRef<MDOperand> Operands = BaseMD->operands();
+ for (auto &Op : Operands.drop_front())
+ if (isLoopMDCompatible(M, Op.get()))
+ CompatibleOperands.push_back(Op.get());
+
+ if (2 < CompatibleOperands.size())
+ reportLoopError(M, "Provided conflicting hints");
+
+ MDNode *CompatibleLoopMD = MDNode::get(M.getContext(), CompatibleOperands);
+ TempNode->replaceAllUsesWith(CompatibleLoopMD);
+
+ I->setMetadata("llvm.loop", CompatibleLoopMD);
+}
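// A minimal sketch (not part of the patch; the helper and its context are
// hypothetical) of producing the one "llvm.loop" shape the code above keeps: a
// self-referential head node whose only extra operand is a recognized hint.
static void attachUnrollCountHint(Module &M, Instruction *Latch, unsigned Count) {
  LLVMContext &Ctx = M.getContext();
  // A temporary node stands in for the self-reference until the real node exists.
  llvm::TempMDTuple Temp = llvm::MDNode::getTemporary(Ctx, {});
  Metadata *HintOps[] = {
      MDString::get(Ctx, "llvm.loop.unroll.count"),
      ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), Count))};
  Metadata *LoopOps[] = {Temp.get(), MDNode::get(Ctx, HintOps)};
  MDNode *LoopID = MDNode::get(Ctx, LoopOps);
  Temp->replaceAllUsesWith(LoopID); // head now references itself, as IsDistinctNode expects
  Latch->setMetadata("llvm.loop", LoopID);
}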
+
+using InstructionMDList = std::array<unsigned, 7>;
+
+static InstructionMDList getCompatibleInstructionMDs(llvm::Module &M) {
return {
M.getMDKindID("dx.nonuniform"), M.getMDKindID("dx.controlflow.hints"),
M.getMDKindID("dx.precise"), llvm::LLVMContext::MD_range,
- llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias};
+ llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias,
+ M.getMDKindID("llvm.loop")};
}
static void translateInstructionMetadata(Module &M) {
// construct allowlist of valid metadata node kinds
- std::array<unsigned, 6> DXILCompatibleMDs = getCompatibleInstructionMDs(M);
+ InstructionMDList DXILCompatibleMDs = getCompatibleInstructionMDs(M);
+  unsigned MDLoopKind = M.getContext().getMDKindID("llvm.loop");
for (Function &F : M) {
for (BasicBlock &BB : F) {
// This needs to be done first so that "hlsl.controlflow.hints" isn't
- // removed in the whitelist below
+ // removed in the allow-list below
if (auto *I = BB.getTerminator())
translateBranchMetadata(M, I);
for (auto &I : make_early_inc_range(BB)) {
+ if (isa<BranchInst>(I))
+ if (MDNode *LoopMD = I.getMetadata(MDLoopKind))
+ translateLoopMetadata(M, &I, LoopMD);
I.dropUnknownNonDebugMetadata(DXILCompatibleMDs);
}
}
@@ -364,6 +472,16 @@ static void cleanModuleFlags(Module &M) {
M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val);
}
+using GlobalMDList = std::array<StringLiteral, 7>;
+
+// The following are compatible with DXIL but not emitted by clang; they can
+// be added when applicable:
+// dx.typeAnnotations, dx.viewIDState, dx.dxrPayloadAnnotations
+static GlobalMDList CompatibleNamedModuleMDs = {
+ "llvm.ident", "llvm.module.flags", "dx.resources", "dx.valver",
+ "dx.shaderModel", "dx.version", "dx.entryPoints",
+};
+
static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM,
DXILResourceTypeMap &DRTM,
const ModuleShaderFlags &ShaderFlags,
@@ -389,31 +507,23 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM,
uint64_t CombinedMask = ShaderFlags.getCombinedFlags();
EntryFnMDNodes.emplace_back(
emitTopLevelLibraryNode(M, ResourceMD, CombinedMask));
- } else if (MMDI.EntryPropertyVec.size() > 1) {
- M.getContext().diagnose(DiagnosticInfoTranslateMD(
- M, "Non-library shader: One and only one entry expected"));
- }
+ } else if (1 < MMDI.EntryPropertyVec.size())
+ reportError(M, "Non-library shader: One and only one entry expected");
for (const EntryProperties &EntryProp : MMDI.EntryPropertyVec) {
- const ComputedShaderFlags &EntrySFMask =
- ShaderFlags.getFunctionFlags(EntryProp.Entry);
-
- // If ShaderProfile is Library, mask is already consolidated in the
- // top-level library node. Hence it is not emitted.
uint64_t EntryShaderFlags = 0;
if (MMDI.ShaderProfile != Triple::EnvironmentType::Library) {
- EntryShaderFlags = EntrySFMask;
- if (EntryProp.ShaderStage != MMDI.ShaderProfile) {
- M.getContext().diagnose(DiagnosticInfoTranslateMD(
- M,
- "Shader stage '" +
- Twine(getShortShaderStage(EntryProp.ShaderStage) +
- "' for entry '" + Twine(EntryProp.Entry->getName()) +
- "' different from specified target profile '" +
- Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) +
- "'"))));
- }
+ EntryShaderFlags = ShaderFlags.getFunctionFlags(EntryProp.Entry);
+ if (EntryProp.ShaderStage != MMDI.ShaderProfile)
+ reportError(
+ M, "Shader stage '" +
+ Twine(getShortShaderStage(EntryProp.ShaderStage)) +
+ "' for entry '" + Twine(EntryProp.Entry->getName()) +
+ "' different from specified target profile '" +
+ Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) +
+ "'"));
}
+
EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD,
EntryShaderFlags,
MMDI.ShaderProfile));
@@ -426,19 +536,17 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM,
cleanModuleFlags(M);
- // dx.rootsignatures will have been parsed from its metadata form as its
- // binary form as part of the RootSignatureAnalysisWrapper, so safely
- // remove it as it is not recognized in DXIL
- if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures"))
- RootSignature->eraseFromParent();
+ // Finally, strip all module metadata that is not explicitly specified in the
+ // allow-list
+ SmallVector<NamedMDNode *> ToStrip;
- // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and
- // causes all tests using the DXIL Validator to fail.
- //
- // This is a temporary fix and should be replaced with a allowlist once
- // we have determined all metadata that the DXIL Validator allows
- if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa"))
- ErrNo->eraseFromParent();
+ for (NamedMDNode &NamedMD : M.named_metadata())
+ if (!NamedMD.getName().starts_with("llvm.dbg.") &&
+ !llvm::is_contained(CompatibleNamedModuleMDs, NamedMD.getName()))
+ ToStrip.push_back(&NamedMD);
+
+ for (NamedMDNode *NamedMD : ToStrip)
+ NamedMD->eraseFromParent();
}
PreservedAnalyses DXILTranslateMetadata::run(Module &M,
@@ -454,45 +562,34 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M,
return PreservedAnalyses::all();
}
-namespace {
-class DXILTranslateMetadataLegacy : public ModulePass {
-public:
- static char ID; // Pass identification, replacement for typeid
- explicit DXILTranslateMetadataLegacy() : ModulePass(ID) {}
-
- StringRef getPassName() const override { return "DXIL Translate Metadata"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DXILResourceTypeWrapperPass>();
- AU.addRequired<DXILResourceWrapperPass>();
- AU.addRequired<ShaderFlagsAnalysisWrapper>();
- AU.addRequired<DXILMetadataAnalysisWrapperPass>();
- AU.addRequired<RootSignatureAnalysisWrapper>();
-
- AU.addPreserved<DXILMetadataAnalysisWrapperPass>();
- AU.addPreserved<DXILResourceBindingWrapperPass>();
- AU.addPreserved<DXILResourceWrapperPass>();
- AU.addPreserved<RootSignatureAnalysisWrapper>();
- AU.addPreserved<ShaderFlagsAnalysisWrapper>();
- }
+void DXILTranslateMetadataLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DXILResourceTypeWrapperPass>();
+ AU.addRequired<DXILResourceWrapperPass>();
+ AU.addRequired<ShaderFlagsAnalysisWrapper>();
+ AU.addRequired<DXILMetadataAnalysisWrapperPass>();
+ AU.addRequired<RootSignatureAnalysisWrapper>();
+
+ AU.addPreserved<DXILMetadataAnalysisWrapperPass>();
+ AU.addPreserved<DXILResourceBindingWrapperPass>();
+ AU.addPreserved<DXILResourceWrapperPass>();
+ AU.addPreserved<RootSignatureAnalysisWrapper>();
+ AU.addPreserved<ShaderFlagsAnalysisWrapper>();
+}
- bool runOnModule(Module &M) override {
- DXILResourceMap &DRM =
- getAnalysis<DXILResourceWrapperPass>().getResourceMap();
- DXILResourceTypeMap &DRTM =
- getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap();
- const ModuleShaderFlags &ShaderFlags =
- getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags();
- dxil::ModuleMetadataInfo MMDI =
- getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
-
- translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI);
- translateInstructionMetadata(M);
- return true;
- }
-};
+bool DXILTranslateMetadataLegacy::runOnModule(Module &M) {
+ DXILResourceMap &DRM =
+ getAnalysis<DXILResourceWrapperPass>().getResourceMap();
+ DXILResourceTypeMap &DRTM =
+ getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap();
+ const ModuleShaderFlags &ShaderFlags =
+ getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags();
+ dxil::ModuleMetadataInfo MMDI =
+ getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
-} // namespace
+ translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI);
+ translateInstructionMetadata(M);
+ return true;
+}
char DXILTranslateMetadataLegacy::ID = 0;
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h
index 4c1ffac..cfb8aaa 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h
@@ -10,6 +10,7 @@
#define LLVM_TARGET_DIRECTX_DXILTRANSLATEMETADATA_H
#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
namespace llvm {
@@ -20,6 +21,22 @@ public:
PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
};
+/// Wrapper pass for the legacy pass manager.
+///
+/// This is required because the passes that will depend on this are codegen
+/// passes which run through the legacy pass manager.
+class DXILTranslateMetadataLegacy : public ModulePass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit DXILTranslateMetadataLegacy() : ModulePass(ID) {}
+
+ StringRef getPassName() const override { return "DXIL Translate Metadata"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnModule(Module &M) override;
+};
+
} // namespace llvm
#endif // LLVM_TARGET_DIRECTX_DXILTRANSLATEMETADATA_H
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
index 68fd3e0..60dfd96 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -55,8 +55,10 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable(
case Intrinsic::dx_splitdouble:
case Intrinsic::dx_wave_readlane:
case Intrinsic::dx_wave_reduce_max:
+ case Intrinsic::dx_wave_reduce_min:
case Intrinsic::dx_wave_reduce_sum:
case Intrinsic::dx_wave_reduce_umax:
+ case Intrinsic::dx_wave_reduce_umin:
case Intrinsic::dx_wave_reduce_usum:
case Intrinsic::dx_imad:
case Intrinsic::dx_umad:
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 54c8972..0573f64 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -1061,8 +1061,11 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
SDValue W0 = isUndef(PredV)
? DAG.getUNDEF(MVT::i64)
: DAG.getNode(HexagonISD::P2D, dl, MVT::i64, PredV);
- Words[IdxW].push_back(HiHalf(W0, DAG));
- Words[IdxW].push_back(LoHalf(W0, DAG));
+ if (Bytes < BitBytes) {
+ Words[IdxW].push_back(HiHalf(W0, DAG));
+ Words[IdxW].push_back(LoHalf(W0, DAG));
+ } else
+ Words[IdxW].push_back(W0);
while (Bytes < BitBytes) {
IdxW ^= 1;
@@ -1083,7 +1086,26 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
Bytes *= 2;
}
+ while (Bytes > BitBytes) {
+ IdxW ^= 1;
+ Words[IdxW].clear();
+
+ if (Bytes <= 4) {
+ for (const SDValue &W : Words[IdxW ^ 1]) {
+ SDValue T = contractPredicate(W, dl, DAG);
+ Words[IdxW].push_back(T);
+ }
+ } else {
+ for (const SDValue &W : Words[IdxW ^ 1]) {
+ Words[IdxW].push_back(W);
+ }
+ }
+ Bytes /= 2;
+ }
+
assert(Bytes == BitBytes);
+ if (BitBytes == 1 && PredTy == MVT::v2i1)
+ ByteTy = MVT::getVectorVT(MVT::i16, HwLen);
SDValue Vec = ZeroFill ? getZero(dl, ByteTy, DAG) : DAG.getUNDEF(ByteTy);
SDValue S4 = DAG.getConstant(HwLen-4, dl, MVT::i32);
@@ -3157,6 +3179,9 @@ SDValue
HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const {
auto *MemN = cast<MemSDNode>(Op.getNode());
+ if (!MemN->getMemoryVT().isSimple())
+ return Op;
+
MVT MemTy = MemN->getMemoryVT().getSimpleVT();
if (!isHvxPairTy(MemTy))
return Op;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 4029e14..729c077 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -493,7 +493,7 @@ NVPTXTTIImpl::getInstructionCost(const User *U,
// predicate ("@").
return !AsmInst.empty() &&
(AsmInst[0] == '@' || isAlpha(AsmInst[0]) ||
- AsmInst.find(".pragma") != StringRef::npos);
+ AsmInst.contains(".pragma"));
});
return InstCount * TargetTransformInfo::TCC_Basic;
}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 17f04d0..20fc849 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -138,6 +138,11 @@ static cl::opt<unsigned> PPCMinimumJumpTableEntries(
"ppc-min-jump-table-entries", cl::init(64), cl::Hidden,
cl::desc("Set minimum number of entries to use a jump table on PPC"));
+static cl::opt<unsigned> PPCMinimumBitTestCmps(
+ "ppc-min-bit-test-cmps", cl::init(3), cl::Hidden,
+    cl::desc("Set the minimum for the largest number of comparisons needed to "
+             "use a bit test for a switch on PPC."));
+
static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth(
"ppc-gather-alias-max-depth", cl::init(18), cl::Hidden,
cl::desc("max depth when checking alias info in GatherAllAliases()"));
@@ -1436,6 +1441,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// Re-evaluate this value on future HWs that can do better with mtctr.
setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
+  // The default minimum for the largest number of comparisons in a BitTest
+  // cluster is 3.
+ setMinimumBitTestCmps(PPCMinimumBitTestCmps);
+
setMinFunctionAlignment(Align(4));
setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index b0bed71c..da3efdc 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -194,6 +194,22 @@ class XX3Form_XTAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = XT{5};
}
+class XForm_RBS5<bits<6> opCode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opCode, OOL, IOL, asmstr, itin> {
+
+ bits<5> RB;
+ bits<5> RS;
+
+ let Pattern = pattern;
+
+ let Inst{6...10} = RS;
+ let Inst{11...15} = 0;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
+ let Inst{31} = 0;
+}
+
class XX3Form_XTAB6_S<bits<5> xo, dag OOL, dag IOL, string asmstr,
list<dag> pattern>
: I<59, OOL, IOL, asmstr, NoItinerary> {
@@ -317,12 +333,16 @@ let Predicates = [IsISAFuture] in {
def TLBIEIO
: XForm_RSB5_UIMM2<31, 18, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC),
"tlbieio $RB, $RS, $RIC", []>;
+ def MTLPL : XForm_RBS5<31, 275, (outs), (ins gprc:$RB, gprc:$RS),
+ "mtlpl $RB, $RS", IIC_SprMTSPR, []>;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
def TLBIEP8
: XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs),
(ins g8rc:$RB, g8rc:$RS, u2imm:$RIC,
u1imm:$PRS, u1imm:$R),
"tlbiep $RB, $RS, $RIC, $PRS, $R", []>;
+ def MTLPL8 : XForm_RBS5<31, 275, (outs), (ins g8rc:$RB, g8rc:$RS),
+ "mtlpl $RB, $RS", IIC_SprMTSPR, []>, isPPC64;
}
}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 41a9c92..96e8afc 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -823,6 +823,7 @@ static bool relaxableFixupNeedsRelocation(const MCFixupKind Kind) {
break;
case RISCV::fixup_riscv_rvc_jump:
case RISCV::fixup_riscv_rvc_branch:
+ case RISCV::fixup_riscv_rvc_imm:
case RISCV::fixup_riscv_jal:
return false;
}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 6d587e6..5934c91 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -688,6 +688,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
// the `jal` again in the assembler.
} else if (MIFrm == RISCVII::InstFormatCI) {
FixupKind = RISCV::fixup_riscv_rvc_imm;
+ AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcili);
} else if (MIFrm == RISCVII::InstFormatI) {
FixupKind = RISCV::fixup_riscv_12_i;
} else if (MIFrm == RISCVII::InstFormatQC_EB) {
diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
index 98b636e..9bd66a4 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
@@ -373,6 +373,26 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI,
.addReg(ScratchReg)
.addImm(-1);
break;
+ case AtomicRMWInst::Max:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::MAX), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ break;
+ case AtomicRMWInst::Min:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::MIN), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ break;
+ case AtomicRMWInst::UMax:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::MAXU), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ break;
+ case AtomicRMWInst::UMin:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::MINU), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ break;
}
BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg)
.addReg(ScratchReg)
@@ -682,6 +702,9 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width,
MachineBasicBlock::iterator &NextMBBI) {
+  // Using MIN(U)/MAX(U) is preferable if permitted
+ if (STI->hasPermissiveZalrsc() && STI->hasStdExtZbb() && !IsMasked)
+ return expandAtomicBinOp(MBB, MBBI, BinOp, IsMasked, Width, NextMBBI);
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
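
With the permissive-zalrsc feature and Zbb (see the RISCVFeatures.td hunk below), the unmasked min/max cases above expand to a plain LR / MIN(U)/MAX(U) / SC loop rather than a branch-and-select sequence. As a rough illustration only (ordinary C++, not the backend expansion), the loop computes the same result as this CAS-based fetch-max:

    #include <algorithm>
    #include <atomic>

    // Returns the previous value, like atomicrmw max; in the LR/SC expansion the
    // MAX instruction takes the place of the std::max plus the retry on conflict.
    int fetch_max(std::atomic<int> &a, int incr) {
      int old = a.load(std::memory_order_relaxed);
      while (!a.compare_exchange_weak(old, std::max(old, incr)))
        ; // compare_exchange_weak reloads 'old' when another thread intervened
      return old;
    }
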
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 2754d78..b4556f6 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1906,6 +1906,25 @@ def FeatureForcedAtomics : SubtargetFeature<
def HasAtomicLdSt
: Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">;
+// The RISC-V Unprivileged Architecture - ISA Volume 1 (Version: 20250508)
+// [https://docs.riscv.org/reference/isa/_attachments/riscv-unprivileged.pdf]
+// in section 13.3. Eventual Success of Store-Conditional Instructions, defines
+// _constrained_ LR/SC loops:
+// The dynamic code executed between the LR and SC instructions can only
+// contain instructions from the base ''I'' instruction set, excluding loads,
+// stores, backward jumps, taken backward branches, JALR, FENCE, and SYSTEM
+// instructions. Compressed forms of the aforementioned ''I'' instructions in
+// the Zca and Zcb extensions are also permitted.
+// LR/SC loops that do not adhere to the above are _unconstrained_ LR/SC loops,
+// and success is implementation specific. For implementations which know that
+// non-base instructions (such as the ''B'' extension) will not violate any
+// forward progress guarantees, using these instructions to reduce the LR/SC
+// sequence length is desirable.
+def FeaturePermissiveZalrsc
+ : SubtargetFeature<
+ "permissive-zalrsc", "HasPermissiveZalrsc", "true",
+ "Implementation permits non-base instructions between LR/SC pairs">;
+
def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
"AllowTaggedGlobals",
"true", "Use an instruction sequence for taking the address of a global "
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 6181abb..47022b3 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -745,7 +745,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable(
.addDef(ResVReg)
.addUse(getSPIRVTypeID(BaseType))
.addImm(static_cast<uint32_t>(Storage));
- if (Init != 0)
+ if (Init)
MIB.addUse(Init->getOperand(0).getReg());
// ISel may introduce a new register on this step, so we need to add it to
// DT and correct its type avoiding fails on the next stage.
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 021353a..3fea21e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -222,6 +222,9 @@ private:
bool selectWaveReduceMax(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I, bool IsUnsigned) const;
+ bool selectWaveReduceMin(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I, bool IsUnsigned) const;
+
bool selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
@@ -2456,6 +2459,35 @@ bool SPIRVInstructionSelector::selectWaveReduceMax(Register ResVReg,
.constrainAllUses(TII, TRI, RBI);
}
+bool SPIRVInstructionSelector::selectWaveReduceMin(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I,
+ bool IsUnsigned) const {
+ assert(I.getNumOperands() == 3);
+ assert(I.getOperand(2).isReg());
+ MachineBasicBlock &BB = *I.getParent();
+ Register InputRegister = I.getOperand(2).getReg();
+ SPIRVType *InputType = GR.getSPIRVTypeForVReg(InputRegister);
+
+ if (!InputType)
+ report_fatal_error("Input Type could not be determined.");
+
+ SPIRVType *IntTy = GR.getOrCreateSPIRVIntegerType(32, I, TII);
+  // Retrieve the operation to use based on the input type
+ bool IsFloatTy = GR.isScalarOrVectorOfType(InputRegister, SPIRV::OpTypeFloat);
+ auto IntegerOpcodeType =
+ IsUnsigned ? SPIRV::OpGroupNonUniformUMin : SPIRV::OpGroupNonUniformSMin;
+ auto Opcode = IsFloatTy ? SPIRV::OpGroupNonUniformFMin : IntegerOpcodeType;
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(GR.getOrCreateConstInt(SPIRV::Scope::Subgroup, I, IntTy, TII,
+ !STI.isShader()))
+ .addImm(SPIRV::GroupOperation::Reduce)
+ .addUse(I.getOperand(2).getReg())
+ .constrainAllUses(TII, TRI, RBI);
+}
+
bool SPIRVInstructionSelector::selectWaveReduceSum(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
@@ -3431,6 +3463,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ true);
case Intrinsic::spv_wave_reduce_max:
return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ false);
+ case Intrinsic::spv_wave_reduce_umin:
+ return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ true);
+ case Intrinsic::spv_wave_reduce_min:
+ return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ false);
case Intrinsic::spv_wave_reduce_sum:
return selectWaveReduceSum(ResVReg, ResType, I);
case Intrinsic::spv_wave_readlane:
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 3da720f..58109ac 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -8973,8 +8973,7 @@ SystemZTargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc,
if (const auto *CB = dyn_cast<CallBase>(RHSVal)) {
if (CB->isInlineAsm()) {
const InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
- return IA &&
- IA->getConstraintString().find("{@cc}") != std::string::npos;
+ return IA && IA->getConstraintString().contains("{@cc}");
}
}
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 410f20e..624cff2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2572,11 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Combine sin / cos into _sincos_stret if it is available.
- if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
- getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
- setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
- }
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
@@ -12216,7 +12213,7 @@ static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
: MVT::getVectorVT(ShiftSVT, Size / Scale);
- return (int)ShiftAmt;
+ return ShiftAmt;
};
// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
@@ -33067,26 +33064,30 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Arg = Op.getOperand(0);
+ EVT ArgVT = Arg.getValueType();
+ bool isF64 = ArgVT == MVT::f64;
+
+ RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
+ const char *LibcallName = TLI.getLibcallName(LC);
+ if (!LibcallName)
+ return SDValue();
+
assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
// which returns the values as { float, float } (in XMM0) or
// { double, double } (which is returned in XMM0, XMM1).
SDLoc dl(Op);
- SDValue Arg = Op.getOperand(0);
- EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
Args.emplace_back(Arg, ArgTy);
- bool isF64 = ArgVT == MVT::f64;
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
// the small struct {f32, f32} is returned in (eax, edx). For f64,
// the results are returned via SRet in memory.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
- const char *LibcallName = TLI.getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
@@ -48786,6 +48787,11 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
}
if (CC == X86::COND_E || CC == X86::COND_NE) {
+ // Canonicalize constant to RHS if we're just using ZF.
+ if (Op0 != Op1 && DAG.isConstantIntBuildVectorOrConstantInt(Op0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(Op1))
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op0);
+
// TESTZ(X,~Y) == TESTC(Y,X)
if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
@@ -48849,10 +48855,6 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
}
}
- // TESTZ(-1,X) == TESTZ(X,X)
- if (ISD::isBuildVectorAllOnes(Op0.getNode()))
- return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
-
// TESTZ(X,-1) == TESTZ(X,X)
if (ISD::isBuildVectorAllOnes(Op1.getNode()))
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
@@ -54634,6 +54636,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
SDLoc DL(N);
// Attempt to pre-truncate inputs to arithmetic ops instead.
@@ -54652,6 +54655,40 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
return V;
+ // Fold trunc(srl(load(p),amt)) -> load(p+amt/8)
+ // If we're shifting down byte aligned bit chunks from a larger load for
+ // truncation, see if we can convert the shift into a pointer offset instead.
+ // Limit this to normal (non-ext) scalar integer loads.
+ if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL &&
+ Src.hasOneUse() && Src.getOperand(0).hasOneUse() &&
+ ISD::isNormalLoad(Src.getOperand(0).getNode())) {
+ auto *Ld = cast<LoadSDNode>(Src.getOperand(0));
+ if (Ld->isSimple() && VT.isByteSized() &&
+ isPowerOf2_64(VT.getSizeInBits())) {
+ SDValue ShAmt = Src.getOperand(1);
+ KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
+ // Check the shift amount is byte aligned.
+ // Check the truncation doesn't use any shifted in (zero) top bits.
+ if (KnownAmt.countMinTrailingZeros() >= 3 &&
+ KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() -
+ VT.getSizeInBits())) {
+ EVT PtrVT = Ld->getBasePtr().getValueType();
+ SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT);
+ SDValue PtrByteOfs =
+ DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs,
+ DAG.getShiftAmountConstant(3, PtrVT, DL));
+ SDValue NewPtr = DAG.getMemBasePlusOffset(
+ Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap);
+ SDValue NewLoad =
+ DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(),
+ Align(), Ld->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1),
+ NewLoad.getValue(1));
+ return NewLoad;
+ }
+ }
+ }
+
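// A minimal sketch (plain C++, not part of the patch) of the equivalence the
// fold above relies on: on a little-endian target, with a byte-aligned shift
// amount that keeps the result in range, truncating a right-shifted wide load
// reads the same bytes as a narrow load at a byte offset.
#include <cstdint>
#include <cstring>
uint32_t via_shift(const char *p, unsigned bit_amt) {
  uint64_t wide;
  std::memcpy(&wide, p, sizeof(wide)); // load(p)
  return uint32_t(wide >> bit_amt);    // trunc(srl(load(p), amt))
}
uint32_t via_offset(const char *p, unsigned bit_amt) {
  uint32_t narrow;
  std::memcpy(&narrow, p + bit_amt / 8, sizeof(narrow)); // load(p + amt/8)
  return narrow;
}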
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 to x86mmx
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {