Diffstat (limited to 'llvm/lib/Target')
42 files changed, 6621 insertions, 459 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 8d0ff41..1396841 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -60,7 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 FunctionPass *createAArch64CollectLOHPass();
 FunctionPass *createSMEABIPass();
 FunctionPass *createSMEPeepholeOptPass();
-FunctionPass *createMachineSMEABIPass();
+FunctionPass *createMachineSMEABIPass(CodeGenOptLevel);
 ModulePass *createSVEIntrinsicOptsPass();
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index a4529a5..0f457c2 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -133,6 +133,8 @@ include "AArch64SchedNeoverseN2.td"
 include "AArch64SchedNeoverseN3.td"
 include "AArch64SchedNeoverseV1.td"
 include "AArch64SchedNeoverseV2.td"
+include "AArch64SchedNeoverseV3.td"
+include "AArch64SchedNeoverseV3AE.td"
 include "AArch64SchedOryon.td"
 
 include "AArch64Processors.td"
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index b3ec65c..2783147 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -366,6 +366,7 @@ def AArch64PostLegalizerCombiner
                        select_to_minmax, or_to_bsp, combine_concat_vector,
                        commute_constant_to_rhs, extract_vec_elt_combines,
                        push_freeze_to_prevent_poison_from_propagating,
-                       combine_mul_cmlt, combine_use_vector_truncate,
-                       extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> {
+                       combine_mul_cmlt, combine_use_vector_truncate,
+                       extmultomull, truncsat_combines, lshr_of_trunc_of_lshr,
+                       funnel_shift_from_or_shift_constants_are_legal]> {
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d16b116..60aa61e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9028,11 +9028,12 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
   CallingConv::ID CallerCC = CallerF.getCallingConv();
 
   // SME Streaming functions are not eligible for TCO as they may require
-  // the streaming mode or ZA to be restored after returning from the call.
+  // the streaming mode or ZA/ZT0 to be restored after returning from the call.
   SMECallAttrs CallAttrs =
       getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
   if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
       CallAttrs.requiresPreservingAllZAState() ||
+      CallAttrs.requiresPreservingZT0() ||
       CallAttrs.caller().hasStreamingBody())
     return false;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 457e540..ccc8eb8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -122,7 +122,7 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     NumBytes = Desc.getSize() ? Desc.getSize() : 4;
 
     const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
-    if (!MFI->shouldSignReturnAddress(MF))
+    if (!MFI->shouldSignReturnAddress(*MF))
       return NumBytes;
 
     const auto &STI = MF->getSubtarget<AArch64Subtarget>();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b9e299e..2871a20 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1805,14 +1805,22 @@ def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v8i16>;
 def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v4i32>;
 def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v2i64>;
 
-class EOR3_pattern<ValueType VecTy>
-  : Pat<(xor (xor (VecTy V128:$Vn), (VecTy V128:$Vm)), (VecTy V128:$Va)),
-        (EOR3 (VecTy V128:$Vn), (VecTy V128:$Vm), (VecTy V128:$Va))>;
-
-def : EOR3_pattern<v16i8>;
-def : EOR3_pattern<v8i16>;
-def : EOR3_pattern<v4i32>;
-def : EOR3_pattern<v2i64>;
+multiclass EOR3_pattern<ValueType Vec128Ty, ValueType Vec64Ty>{
+  def : Pat<(xor (xor (Vec128Ty V128:$Vn), (Vec128Ty V128:$Vm)), (Vec128Ty V128:$Va)),
+            (EOR3 (Vec128Ty V128:$Vn), (Vec128Ty V128:$Vm), (Vec128Ty V128:$Va))>;
+  def : Pat<(xor (xor (Vec64Ty V64:$Vn), (Vec64Ty V64:$Vm)), (Vec64Ty V64:$Va)),
+            (EXTRACT_SUBREG
+              (EOR3
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vn, dsub),
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vm, dsub),
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Va, dsub)),
+              dsub)>;
+}
+
+defm : EOR3_pattern<v16i8, v8i8>;
+defm : EOR3_pattern<v8i16, v4i16>;
+defm : EOR3_pattern<v4i32, v2i32>;
+defm : EOR3_pattern<v2i64, v1i64>;
 
 class BCAX_pattern<ValueType VecTy>
   : Pat<(xor (VecTy V128:$Vn), (and (VecTy V128:$Vm), (vnot (VecTy V128:$Va)))),
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 81f5d07..11387bb 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -1272,11 +1272,11 @@ def : ProcessorModel<"cortex-x2", NeoverseV2Model, ProcessorFeatures.X2,
                      [TuneX2]>;
 def : ProcessorModel<"cortex-x3", NeoverseV2Model, ProcessorFeatures.X3,
                      [TuneX3]>;
-def : ProcessorModel<"cortex-x4", NeoverseV2Model, ProcessorFeatures.X4,
+def : ProcessorModel<"cortex-x4", NeoverseV3Model, ProcessorFeatures.X4,
                      [TuneX4]>;
-def : ProcessorModel<"cortex-x925", NeoverseV2Model, ProcessorFeatures.X925,
+def : ProcessorModel<"cortex-x925", NeoverseV3Model, ProcessorFeatures.X925,
                      [TuneX925]>;
-def : ProcessorModel<"gb10", NeoverseV2Model, ProcessorFeatures.GB10,
+def : ProcessorModel<"gb10", NeoverseV3Model, ProcessorFeatures.GB10,
                      [TuneX925]>;
 def : ProcessorModel<"grace", NeoverseV2Model, ProcessorFeatures.Grace,
                      [TuneNeoverseV2]>;
@@ -1295,9 +1295,9 @@ def : ProcessorModel<"neoverse-v1", NeoverseV1Model,
                      ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>;
 def : ProcessorModel<"neoverse-v2", NeoverseV2Model,
                      ProcessorFeatures.NeoverseV2, [TuneNeoverseV2]>;
-def : ProcessorModel<"neoverse-v3", NeoverseV2Model,
+def : ProcessorModel<"neoverse-v3", NeoverseV3Model,
                      ProcessorFeatures.NeoverseV3, [TuneNeoverseV3]>;
-def : ProcessorModel<"neoverse-v3ae", NeoverseV2Model,
+def : ProcessorModel<"neoverse-v3ae", NeoverseV3AEModel,
                      ProcessorFeatures.NeoverseV3AE, [TuneNeoverseV3AE]>;
 def : ProcessorModel<"exynos-m3", ExynosM3Model, ProcessorFeatures.ExynosM3,
                      [TuneExynosM3]>;
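The remainder of the change is the new 2777-line Neoverse V3 scheduling model, AArch64SchedNeoverseV3.td. As a reading aid before that file, here is a minimal sketch of the TableGen pattern it repeats; all definitions are abridged copies from the diff below, the comments are editorial, and the sketch is not itself part of the patch:

    // A machine model fixes global parameters such as dispatch width.
    def NeoverseV3Model : SchedMachineModel {
      let IssueWidth = 10;   // Up to 10 micro-ops dispatched per cycle.
      let LoadLatency = 4;   // Optimistic load-to-use latency.
    }

    let SchedModel = NeoverseV3Model in {

    // Issue ports, plus groups of ports that can serve the same micro-op.
    def V3UnitV0  : ProcResource<1>;                     // FP/ASIMD 0
    def V3UnitV2  : ProcResource<1>;                     // FP/ASIMD 2
    def V3UnitV02 : ProcResGroup<[V3UnitV0, V3UnitV2]>;  // FP/ASIMD 0/2

    // A write type fixes the latency and the port(s) each micro-op occupies...
    def V3Write_4c_1V02 : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }

    // ...and InstRW maps concrete instructions onto that write type.
    def : InstRW<[V3Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>;

    // Forwarding: SchedReadAdvance<3, ...> lets a dependent MLA read the
    // previous MLA's accumulator 3 cycles early, so back-to-back MLAs
    // chain at an effective latency of 4 - 3 = 1 cycle.
    def V3Wr_VMA : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
    def V3Rd_VMA : SchedReadAdvance<3, [V3Wr_VMA]>;
    def : InstRW<[V3Wr_VMA, V3Rd_VMA], (instregex "^MLAv", "^MLSv")>;

    }

Everything in the new file follows this shape, scaled up to the full set of 21 issue ports and the per-section instruction tables of the V3 Software Optimization Guide.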
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td
new file mode 100644
index 0000000..e23576a
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td
@@ -0,0 +1,2777 @@
+//=- AArch64SchedNeoverseV3.td - NeoverseV3 Scheduling Defs --*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for the Arm Neoverse V3 processors.
+// All information is taken from the V3 Software Optimization guide:
+//
+// https://developer.arm.com/documentation/109678/300/?lang=en
+//
+//===----------------------------------------------------------------------===//
+
+def NeoverseV3Model : SchedMachineModel {
+  let IssueWidth            =  10; // Expect best value to be slightly higher than V2
+  let MicroOpBufferSize     = 320; // Entries in micro-op re-order buffer. NOTE: Copied from Neoverse-V2
+  let LoadLatency           =   4; // Optimistic load latency.
+  let MispredictPenalty     =  10; // Extra cycles for mispredicted branch. NOTE: Copied from N2.
+  let LoopMicroOpBufferSize =  16; // NOTE: Copied from Cortex-A57.
+  let CompleteModel         =   1;
+
+  list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F,
+                                                    [HasSVE2p1, HasSVEB16B16,
+                                                     HasCPA, HasCSSC]);
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Neoverse V3.
+// Instructions are first fetched and then decoded into internal macro-ops
+// (MOPs). From there, the MOPs proceed through register renaming and dispatch
+// stages. A MOP can be split into two micro-ops further down the pipeline
+// after the decode stage. Once dispatched, micro-ops wait for their operands
+// and issue out-of-order to one of twenty-one issue pipelines. Each issue
+// pipeline can accept one micro-op per cycle.
+
+let SchedModel = NeoverseV3Model in {
+
+// Define the (21) issue ports.
+def V3UnitB   : ProcResource<3>;  // Branch 0/1/2
+def V3UnitS0  : ProcResource<1>;  // Integer single-cycle 0
+def V3UnitS1  : ProcResource<1>;  // Integer single-cycle 1
+def V3UnitS2  : ProcResource<1>;  // Integer single-cycle 2
+def V3UnitS3  : ProcResource<1>;  // Integer single-cycle 3
+def V3UnitS4  : ProcResource<1>;  // Integer single-cycle 4
+def V3UnitS5  : ProcResource<1>;  // Integer single-cycle 5
+def V3UnitM0  : ProcResource<1>;  // Integer single/multicycle 0
+def V3UnitM1  : ProcResource<1>;  // Integer single/multicycle 1
+def V3UnitV0  : ProcResource<1>;  // FP/ASIMD 0
+def V3UnitV1  : ProcResource<1>;  // FP/ASIMD 1
+def V3UnitV2  : ProcResource<1>;  // FP/ASIMD 2
+def V3UnitV3  : ProcResource<1>;  // FP/ASIMD 3
+def V3UnitLS0 : ProcResource<1>;  // Load/Store 0
+def V3UnitL12 : ProcResource<2>;  // Load 1/2
+def V3UnitST1 : ProcResource<1>;  // Store 1
+def V3UnitD   : ProcResource<2>;  // Store data 0/1
+def V3UnitFlg : ProcResource<4>;  // Flags
+
+def V3UnitS   : ProcResGroup<[V3UnitS0, V3UnitS1, V3UnitS2, V3UnitS3, V3UnitS4, V3UnitS5]>;  // Integer single-cycle 0/1/2/3/4/5
+def V3UnitI   : ProcResGroup<[V3UnitS0, V3UnitS1, V3UnitS2, V3UnitS3, V3UnitS4, V3UnitS5, V3UnitM0, V3UnitM1]>;  // Integer single-cycle 0/1/2/3/4/5 and single/multicycle 0/1
+def V3UnitM   : ProcResGroup<[V3UnitM0, V3UnitM1]>;   // Integer single/multicycle 0/1
+def V3UnitLSA : ProcResGroup<[V3UnitLS0, V3UnitL12, V3UnitST1]>;  // Supergroup of L+SA
+def V3UnitL   : ProcResGroup<[V3UnitLS0, V3UnitL12]>; // Load/Store 0 and Load 1/2
+def V3UnitSA  : ProcResGroup<[V3UnitLS0, V3UnitST1]>; // Load/Store 0 and Store 1
+def V3UnitV   : ProcResGroup<[V3UnitV0, V3UnitV1, V3UnitV2, V3UnitV3]>;  // FP/ASIMD 0/1/2/3
+def V3UnitV01 : ProcResGroup<[V3UnitV0, V3UnitV1]>;   // FP/ASIMD 0/1
+def V3UnitV02 : ProcResGroup<[V3UnitV0, V3UnitV2]>;   // FP/ASIMD 0/2
+def V3UnitV13 : ProcResGroup<[V3UnitV1, V3UnitV3]>;   // FP/ASIMD 1/3
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+// NOTE: Copied from N2.
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+def : WriteRes<WriteLDHi, []> { let Latency = 4; }
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Neoverse V3.
+
+//===----------------------------------------------------------------------===//
+
+// Define generic 0 micro-op types
+def V3Write_0c : SchedWriteRes<[]> { let Latency = 0; }
+
+// Define generic 1 micro-op types
+
+def V3Write_1c_1B : SchedWriteRes<[V3UnitB]> { let Latency = 1; }
+def V3Write_1c_1F_1Flg : SchedWriteRes<[V3UnitI, V3UnitFlg]> { let Latency = 1; }
+def V3Write_1c_1I : SchedWriteRes<[V3UnitI]> { let Latency = 1; }
+def V3Write_1c_1M : SchedWriteRes<[V3UnitM]> { let Latency = 1; }
+def V3Write_1c_1SA : SchedWriteRes<[V3UnitSA]> { let Latency = 1; }
+def V3Write_2c_1M : SchedWriteRes<[V3UnitM]> { let Latency = 2; }
+def V3Write_2c_1M_1Flg : SchedWriteRes<[V3UnitM, V3UnitFlg]> { let Latency = 2; }
+def V3Write_3c_1M : SchedWriteRes<[V3UnitM]> { let Latency = 3; }
+def V3Write_2c_1M0 : SchedWriteRes<[V3UnitM0]> { let Latency = 2; }
+def V3Write_3c_1M0 : SchedWriteRes<[V3UnitM0]> { let Latency = 3; }
+def V3Write_4c_1M0 : SchedWriteRes<[V3UnitM0]> { let Latency = 4; }
+def V3Write_12c_1M0 : SchedWriteRes<[V3UnitM0]> { let Latency = 12;
+                                                  let ReleaseAtCycles = [12]; }
+def V3Write_20c_1M0 : SchedWriteRes<[V3UnitM0]> { let Latency = 20;
+                                                  let ReleaseAtCycles = [20]; }
+def V3Write_4c_1L : SchedWriteRes<[V3UnitL]> { let Latency = 4; }
+def V3Write_6c_1L : SchedWriteRes<[V3UnitL]> { let Latency = 6; }
+def V3Write_2c_1V : SchedWriteRes<[V3UnitV]> { let Latency = 2; }
+def V3Write_2c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 2; }
+def V3Write_3c_1V : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Write_3c_1V01 : SchedWriteRes<[V3UnitV01]> { let Latency = 3;
+                                                   let ReleaseAtCycles = [2]; }
+def V3Write_4c_1V : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Write_5c_1V : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Write_6c_1V : SchedWriteRes<[V3UnitV]> { let Latency = 6; }
+def V3Write_12c_1V : SchedWriteRes<[V3UnitV]> { let Latency = 12; }
+def V3Write_3c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 3; }
+def V3Write_3c_1V02 : SchedWriteRes<[V3UnitV02]> { let Latency = 3; }
+def V3Write_4c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 4; }
+def V3Write_4c_1V02 : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Write_9c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 9; }
+def V3Write_10c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 10; }
+def V3Write_8c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 8; }
+def V3Write_12c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 12;
+                                                  let ReleaseAtCycles = [11]; }
+def V3Write_13c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 13; }
+def V3Write_15c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 15; }
+def V3Write_13c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 13; }
+def V3Write_16c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 16; }
+def V3Write_16c_1V02 : SchedWriteRes<[V3UnitV02]> { let Latency = 16;
+                                                    let ReleaseAtCycles = [8]; }
+def V3Write_20c_1V0 : SchedWriteRes<[V3UnitV0]> { let Latency = 20;
+                                                  let ReleaseAtCycles = [20]; }
+def V3Write_2c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 2; }
+def V3Write_2c_1V13 : SchedWriteRes<[V3UnitV13]> { let Latency = 2; }
+def V3Write_3c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 3; }
+def V3Write_3c_1V13 : SchedWriteRes<[V3UnitV13]> { let Latency = 3; }
+def V3Write_4c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 4; }
+def V3Write_6c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 6; }
+def V3Write_10c_1V1 : SchedWriteRes<[V3UnitV1]> { let Latency = 10; }
+def V3Write_6c_1SA : SchedWriteRes<[V3UnitSA]> { let Latency = 6; }
+
+//===----------------------------------------------------------------------===//
+// Define generic 2 micro-op types
+
+def V3Write_1c_1B_1S : SchedWriteRes<[V3UnitB, V3UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_1M0_1B : SchedWriteRes<[V3UnitM0, V3UnitB]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_9c_1M0_1L : SchedWriteRes<[V3UnitM0, V3UnitL]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+}
+
+def V3Write_3c_1I_1M : SchedWriteRes<[V3UnitI, V3UnitM]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+
+def V3Write_1c_2M : SchedWriteRes<[V3UnitM, V3UnitM]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+def V3Write_3c_2M : SchedWriteRes<[V3UnitM, V3UnitM]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_2M : SchedWriteRes<[V3UnitM, V3UnitM]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_5c_1L_1I : SchedWriteRes<[V3UnitL, V3UnitI]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_1I_1L : SchedWriteRes<[V3UnitI, V3UnitL]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_7c_1I_1L : SchedWriteRes<[V3UnitI, V3UnitL]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+}
+
+def V3Write_1c_1SA_1D : SchedWriteRes<[V3UnitSA, V3UnitD]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+def V3Write_5c_1M0_1V : SchedWriteRes<[V3UnitM0, V3UnitV]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def V3Write_2c_1SA_1V01 : SchedWriteRes<[V3UnitSA, V3UnitV01]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def V3Write_2c_2V01 : SchedWriteRes<[V3UnitV01, V3UnitV01]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_1SA_1V01 : SchedWriteRes<[V3UnitSA, V3UnitV01]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_5c_1V13_1V : SchedWriteRes<[V3UnitV13, V3UnitV]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_2V0 : SchedWriteRes<[V3UnitV0, V3UnitV0]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_2V02 : SchedWriteRes<[V3UnitV02, V3UnitV02]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_2V : SchedWriteRes<[V3UnitV, V3UnitV]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_2V : SchedWriteRes<[V3UnitV, V3UnitV]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_2L : SchedWriteRes<[V3UnitL, V3UnitL]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_8c_1L_1V : SchedWriteRes<[V3UnitL, V3UnitV]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_1SA_1V : SchedWriteRes<[V3UnitSA, V3UnitV]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_3c_1M0_1M : SchedWriteRes<[V3UnitM0, V3UnitM]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_1M0_1M : SchedWriteRes<[V3UnitM0, V3UnitM]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_1c_1M0_1M : SchedWriteRes<[V3UnitM0, V3UnitM]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+def V3Write_2c_1M0_1M : SchedWriteRes<[V3UnitM0, V3UnitM]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_2V1 : SchedWriteRes<[V3UnitV1, V3UnitV1]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_5c_2V0 : SchedWriteRes<[V3UnitV0, V3UnitV0]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def V3Write_5c_2V02 : SchedWriteRes<[V3UnitV02, V3UnitV02]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def V3Write_5c_1V1_1M0 : SchedWriteRes<[V3UnitV1, V3UnitM0]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_1V1_1M0 : SchedWriteRes<[V3UnitV1, V3UnitM0]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_7c_1M0_1V02 : SchedWriteRes<[V3UnitM0, V3UnitV02]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+}
+
+def V3Write_2c_1V0_1M : SchedWriteRes<[V3UnitV0, V3UnitM]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def V3Write_3c_1V0_1M : SchedWriteRes<[V3UnitV0, V3UnitM]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_1V_1V13 : SchedWriteRes<[V3UnitV, V3UnitV13]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_1L_1M : SchedWriteRes<[V3UnitL, V3UnitM]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_1L_1I : SchedWriteRes<[V3UnitL, V3UnitI]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_2V13 : SchedWriteRes<[V3UnitV13, V3UnitV13]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_8c_1M0_1V01 : SchedWriteRes<[V3UnitM0, V3UnitV01]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 3 micro-op types
+
+def V3Write_1c_1SA_1D_1I : SchedWriteRes<[V3UnitSA, V3UnitD, V3UnitI]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+}
+
+def V3Write_2c_1SA_1V01_1I : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitI]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+}
+
+def V3Write_2c_1SA_2V01 : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitV01]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+}
+
+def V3Write_4c_1SA_2V01 : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitV01]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+}
+
+def V3Write_9c_1L_2V : SchedWriteRes<[V3UnitL, V3UnitV, V3UnitV]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+}
+
+def V3Write_4c_3V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+}
+
+def V3Write_7c_1M_1M0_1V : SchedWriteRes<[V3UnitM, V3UnitM0, V3UnitV]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+}
+
+def V3Write_2c_1SA_1I_1V01 : SchedWriteRes<[V3UnitSA, V3UnitI, V3UnitV01]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+}
+
+def V3Write_6c_3L : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+}
+
+def V3Write_6c_3V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+}
+
+def V3Write_8c_1L_2V : SchedWriteRes<[V3UnitL, V3UnitV, V3UnitV]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 4 micro-op types
+
+def V3Write_2c_1SA_2V01_1I : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitV01,
+                                            V3UnitI]> {
+  let Latency = 2;
+  let NumMicroOps = 4;
+}
+
+def V3Write_2c_2SA_2V01 : SchedWriteRes<[V3UnitSA, V3UnitSA,
+                                         V3UnitV01, V3UnitV01]> {
+  let Latency = 2;
+  let NumMicroOps = 4;
+}
+
+def V3Write_4c_2SA_2V01 : SchedWriteRes<[V3UnitSA, V3UnitSA,
+                                         V3UnitV01, V3UnitV01]> {
+  let Latency = 4;
+  let NumMicroOps = 4;
+}
+
+def V3Write_5c_1I_3L : SchedWriteRes<[V3UnitI, V3UnitL, V3UnitL, V3UnitL]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+}
+
+def V3Write_6c_4V0 : SchedWriteRes<[V3UnitV0, V3UnitV0, V3UnitV0, V3UnitV0]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+}
+
+def V3Write_8c_4V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+}
+
+def V3Write_6c_2V_2V13 : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV13,
+                                        V3UnitV13]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+}
+
+def V3Write_8c_2V_2V13 : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV13,
+                                        V3UnitV13]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+}
+
+def V3Write_6c_4V02 : SchedWriteRes<[V3UnitV02, V3UnitV02, V3UnitV02,
+                                     V3UnitV02]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+}
+
+def V3Write_6c_4V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+}
+
+def V3Write_8c_2L_2V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitV, V3UnitV]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+}
+
+def V3Write_9c_2L_2V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitV, V3UnitV]> {
+  let Latency = 9;
+  let NumMicroOps = 4;
+}
+
+def V3Write_2c_2SA_2V : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitV,
+                                       V3UnitV]> {
+  let Latency = 2;
+  let NumMicroOps = 4;
+}
+
+def V3Write_4c_2SA_2V : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitV,
+                                       V3UnitV]> {
+  let Latency = 4;
+  let NumMicroOps = 4;
+}
+
+def V3Write_8c_2M0_2V02 : SchedWriteRes<[V3UnitM0, V3UnitM0, V3UnitV02,
+                                         V3UnitV02]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+}
+
+def V3Write_8c_2V_2V1 : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV1,
+                                       V3UnitV1]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+}
+
+def V3Write_4c_2M0_2M : SchedWriteRes<[V3UnitM0, V3UnitM0, V3UnitM,
+                                       V3UnitM]> {
+  let Latency = 4;
+  let NumMicroOps = 4;
+}
+
+def V3Write_5c_2M0_2M : SchedWriteRes<[V3UnitM0, V3UnitM0, V3UnitM,
+                                       V3UnitM]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+}
+
+def V3Write_6c_2I_2L : SchedWriteRes<[V3UnitI, V3UnitI, V3UnitL, V3UnitL]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+}
+
+def V3Write_7c_4L : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL, V3UnitL]> {
+  let Latency = 7;
+  let NumMicroOps = 4;
+}
+
+def V3Write_6c_1SA_3V01 : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitV01,
+                                         V3UnitV01]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 5 micro-op types
+
+def V3Write_2c_1SA_2V01_2I : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitV01,
+                                            V3UnitI, V3UnitI]> {
+  let Latency = 2;
+  let NumMicroOps = 5;
+}
+
+def V3Write_8c_2L_3V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitV, V3UnitV,
+                                      V3UnitV]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+}
+
+def V3Write_9c_1L_4V : SchedWriteRes<[V3UnitL, V3UnitV, V3UnitV, V3UnitV,
+                                      V3UnitV]> {
+  let Latency = 9;
+  let NumMicroOps = 5;
+}
+
+def V3Write_10c_1L_4V : SchedWriteRes<[V3UnitL, V3UnitV, V3UnitV, V3UnitV,
+                                       V3UnitV]> {
+  let Latency = 10;
+  let NumMicroOps = 5;
+}
+
+def V3Write_6c_5V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV, V3UnitV,
+                                   V3UnitV]> {
+  let Latency = 6;
+  let NumMicroOps = 5;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 6 micro-op types
+
+def V3Write_8c_3L_3V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
+                                      V3UnitV, V3UnitV, V3UnitV]> {
+  let Latency = 8;
+  let NumMicroOps = 6;
+}
+
+def V3Write_9c_3L_3V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
+                                      V3UnitV, V3UnitV, V3UnitV]> {
+  let Latency = 9;
+  let NumMicroOps = 6;
+}
+
+def V3Write_9c_2L_4V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitV,
+                                      V3UnitV, V3UnitV, V3UnitV]> {
+  let Latency = 9;
+  let NumMicroOps = 6;
+}
+
+def V3Write_9c_2L_2V_2I : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitV,
+                                         V3UnitV, V3UnitI, V3UnitI]> {
+  let Latency = 9;
+  let NumMicroOps = 6;
+}
+
+def V3Write_9c_2V_4V13 : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV13,
+                                        V3UnitV13, V3UnitV13, V3UnitV13]> {
+  let Latency = 9;
+  let NumMicroOps = 6;
+}
+
+def V3Write_2c_3SA_3V : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+                                       V3UnitV, V3UnitV, V3UnitV]> {
+  let Latency = 2;
+  let NumMicroOps = 6;
+}
+
+def V3Write_4c_2SA_4V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitV01,
+                                         V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let Latency = 4;
+  let NumMicroOps = 6;
+}
+
+def V3Write_5c_2SA_4V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitV01,
+                                         V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let Latency = 5;
+  let NumMicroOps = 6;
+}
+
+def V3Write_2c_3SA_3V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+                                         V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let Latency = 2;
+  let NumMicroOps = 6;
+}
+
+def V3Write_4c_2SA_2I_2V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitI,
+                                            V3UnitI, V3UnitV01, V3UnitV01]> {
+  let Latency = 4;
+  let NumMicroOps = 6;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 7 micro-op types
+
+def V3Write_8c_3L_4V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
+                                      V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let Latency = 8;
+  let NumMicroOps = 7;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 8 micro-op types
+
+def V3Write_2c_4SA_4V : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+                                       V3UnitSA, V3UnitV, V3UnitV, V3UnitV,
+                                       V3UnitV]> {
+  let Latency = 2;
+  let NumMicroOps = 8;
+}
+
+def V3Write_2c_4SA_4V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+                                         V3UnitSA, V3UnitV01, V3UnitV01,
+                                         V3UnitV01, V3UnitV01]> {
+  let Latency = 2;
+  let NumMicroOps = 8;
+}
+
+def V3Write_6c_2SA_6V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitV01,
+                                         V3UnitV01, V3UnitV01, V3UnitV01,
+                                         V3UnitV01, V3UnitV01]> {
+  let Latency = 6;
+  let NumMicroOps = 8;
+}
+
+def V3Write_8c_4L_4V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL, V3UnitL,
+                                      V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let Latency = 8;
+  let NumMicroOps = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 9 micro-op types
+
+def V3Write_6c_3SA_6V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+                                         V3UnitV01, V3UnitV01, V3UnitV01,
+                                         V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let Latency = 6;
+  let NumMicroOps = 9;
+}
+
+def V3Write_10c_1L_8V : SchedWriteRes<[V3UnitL, V3UnitV, V3UnitV, V3UnitV,
+                                       V3UnitV, V3UnitV, V3UnitV, V3UnitV,
+                                       V3UnitV]> {
+  let Latency = 10;
+  let NumMicroOps = 9;
+}
+
+def V3Write_10c_3V_3L_3I : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV,
+                                          V3UnitL, V3UnitL, V3UnitL,
+                                          V3UnitI, V3UnitI, V3UnitI]> {
+  let Latency = 10;
+  let NumMicroOps = 9;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 10 micro-op types
+
+def V3Write_9c_6L_4V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL, V3UnitL,
+                                      V3UnitL, V3UnitL, V3UnitV, V3UnitV,
+                                      V3UnitV, V3UnitV]> {
+  let Latency = 9;
+  let NumMicroOps = 10;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 12 micro-op types
+
+def V3Write_5c_4SA_8V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+                                         V3UnitSA, V3UnitV01, V3UnitV01,
+                                         V3UnitV01, V3UnitV01, V3UnitV01,
+                                         V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let Latency = 5;
+  let NumMicroOps = 12;
+}
+
+def V3Write_9c_4L_8V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
+                                      V3UnitL, V3UnitV, V3UnitV,
+                                      V3UnitV, V3UnitV, V3UnitV,
+                                      V3UnitV, V3UnitV, V3UnitV]> {
+  let Latency = 9;
+  let NumMicroOps = 12;
+}
+
+def V3Write_10c_4L_8V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
+                                       V3UnitL, V3UnitV, V3UnitV,
+                                       V3UnitV, V3UnitV, V3UnitV,
+                                       V3UnitV, V3UnitV, V3UnitV]> {
+  let Latency = 10;
+  let NumMicroOps = 12;
+}
+
+def V3Write_4c_6SA_6V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+                                         V3UnitSA, V3UnitSA, V3UnitSA,
+                                         V3UnitV01, V3UnitV01, V3UnitV01,
+                                         V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let Latency = 4;
+  let NumMicroOps = 12;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 16 micro-op types
+
+def V3Write_7c_4SA_12V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+                                          V3UnitSA, V3UnitV01, V3UnitV01,
+                                          V3UnitV01, V3UnitV01, V3UnitV01,
+                                          V3UnitV01, V3UnitV01, V3UnitV01,
+                                          V3UnitV01, V3UnitV01, V3UnitV01,
+                                          V3UnitV01]> {
+  let Latency = 7;
+  let NumMicroOps = 16;
+}
+
+def V3Write_10c_4L_8V_4I : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
+                                          V3UnitL, V3UnitV, V3UnitV,
+                                          V3UnitV, V3UnitV, V3UnitV,
+                                          V3UnitV, V3UnitV, V3UnitV,
+                                          V3UnitI, V3UnitI, V3UnitI,
+                                          V3UnitI]> {
+  let Latency = 10;
+  let NumMicroOps = 16;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 18 micro-op types
+
+def V3Write_7c_9SA_9V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+                                         V3UnitSA, V3UnitSA, V3UnitSA,
+                                         V3UnitSA, V3UnitSA, V3UnitSA,
+                                         V3UnitV01, V3UnitV01, V3UnitV01,
+                                         V3UnitV01, V3UnitV01, V3UnitV01,
+                                         V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let Latency = 7;
+  let NumMicroOps = 18;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 27 micro-op types
+
+def V3Write_7c_9SA_9I_9V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+                                            V3UnitSA, V3UnitSA, V3UnitSA,
+                                            V3UnitSA, V3UnitSA, V3UnitSA,
+                                            V3UnitI, V3UnitI, V3UnitI,
+                                            V3UnitI, V3UnitI, V3UnitI,
+                                            V3UnitI, V3UnitI, V3UnitI,
+                                            V3UnitV01, V3UnitV01, V3UnitV01,
+                                            V3UnitV01, V3UnitV01, V3UnitV01,
+                                            V3UnitV01, V3UnitV01,
+                                            V3UnitV01]> {
+  let Latency = 7;
+  let NumMicroOps = 27;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 36 micro-op types
+
+def V3Write_11c_18SA_18V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
+                                            V3UnitSA, V3UnitSA, V3UnitSA,
+                                            V3UnitSA, V3UnitSA, V3UnitSA,
+                                            V3UnitSA, V3UnitSA, V3UnitSA,
+                                            V3UnitSA, V3UnitSA, V3UnitSA,
+                                            V3UnitSA, V3UnitSA, V3UnitSA,
+                                            V3UnitV01, V3UnitV01, V3UnitV01,
+                                            V3UnitV01, V3UnitV01, V3UnitV01,
+                                            V3UnitV01, V3UnitV01, V3UnitV01,
+                                            V3UnitV01, V3UnitV01, V3UnitV01,
+                                            V3UnitV01, V3UnitV01, V3UnitV01,
+                                            V3UnitV01, V3UnitV01,
+                                            V3UnitV01]> {
+  let Latency = 11;
+  let NumMicroOps = 36;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 54 micro-op types
+
+def V3Write_11c_18SA_18I_18V01 : SchedWriteRes<[V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitI, V3UnitI, V3UnitI,
+                                                V3UnitI, V3UnitI, V3UnitI,
+                                                V3UnitI, V3UnitI, V3UnitI,
+                                                V3UnitI, V3UnitI, V3UnitI,
+                                                V3UnitI, V3UnitI, V3UnitI,
+                                                V3UnitI, V3UnitI, V3UnitI,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01]> {
+  let Latency = 11;
+  let NumMicroOps = 54;
+}
+
+//===----------------------------------------------------------------------===//
+// Define predicate-controlled types
+
+def V3Write_ArithI : SchedWriteVariant<[
+  SchedVar<IsCheapLSL, [V3Write_1c_1I]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1M]>]>;
+
+def V3Write_ArithF : SchedWriteVariant<[
+  SchedVar<IsCheapLSL, [V3Write_1c_1F_1Flg]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1M_1Flg]>]>;
+
+def V3Write_Logical : SchedWriteVariant<[
+  SchedVar<NeoverseNoLSL, [V3Write_1c_1F_1Flg]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1M_1Flg]>]>;
+
+def V3Write_Extr : SchedWriteVariant<[
+  SchedVar<IsRORImmIdiomPred, [V3Write_1c_1I]>,
+  SchedVar<NoSchedPred, [V3Write_3c_1I_1M]>]>;
+
+def V3Write_LdrHQ : SchedWriteVariant<[
+  SchedVar<NeoverseHQForm, [V3Write_7c_1I_1L]>,
+  SchedVar<NoSchedPred, [V3Write_6c_1L]>]>;
+
+def V3Write_StrHQ : SchedWriteVariant<[
+  SchedVar<NeoverseHQForm, [V3Write_2c_1SA_1V01_1I]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1SA_1V01]>]>;
+
+def V3Write_0or1c_1I : SchedWriteVariant<[
+  SchedVar<NeoverseZeroMove, [V3Write_0c]>,
+  SchedVar<NoSchedPred, [V3Write_1c_1I]>]>;
+
+def V3Write_0or2c_1V : SchedWriteVariant<[
+  SchedVar<NeoverseZeroMove, [V3Write_0c]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1V]>]>;
+
+def V3Write_0or3c_1M0 : SchedWriteVariant<[
+  SchedVar<NeoverseZeroMove, [V3Write_0c]>,
+  SchedVar<NoSchedPred, [V3Write_3c_1M0]>]>;
+
+def V3Write_2or3c_1M : SchedWriteVariant<[
+  SchedVar<NeoversePdIsPg, [V3Write_3c_1M]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1M]>]>;
+
+def V3Write_1or2c_1M : SchedWriteVariant<[
+  SchedVar<NeoversePdIsPg, [V3Write_2c_1M]>,
+  SchedVar<NoSchedPred, [V3Write_1c_1M]>]>;
+
+def V3Write_3or4c_1M0_1M : SchedWriteVariant<[
+  SchedVar<NeoversePdIsPg, [V3Write_4c_1M0_1M]>,
+  SchedVar<NoSchedPred, [V3Write_3c_1M0_1M]>]>;
+
+def V3Write_2or3c_1V0 : SchedWriteVariant<[
+  SchedVar<NeoversePdIsPg, [V3Write_3c_1V0]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1V0]>]>;
+
+def V3Write_2or3c_1V0_1M : SchedWriteVariant<[
+  SchedVar<NeoversePdIsPg, [V3Write_3c_1V0_1M]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1V0_1M]>]>;
+
+def V3Write_IncDec : SchedWriteVariant<[
+  SchedVar<NeoverseCheapIncDec, [V3Write_1c_1I]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1M]>]>;
+
+//===----------------------------------------------------------------------===//
+// Define forwarded types
+
+// NOTE: SOG, p. 16, n. 2: Accumulator forwarding is not supported for
+// consumers of 64 bit multiply high operations?
+def V3Wr_IM : SchedWriteRes<[V3UnitM]> { let Latency = 2; }
+
+def V3Wr_FMA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_FMA : SchedReadAdvance<2, [WriteFMul, V3Wr_FMA]>;
+
+def V3Wr_VA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VA : SchedReadAdvance<3, [V3Wr_VA]>;
+
+def V3Wr_VDOT : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Rd_VDOT : SchedReadAdvance<2, [V3Wr_VDOT]>;
+
+def V3Wr_VMMA : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Rd_VMMA : SchedReadAdvance<2, [V3Wr_VMMA]>;
+
+def V3Wr_VMA : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_VMA : SchedReadAdvance<3, [V3Wr_VMA]>;
+
+def V3Wr_VMAH : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 4; }
+def V3Rd_VMAH : SchedReadAdvance<2, [V3Wr_VMAH]>;
+
+def V3Wr_VMAL : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_VMAL : SchedReadAdvance<3, [V3Wr_VMAL]>;
+
+def V3Wr_VPA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VPA : SchedReadAdvance<3, [V3Wr_VPA]>;
+
+def V3Wr_VSA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VSA : SchedReadAdvance<3, [V3Wr_VSA]>;
+
+def V3Wr_VFCMA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VFCMA : SchedReadAdvance<2, [V3Wr_VFCMA]>;
+
+def V3Wr_VFM : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Wr_VFMA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VFMA : SchedReadAdvance<2, [V3Wr_VFM, V3Wr_VFMA]>;
+
+def V3Wr_VFMAL : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VFMAL : SchedReadAdvance<2, [V3Wr_VFMAL]>;
+
+def V3Wr_VBFDOT : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_VBFDOT : SchedReadAdvance<2, [V3Wr_VBFDOT]>;
+def V3Wr_VBFMMA : SchedWriteRes<[V3UnitV]> { let Latency = 6; }
+def V3Rd_VBFMMA : SchedReadAdvance<2, [V3Wr_VBFMMA]>;
+def V3Wr_VBFMAL : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_VBFMAL : SchedReadAdvance<3, [V3Wr_VBFMAL]>;
+
+def V3Wr_CRC : SchedWriteRes<[V3UnitM0]> { let Latency = 2; }
+def V3Rd_CRC : SchedReadAdvance<1, [V3Wr_CRC]>;
+
+def V3Wr_ZA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_ZA : SchedReadAdvance<3, [V3Wr_ZA]>;
+def V3Wr_ZPA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_ZPA : SchedReadAdvance<3, [V3Wr_ZPA]>;
+def V3Wr_ZSA : SchedWriteRes<[V3UnitV13]> { let Latency = 4; }
+def V3Rd_ZSA : SchedReadAdvance<3, [V3Wr_ZSA]>;
+
+def V3Wr_ZDOTB : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Rd_ZDOTB : SchedReadAdvance<2, [V3Wr_ZDOTB]>;
+def V3Wr_ZDOTH : SchedWriteRes<[V3UnitV02]> { let Latency = 3; }
+def V3Rd_ZDOTH : SchedReadAdvance<2, [V3Wr_ZDOTH]>;
+
+// NOTE: SOG p. 43: Complex multiply-add B, H, S element size: How to reduce
+// throughput to 1 in case of forwarding?
+def V3Wr_ZCMABHS : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_ZCMABHS : SchedReadAdvance<3, [V3Wr_ZCMABHS]>;
+def V3Wr_ZCMAD : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 5; }
+def V3Rd_ZCMAD : SchedReadAdvance<2, [V3Wr_ZCMAD]>;
+
+def V3Wr_ZMMA : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Rd_ZMMA : SchedReadAdvance<2, [V3Wr_ZMMA]>;
+
+def V3Wr_ZMABHS : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_ZMABHS : SchedReadAdvance<3, [V3Wr_ZMABHS]>;
+def V3Wr_ZMAD : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 5; }
+def V3Rd_ZMAD : SchedReadAdvance<2, [V3Wr_ZMAD]>;
+
+def V3Wr_ZMAL : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_ZMAL : SchedReadAdvance<3, [V3Wr_ZMAL]>;
+
+def V3Wr_ZMASQL : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Wr_ZMASQBHS : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Wr_ZMASQD : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 5; }
+def V3Rd_ZMASQ : SchedReadAdvance<2, [V3Wr_ZMASQL, V3Wr_ZMASQBHS,
+                                      V3Wr_ZMASQD]>;
+
+def V3Wr_ZFCMA : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_ZFCMA : SchedReadAdvance<3, [V3Wr_ZFCMA]>;
+
+def V3Wr_ZFMA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_ZFMA : SchedReadAdvance<2, [V3Wr_ZFMA]>;
+
+def V3Wr_ZFMAL : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_ZFMAL : SchedReadAdvance<2, [V3Wr_ZFMAL]>;
+
+def V3Wr_ZBFDOT : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_ZBFDOT : SchedReadAdvance<2, [V3Wr_ZBFDOT]>;
+def V3Wr_ZBFMMA : SchedWriteRes<[V3UnitV]> { let Latency = 6; }
+def V3Rd_ZBFMMA : SchedReadAdvance<2, [V3Wr_ZBFMMA]>;
+def V3Wr_ZBFMAL : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_ZBFMAL : SchedReadAdvance<3, [V3Wr_ZBFMAL]>;
+
+//===----------------------------------------------------------------------===//
+// Define types with long resource cycles (rc)
+
+def V3Write_6c_1V1_5rc   : SchedWriteRes<[V3UnitV1]> { let Latency = 6;  let ReleaseAtCycles = [ 5]; }
+def V3Write_9c_1V1_2rc   : SchedWriteRes<[V3UnitV1]> { let Latency = 9;  let ReleaseAtCycles = [ 2]; }
+def V3Write_9c_1V1_4rc   : SchedWriteRes<[V3UnitV1]> { let Latency = 9;  let ReleaseAtCycles = [ 4]; }
+def V3Write_10c_1V1_9rc  : SchedWriteRes<[V3UnitV1]> { let Latency = 10; let ReleaseAtCycles = [ 9]; }
+def V3Write_11c_1V1_4rc  : SchedWriteRes<[V3UnitV1]> { let Latency = 11; let ReleaseAtCycles = [ 4]; }
+def V3Write_13c_1V1_8rc  : SchedWriteRes<[V3UnitV1]> { let Latency = 13; let ReleaseAtCycles = [8]; }
+def V3Write_14c_1V1_2rc  : SchedWriteRes<[V3UnitV1]> { let Latency = 14; let ReleaseAtCycles = [2]; }
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// §3.3 Branch instructions
+// -----------------------------------------------------------------------------
+
+// Branch, immed
+// Compare and branch
+def : SchedAlias<WriteBr, V3Write_1c_1B>;
+
+// Branch, register
+def : SchedAlias<WriteBrReg, V3Write_1c_1B>;
+
+// Branch and link, immed
+// Branch and link, register
+def : InstRW<[V3Write_1c_1B_1S], (instrs BL, BLR)>;
+
+// §3.4 Arithmetic and Logical Instructions
+// -----------------------------------------------------------------------------
+
+// ALU, basic
+def : SchedAlias<WriteI, V3Write_1c_1I>;
+
+// ALU, basic, flagset
+def : InstRW<[V3Write_1c_1F_1Flg],
+             (instregex "^(ADD|SUB)S[WX]r[ir]$",
+                        "^(ADC|SBC)S[WX]r$",
+                        "^ANDS[WX]ri$",
+                        "^(AND|BIC)S[WX]rr$")>;
+def : InstRW<[V3Write_0or1c_1I], (instregex "^MOVZ[WX]i$")>;
+
+// ALU, extend and shift
+def : SchedAlias<WriteIEReg, V3Write_2c_1M>;
+
+// Arithmetic, LSL shift, shift <= 4
+// Arithmetic, flagset, LSL shift, shift <= 4
+// Arithmetic, LSR/ASR/ROR shift or LSL shift > 4
+def : SchedAlias<WriteISReg, V3Write_ArithI>;
+def : InstRW<[V3Write_ArithF],
+             (instregex "^(ADD|SUB)S[WX]rs$")>;
+
+// Arithmetic, immediate to logical address tag
+def : InstRW<[V3Write_2c_1M], (instrs ADDG, SUBG)>;
+
+// Conditional compare
+def : InstRW<[V3Write_1c_1F_1Flg], (instregex "^CCM[NP][WX][ir]")>;
+
+// Convert floating-point condition flags
+// Flag manipulation instructions
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+
+// Insert Random Tags
+def : InstRW<[V3Write_2c_1M], (instrs IRG, IRGstack)>;
+
+// Insert Tag Mask
+// Subtract Pointer
+def : InstRW<[V3Write_1c_1I], (instrs GMI, SUBP)>;
+
+// Subtract Pointer, flagset
+def : InstRW<[V3Write_1c_1F_1Flg], (instrs SUBPS)>;
+
+// Logical, shift, no flagset
+def : InstRW<[V3Write_1c_1I], (instregex "^(AND|BIC|EON|EOR|ORN)[WX]rs$")>;
+def : InstRW<[V3Write_0or1c_1I], (instregex "^ORR[WX]rs$")>;
+
+// Logical, shift, flagset
+def : InstRW<[V3Write_Logical], (instregex "^(AND|BIC)S[WX]rs$")>;
+
+// Move and shift instructions
+// -----------------------------------------------------------------------------
+
+def : SchedAlias<WriteImm, V3Write_1c_1I>;
+
+// §3.5 Divide and multiply instructions
+// -----------------------------------------------------------------------------
+
+// SDIV, UDIV
+def : SchedAlias<WriteID32, V3Write_12c_1M0>;
+def : SchedAlias<WriteID64, V3Write_20c_1M0>;
+
+def : SchedAlias<WriteIM32, V3Write_2c_1M>;
+def : SchedAlias<WriteIM64, V3Write_2c_1M>;
+
+// Multiply
+// Multiply accumulate, W-form
+// Multiply accumulate, X-form
+def : InstRW<[V3Wr_IM], (instregex "^M(ADD|SUB)[WX]rrr$")>;
+
+// Multiply accumulate long
+// Multiply long
+def : InstRW<[V3Wr_IM], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+
+// Multiply high
+def : InstRW<[V3Write_3c_1M], (instrs SMULHrr, UMULHrr)>;
+
+// §3.6 Pointer Authentication Instructions (v8.3 PAC)
+// -----------------------------------------------------------------------------
+
+// Authenticate data address
+// Authenticate instruction address
+// Compute pointer authentication code for data address
+// Compute pointer authentication code, using generic key
+// Compute pointer authentication code for instruction address
+def : InstRW<[V3Write_4c_1M0], (instregex "^AUT", "^PAC")>;
+
+// Branch and link, register, with pointer authentication
+// Branch, register, with pointer authentication
+// Branch, return, with pointer authentication
+def : InstRW<[V3Write_6c_1M0_1B], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
+                                          BRAAZ, BRAB, BRABZ, RETAA, RETAB,
+                                          ERETAA, ERETAB)>;
+
+
+// Load register, with pointer authentication
+def : InstRW<[V3Write_9c_1M0_1L], (instregex "^LDRA[AB](indexed|writeback)")>;
+
+// Strip pointer authentication code
+def : InstRW<[V3Write_2c_1M0], (instrs XPACD, XPACI, XPACLRI)>;
+
+// §3.7 Miscellaneous data-processing instructions
+// -----------------------------------------------------------------------------
+
+// Address generation
+def : InstRW<[V3Write_1c_1I], (instrs ADR, ADRP)>;
+
+// Bitfield extract, one reg
+// Bitfield extract, two regs
+def : SchedAlias<WriteExtr, V3Write_Extr>;
+def : InstRW<[V3Write_Extr], (instrs EXTRWrri, EXTRXrri)>;
+
+// Bitfield move, basic
+def : SchedAlias<WriteIS, V3Write_1c_1I>;
+
+// Bitfield move, insert
+def : InstRW<[V3Write_2c_1M], (instregex "^BFM[WX]ri$")>;
+
+// §3.8 Load instructions
+// -----------------------------------------------------------------------------
+
+// NOTE: SOG p. 19: Throughput of LDN?P X-form should be 2, but reported as 3.
+
+def : SchedAlias<WriteLD, V3Write_4c_1L>;
+def : SchedAlias<WriteLDIdx, V3Write_4c_1L>;
+
+// Load register, literal
+def : InstRW<[V3Write_5c_1L_1I], (instrs LDRWl, LDRXl, LDRSWl, PRFMl)>;
+
+// Load pair, signed immed offset, signed words
+def : InstRW<[V3Write_5c_1I_3L, WriteLDHi], (instrs LDPSWi)>;
+
+// Load pair, immed post-index or immed pre-index, signed words
+def : InstRW<[WriteAdr, V3Write_5c_1I_3L, WriteLDHi],
+             (instregex "^LDPSW(post|pre)$")>;
+
+// §3.9 Store instructions
+// -----------------------------------------------------------------------------
+
+// NOTE: SOG, p. 20: Unsure if STRH uses pipeline I.
+
+def : SchedAlias<WriteST, V3Write_1c_1SA_1D>;
+def : SchedAlias<WriteSTIdx, V3Write_1c_1SA_1D>;
+def : SchedAlias<WriteSTP, V3Write_1c_1SA_1D>;
+def : SchedAlias<WriteAdr, V3Write_1c_1I>;
+
+// §3.10 Tag load instructions
+// -----------------------------------------------------------------------------
+
+// Load allocation tag
+// Load multiple allocation tags
+def : InstRW<[V3Write_4c_1L], (instrs LDG, LDGM)>;
+
+// §3.11 Tag store instructions
+// -----------------------------------------------------------------------------
+
+// Store allocation tags to one or two granules, post-index
+// Store allocation tags to one or two granules, pre-index
+// Store allocation tag to one or two granules, zeroing, post-index
+// Store Allocation Tag to one or two granules, zeroing, pre-index
+// Store allocation tag and reg pair to memory, post-Index
+// Store allocation tag and reg pair to memory, pre-Index
+def : InstRW<[V3Write_1c_1SA_1D_1I], (instrs STGPreIndex, STGPostIndex,
+                                            ST2GPreIndex, ST2GPostIndex,
+                                            STZGPreIndex, STZGPostIndex,
+                                            STZ2GPreIndex, STZ2GPostIndex,
+                                            STGPpre, STGPpost)>;
+
+// Store allocation tags to one or two granules, signed offset
+// Store allocation tag to two granules, zeroing, signed offset
+// Store allocation tag and reg pair to memory, signed offset
+// Store multiple allocation tags
+def : InstRW<[V3Write_1c_1SA_1D], (instrs STGi, ST2Gi, STZGi,
+                                          STZ2Gi, STGPi, STGM, STZGM)>;
+
+// §3.12 FP data processing instructions
+// -----------------------------------------------------------------------------
+
+// FP absolute value
+// FP arithmetic
+// FP min/max
+// FP negate
+// FP select
+def : SchedAlias<WriteF, V3Write_2c_1V>;
+
+// FP compare
+def : SchedAlias<WriteFCmp, V3Write_2c_1V0>;
+
+// FP divide, square root
+def : SchedAlias<WriteFDiv, V3Write_6c_1V1>;
+
+// FP divide, H-form
+def : InstRW<[V3Write_6c_1V1], (instrs FDIVHrr)>;
+// FP divide, S-form
+def : InstRW<[V3Write_8c_1V1], (instrs FDIVSrr)>;
+// FP divide, D-form
+def : InstRW<[V3Write_13c_1V1], (instrs FDIVDrr)>;
+
+// FP square root, H-form
+def : InstRW<[V3Write_6c_1V1], (instrs FSQRTHr)>;
+// FP square root, S-form
+def : InstRW<[V3Write_8c_1V1], (instrs FSQRTSr)>;
+// FP square root, D-form
+def : InstRW<[V3Write_13c_1V1], (instrs FSQRTDr)>;
+
+// FP multiply
+def : WriteRes<WriteFMul, [V3UnitV]> { let Latency = 3; }
+
+// FP multiply accumulate
+def : InstRW<[V3Wr_FMA, ReadDefault, ReadDefault, V3Rd_FMA],
+             (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+
+// FP round to integral
+def : InstRW<[V3Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$",
+                                           "^FRINT(32|64)[XZ][SD]r$")>;
+
+// §3.13 FP miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// FP convert, from gen to vec reg
+def : InstRW<[V3Write_3c_1M0], (instregex "^[SU]CVTF[SU][WX][HSD]ri$")>;
+
+// FP convert, from vec to gen reg
+def : InstRW<[V3Write_3c_1V01],
+             (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]ri?$")>;
+
+// FP convert, Javascript from vec to gen reg
+def : SchedAlias<WriteFCvt, V3Write_3c_1V0>;
+
+// FP convert, from vec to vec reg
+def : InstRW<[V3Write_3c_1V02], (instrs FCVTSHr, FCVTDHr, FCVTHSr, FCVTDSr,
+                                        FCVTHDr, FCVTSDr, FCVTXNv1i64)>;
+
+// FP move, immed
+// FP move, register
+def : SchedAlias<WriteFImm, V3Write_2c_1V>;
+
+// FP transfer, from gen to low half of vec reg
+def : InstRW<[V3Write_0or3c_1M0],
+             (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr)>;
+
+// FP transfer, from gen to high half of vec reg
+def : InstRW<[V3Write_5c_1M0_1V], (instrs FMOVXDHighr)>;
+
+// FP transfer, from vec to gen reg
+def : SchedAlias<WriteFCopy, V3Write_2c_2V01>;
+
+// §3.14 FP load instructions
+// -----------------------------------------------------------------------------
+
+// Load vector reg, literal, S/D/Q forms
+def : InstRW<[V3Write_7c_1I_1L], (instregex "^LDR[SDQ]l$")>;
+
+// Load vector reg, unscaled immed
+def : InstRW<[V3Write_6c_1L], (instregex "^LDUR[BHSDQ]i$")>;
+
+// Load vector reg, immed post-index
+// Load vector reg, immed pre-index
+def : InstRW<[WriteAdr, V3Write_6c_1I_1L],
+             (instregex "^LDR[BHSDQ](pre|post)$")>;
+
+// Load vector reg, unsigned immed
+def : InstRW<[V3Write_6c_1L], (instregex "^LDR[BHSDQ]ui$")>;
+
+// Load vector reg, register offset, basic
+// Load vector reg, register offset, scale, S/D-form
+// Load vector reg, register offset, scale, H/Q-form
+// Load vector reg, register offset, extend
+// Load vector reg, register offset, extend, scale, S/D-form
+// Load vector reg, register offset, extend, scale, H/Q-form
+def : InstRW<[V3Write_LdrHQ, ReadAdrBase], (instregex "^LDR[BHSDQ]ro[WX]$")>;
+
+// Load vector pair, immed offset, S/D-form
+def : InstRW<[V3Write_6c_1L, WriteLDHi], (instregex "^LDN?P[SD]i$")>;
+
+// Load vector pair, immed offset, Q-form
+def : InstRW<[V3Write_6c_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>;
+
+// Load vector pair, immed post-index, S/D-form
+// Load vector pair, immed pre-index, S/D-form
+def : InstRW<[WriteAdr, V3Write_6c_1I_1L, WriteLDHi],
+             (instregex "^LDP[SD](pre|post)$")>;
+
+// Load vector pair, immed post-index, Q-form
+// Load vector pair, immed pre-index, Q-form
+def : InstRW<[WriteAdr, V3Write_6c_2I_2L, WriteLDHi], (instrs LDPQpost,
+                                                              LDPQpre)>;
+
+// §3.15 FP store instructions
+// -----------------------------------------------------------------------------
+
+// Store vector reg, unscaled immed, B/H/S/D-form
+// Store vector reg, unscaled immed, Q-form
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STUR[BHSDQ]i$")>;
+
+// Store vector reg, immed post-index, B/H/S/D-form
+// Store vector reg, immed post-index, Q-form
+// Store vector reg, immed pre-index, B/H/S/D-form
+// Store vector reg, immed pre-index, Q-form
+def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01_1I],
+             (instregex "^STR[BHSDQ](pre|post)$")>;
+
+// Store vector reg, unsigned immed, B/H/S/D-form
+// Store vector reg, unsigned immed, Q-form
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STR[BHSDQ]ui$")>;
+
+// Store vector reg, register offset, basic, B/H/S/D-form
+// Store vector reg, register offset, basic, Q-form
+// Store vector reg, register offset, scale, H-form
+// Store vector reg, register offset, scale, S/D-form
+// Store vector reg, register offset, scale, Q-form
+// Store vector reg, register offset, extend, B/H/S/D-form
+// Store vector reg, register offset, extend, Q-form
+// Store vector reg, register offset, extend, scale, H-form
+// Store vector reg, register offset, extend, scale, S/D-form
+// Store vector reg, register offset, extend, scale, Q-form
+def : InstRW<[V3Write_StrHQ, ReadAdrBase],
+             (instregex "^STR[BHSDQ]ro[WX]$")>;
+
+// Store vector pair, immed offset, S-form
+// Store vector pair, immed offset, D-form
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STN?P[SD]i$")>;
+
+// Store vector pair, immed offset, Q-form
+def : InstRW<[V3Write_2c_1SA_2V01], (instrs STPQi, STNPQi)>;
+
+// Store vector pair, immed post-index, S-form
+// Store vector pair, immed post-index, D-form
+// Store vector pair, immed pre-index, S-form
+// Store vector pair, immed pre-index, D-form
+def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01_1I],
+             (instregex "^STP[SD](pre|post)$")>;
+
+// Store vector pair, immed post-index, Q-form
+def : InstRW<[V3Write_2c_1SA_2V01_1I], (instrs STPQpost)>;
+
+// Store vector pair, immed pre-index, Q-form
+def : InstRW<[V3Write_2c_1SA_2V01_2I], (instrs STPQpre)>;
+
+// §3.16 ASIMD integer instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD absolute diff
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD arith, pair-wise
+// ASIMD compare
+// ASIMD logical
+// ASIMD max/min, basic and pair-wise
+def : SchedAlias<WriteVd, V3Write_2c_1V>;
+def : SchedAlias<WriteVq, V3Write_2c_1V>;
+
+// ASIMD absolute diff accum
+// ASIMD absolute diff accum long
+def : InstRW<[V3Wr_VA, V3Rd_VA], (instregex "^[SU]ABAL?v")>;
+
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[V3Write_3c_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
+
+// ASIMD arith, reduce, 8B/8H
+def : InstRW<[V3Write_5c_1V13_1V],
+             (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>;
+
+// ASIMD arith, reduce, 16B
+def : InstRW<[V3Write_6c_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>;
+
+// ASIMD dot product
+// ASIMD dot product using signed and unsigned integers
+def : InstRW<[V3Wr_VDOT, V3Rd_VDOT],
+             (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
+
+// ASIMD matrix multiply-accumulate
+def : InstRW<[V3Wr_VMMA, V3Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
+
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[V3Write_3c_1V13], (instregex "^[SU](MAX|MIN)Vv4i16v$",
+                                           "^[SU](MAX|MIN)Vv4i32v$")>;
+
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[V3Write_5c_1V13_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
+                                              "^[SU](MAX|MIN)Vv8i16v$")>;
+
+// ASIMD max/min, reduce, 16B
+def : InstRW<[V3Write_6c_2V13], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
+
+// ASIMD multiply
+def : InstRW<[V3Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>;
+
+// ASIMD multiply accumulate
+def : InstRW<[V3Wr_VMA, V3Rd_VMA], (instregex "^MLAv", "^MLSv")>;
+
+// ASIMD multiply accumulate high
+def : InstRW<[V3Wr_VMAH, V3Rd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
+
+// ASIMD multiply accumulate long
+def : InstRW<[V3Wr_VMAL, V3Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
+
+// ASIMD multiply accumulate saturating long
+def : InstRW<[V3Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>;
+
+// ASIMD multiply/multiply long (8x8) polynomial, D-form
+// ASIMD multiply/multiply long (8x8) polynomial, Q-form
+def : InstRW<[V3Write_3c_1V], (instregex "^PMULL?(v8i8|v16i8)$")>;
+
+// ASIMD multiply long
+def : InstRW<[V3Write_3c_1V02], (instregex "^[SU]MULLv", "^SQDMULL[iv]")>;
+
+// ASIMD pairwise add and accumulate long
+def : InstRW<[V3Wr_VPA, V3Rd_VPA], (instregex "^[SU]ADALPv")>;
+
+// ASIMD shift accumulate
+def : InstRW<[V3Wr_VSA, V3Rd_VSA], (instregex "^[SU]SRA[dv]", "^[SU]RSRA[dv]")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[V3Write_2c_1V], (instregex "^SHL[dv]", "^SHLLv", "^SHRNv",
+                                         "^SSHLLv", "^SSHR[dv]", "^USHLLv",
+                                         "^USHR[dv]")>;
+
+// ASIMD shift by immed and insert, basic
+def : InstRW<[V3Write_2c_1V], (instregex "^SLI[dv]", "^SRI[dv]")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[V3Write_4c_1V],
+             (instregex "^RSHRNv", "^SQRSHRU?N[bhsv]", "^(SQSHLU?|UQSHL)[bhsd]$",
+                        "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
+                        "^SQSHRU?N[bhsv]", "^SRSHR[dv]", "^UQRSHRN[bhsv]",
+                        "^UQSHRN[bhsv]", "^URSHR[dv]")>;
+
+// ASIMD shift by register, basic
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU]SHLv")>;
+
+// ASIMD shift by register, complex
+def : InstRW<[V3Write_4c_1V],
+             (instregex "^[SU]RSHLv", "^[SU]QRSHLv",
+                        "^[SU]QSHL(v1i8|v1i16|v1i32|v1i64|v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)$")>;
+
+// §3.17 ASIMD floating-point instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD FP absolute value/difference
+// ASIMD FP arith, normal
+// ASIMD FP compare
+// ASIMD FP complex add
+// ASIMD FP max/min, normal
+// ASIMD FP max/min, pairwise
+// ASIMD FP negate
+// Handled by SchedAlias<WriteV[dq], ...>
+
+// ASIMD FP complex multiply add
+def : InstRW<[V3Wr_VFCMA, V3Rd_VFCMA], (instregex "^FCMLAv")>;
+
+// ASIMD FP convert, long (F16 to F32)
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVTL(v4|v8)i16")>;
+
+// ASIMD FP convert, long (F32 to F64)
+def : InstRW<[V3Write_3c_1V02], (instregex "^FCVTL(v2|v4)i32")>;
+
+// ASIMD FP convert, narrow (F32 to F16)
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVTN(v4|v8)i16")>;
+
+// ASIMD FP convert, narrow (F64 to F32)
+def : InstRW<[V3Write_3c_1V02], (instregex "^FCVTN(v2|v4)i32",
+                                           "^FCVTXN(v2|v4)f32")>;
+
+// ASIMD FP convert, other, D-form F32 and Q-form F64
+def : InstRW<[V3Write_3c_1V02], (instregex "^FCVT[AMNPZ][SU]v2f(32|64)$",
+                                           "^FCVT[AMNPZ][SU]v2i(32|64)_shift$",
+                                           "^FCVT[AMNPZ][SU]v1i64$",
+                                           "^FCVTZ[SU]d$",
+                                           "^[SU]CVTFv2f(32|64)$",
+                                           "^[SU]CVTFv2i(32|64)_shift$",
+                                           "^[SU]CVTFv1i64$",
+                                           "^[SU]CVTFd$")>;
+
+// ASIMD FP convert, other, D-form F16 and Q-form F32
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVT[AMNPZ][SU]v4f(16|32)$",
+                                           "^FCVT[AMNPZ][SU]v4i(16|32)_shift$",
+                                           "^FCVT[AMNPZ][SU]v1i32$",
+                                           "^FCVTZ[SU]s$",
+                                           "^[SU]CVTFv4f(16|32)$",
+                                           "^[SU]CVTFv4i(16|32)_shift$",
+                                           "^[SU]CVTFv1i32$",
+                                           "^[SU]CVTFs$")>;
+
+// ASIMD FP convert, other, Q-form F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FCVT[AMNPZ][SU]v8f16$",
+                                           "^FCVT[AMNPZ][SU]v8i16_shift$",
+                                           "^FCVT[AMNPZ][SU]v1f16$",
+                                           "^FCVTZ[SU]h$",
+                                           "^[SU]CVTFv8f16$",
+                                           "^[SU]CVTFv8i16_shift$",
+                                           "^[SU]CVTFv1i16$",
+                                           "^[SU]CVTFh$")>;
+
+// ASIMD FP divide, D-form, F16
+def : InstRW<[V3Write_9c_1V1_4rc], (instrs FDIVv4f16)>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[V3Write_9c_1V1_2rc], (instrs FDIVv2f32)>;
+
+// ASIMD FP divide, Q-form, F16
+def : InstRW<[V3Write_13c_1V1_8rc], (instrs FDIVv8f16)>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[V3Write_11c_1V1_4rc], (instrs FDIVv4f32)>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[V3Write_14c_1V1_2rc], (instrs FDIVv2f64)>;
+
+// ASIMD FP max/min, reduce, F32 and D-form F16
+def : InstRW<[V3Write_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
+
+// ASIMD FP max/min, reduce, Q-form F16
+def : InstRW<[V3Write_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
+
+// ASIMD FP multiply
+def : InstRW<[V3Wr_VFM], (instregex "^FMULv", "^FMULXv")>;
+
+// ASIMD FP multiply accumulate
+def : InstRW<[V3Wr_VFMA, V3Rd_VFMA], (instregex "^FMLAv", "^FMLSv")>;
+
+// ASIMD FP multiply accumulate long
+def : InstRW<[V3Wr_VFMAL, V3Rd_VFMAL], (instregex "^FML[AS]L2?(lane)?v")>;
+
+// ASIMD FP round, D-form F32 and Q-form F64
+def : InstRW<[V3Write_3c_1V02],
+             (instregex "^FRINT[AIMNPXZ]v2f(32|64)$",
+                        "^FRINT(32|64)[XZ]v2f(32|64)$")>;
+
+// ASIMD FP round, D-form F16 and Q-form F32
+def : InstRW<[V3Write_4c_2V02],
+             (instregex "^FRINT[AIMNPXZ]v4f(16|32)$",
+                        "^FRINT(32|64)[XZ]v4f32$")>;
+
+// ASIMD FP round, Q-form F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
+
+// ASIMD FP square root, D-form, F16
+def : InstRW<[V3Write_9c_1V1_4rc], (instrs FSQRTv4f16)>;
+
+// ASIMD FP square root, D-form, F32
+def : InstRW<[V3Write_9c_1V1_2rc], (instrs FSQRTv2f32)>;
+
+// ASIMD FP square root, Q-form, F16
+def : InstRW<[V3Write_13c_1V1_8rc], (instrs FSQRTv8f16)>;
+
+// ASIMD FP square root, Q-form, F32
+def : InstRW<[V3Write_11c_1V1_4rc], (instrs FSQRTv4f32)>;
+
+// ASIMD FP square root, Q-form, F64
+def : InstRW<[V3Write_14c_1V1_2rc], (instrs FSQRTv2f64)>;
+
+// §3.18 ASIMD BFloat16 (BF16) instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD convert, F32 to BF16
+def : InstRW<[V3Write_4c_2V02], (instrs BFCVTN, BFCVTN2)>;
+
+// ASIMD dot product
+def : InstRW<[V3Wr_VBFDOT, V3Rd_VBFDOT], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
+
+// ASIMD matrix multiply accumulate
+def : InstRW<[V3Wr_VBFMMA, V3Rd_VBFMMA], (instrs BFMMLA)>;
+
+// ASIMD multiply accumulate long
+def : InstRW<[V3Wr_VBFMAL, V3Rd_VBFMAL], (instrs BFMLALB, BFMLALBIdx, BFMLALT,
+                                                 BFMLALTIdx)>;
+
+// Scalar convert, F32 to BF16
+def : InstRW<[V3Write_3c_1V02], (instrs BFCVT)>;
+
+// §3.19 ASIMD miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD bit reverse
+// ASIMD bitwise insert
+// ASIMD count
+// ASIMD duplicate, element
+// ASIMD extract
+// ASIMD extract narrow
+// ASIMD insert, element to element
+// ASIMD move, FP immed
+// ASIMD move, integer immed
+// ASIMD reverse
+// ASIMD table lookup extension, 1 table reg
+// ASIMD transpose
+// ASIMD unzip/zip
+// Handled by SchedAlias<WriteV[dq], ...>
+def : InstRW<[V3Write_0or2c_1V], (instrs MOVID, MOVIv2d_ns)>;
+
+// ASIMD duplicate, gen reg
+def : InstRW<[V3Write_3c_1M0], (instregex "^DUPv.+gpr")>;
+
+// ASIMD extract narrow, saturating
+def : InstRW<[V3Write_4c_1V], (instregex "^[SU]QXTNv", "^SQXTUNv")>;
+
+// ASIMD reciprocal and square root estimate, D-form U32
+def : InstRW<[V3Write_3c_1V02], (instrs URECPEv2i32, URSQRTEv2i32)>;
+
+// ASIMD reciprocal and square root estimate, Q-form U32
+def : InstRW<[V3Write_4c_2V02], (instrs URECPEv4i32, URSQRTEv4i32)>;
+
+// ASIMD reciprocal and square root estimate, D-form F32 and scalar forms
+def : InstRW<[V3Write_3c_1V02], (instrs FRECPEv1f16, FRECPEv1i32,
+                                        FRECPEv1i64, FRECPEv2f32,
+                                        FRSQRTEv1f16, FRSQRTEv1i32,
+                                        FRSQRTEv1i64, FRSQRTEv2f32)>;
+
+// ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32
+def : InstRW<[V3Write_4c_2V02], (instrs FRECPEv4f16, FRECPEv4f32,
+                                        FRSQRTEv4f16, FRSQRTEv4f32)>;
+
+// ASIMD reciprocal and square root estimate, Q-form F16
+def : InstRW<[V3Write_6c_4V02], (instrs FRECPEv8f16, FRSQRTEv8f16)>;
+
+// ASIMD reciprocal exponent
+def : InstRW<[V3Write_3c_1V02], (instregex "^FRECPXv")>;
+
+// ASIMD reciprocal step
+def : InstRW<[V3Write_4c_1V], (instregex "^FRECPS(32|64|v)",
+                                         "^FRSQRTS(32|64|v)")>;
+
+// ASIMD table lookup, 1 or 2 table regs
+def : InstRW<[V3Write_2c_1V], (instrs TBLv8i8One, TBLv16i8One,
+                                      TBLv8i8Two, TBLv16i8Two)>;
+
+// ASIMD table lookup, 3 table regs
+def : InstRW<[V3Write_4c_2V], (instrs TBLv8i8Three, TBLv16i8Three)>;
+
+// ASIMD table lookup, 4 table regs
+def : InstRW<[V3Write_4c_3V], (instrs TBLv8i8Four, TBLv16i8Four)>;
+
+// ASIMD table lookup extension, 2 table reg
+def : InstRW<[V3Write_4c_2V], (instrs TBXv8i8Two, TBXv16i8Two)>;
+
+// ASIMD table lookup extension, 3 table reg
+def : InstRW<[V3Write_6c_3V], (instrs TBXv8i8Three, TBXv16i8Three)>;
+
+// ASIMD table lookup extension, 4 table reg
+def : InstRW<[V3Write_6c_5V], (instrs TBXv8i8Four, TBXv16i8Four)>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[V3Write_2c_2V01], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, gen reg to element
+def : InstRW<[V3Write_5c_1M0_1V], (instregex "^INSvi(8|16|32|64)gpr$")>;
+
+// §3.20 ASIMD load instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_1L],
+             (instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_1L],
+             (instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+def : InstRW<[V3Write_6c_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_2L],
+             (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[V3Write_6c_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_2L],
+             (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+def : InstRW<[V3Write_6c_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_3L],
+             (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[V3Write_6c_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_3L],
+             (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+def : InstRW<[V3Write_7c_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_7c_4L],
+             (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[V3Write_7c_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_7c_4L],
+             (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[V3Write_8c_1L_1V], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_1V], (instregex "LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+def : InstRW<[V3Write_8c_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[V3Write_8c_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_8c_1L_2V], (instregex
"LD2Twov(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, V3Write_8c_1L_2V], (instregex "LD2Twov(8b|4h|2s)_POST$")>; + +// ASIMD load, 2 element, multiple, Q-form, B/H/S +// ASIMD load, 2 element, multiple, Q-form, D +def : InstRW<[V3Write_8c_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3Write_8c_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H +// ASIMD load, 2 element, one lane, S +// ASIMD load, 2 element, one lane, D +def : InstRW<[V3Write_8c_1L_2V], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3Write_8c_1L_2V], (instregex "LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +def : InstRW<[V3Write_8c_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3Write_8c_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[V3Write_8c_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3Write_8c_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +def : InstRW<[V3Write_8c_2L_3V], (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, V3Write_8c_2L_3V], (instregex "LD3Threev(8b|4h|2s)_POST$")>; + +// ASIMD load, 3 element, multiple, Q-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[V3Write_8c_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3Write_8c_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lane, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[V3Write_8c_2L_3V], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3Write_8c_2L_3V], (instregex "LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, D-form, D +def : InstRW<[V3Write_8c_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3Write_8c_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[V3Write_8c_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3Write_8c_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +def : InstRW<[V3Write_8c_3L_4V], (instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, V3Write_8c_3L_4V], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; + +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[V3Write_9c_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3Write_9c_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[V3Write_8c_3L_4V], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3Write_8c_3L_4V], (instregex "LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +def : InstRW<[V3Write_8c_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3Write_8c_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[V3Write_8c_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)$")>; +def : 
InstRW<[WriteAdr, V3Write_8c_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>; + +// §3.21 ASIMD store instructions +// ----------------------------------------------------------------------------- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +def : InstRW<[V3Write_2c_1SA_1V01], (instregex "ST1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[V3Write_2c_1SA_1V01], (instregex "ST1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +def : InstRW<[V3Write_2c_1SA_1V01], (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[V3Write_2c_2SA_2V01], (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3Write_2c_2SA_2V01], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +def : InstRW<[V3Write_2c_2SA_2V01], (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3Write_2c_2SA_2V01], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[V3Write_2c_3SA_3V01], (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3Write_2c_3SA_3V01], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +def : InstRW<[V3Write_2c_2SA_2V01], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3Write_2c_2SA_2V01], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, Q-form +def : InstRW<[V3Write_2c_4SA_4V01], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3Write_2c_4SA_4V01], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[V3Write_4c_1SA_2V01], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3Write_4c_1SA_2V01], (instregex "ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +def : InstRW<[V3Write_4c_1SA_2V01], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, V3Write_4c_1SA_2V01], (instregex "ST2Twov(8b|4h|2s)_POST$")>; + +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[V3Write_4c_2SA_4V01], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3Write_4c_2SA_4V01], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[V3Write_4c_1SA_2V01], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3Write_4c_1SA_2V01], (instregex "ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +def : InstRW<[V3Write_5c_2SA_4V01], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, V3Write_5c_2SA_4V01], (instregex "ST3Threev(8b|4h|2s)_POST$")>; + +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[V3Write_6c_3SA_6V01], (instregex "ST3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3Write_6c_3SA_6V01], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D 
+def : InstRW<[V3Write_5c_2SA_4V01], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3Write_5c_2SA_4V01], (instregex "ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +def : InstRW<[V3Write_6c_2SA_6V01], (instregex "ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, V3Write_6c_2SA_6V01], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; + +// ASIMD store, 4 element, multiple, Q-form, B/H/S +def : InstRW<[V3Write_7c_4SA_12V01], (instregex "ST4Fourv(16b|8h|4s)$")>; +def : InstRW<[WriteAdr, V3Write_7c_4SA_12V01], (instregex "ST4Fourv(16b|8h|4s)_POST$")>; + +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[V3Write_5c_4SA_8V01], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[WriteAdr, V3Write_5c_4SA_8V01], (instregex "ST4Fourv(2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H/S +def : InstRW<[V3Write_6c_1SA_3V01], (instregex "ST4i(8|16|32)$")>; +def : InstRW<[WriteAdr, V3Write_6c_1SA_3V01], (instregex "ST4i(8|16|32)_POST$")>; + +// ASIMD store, 4 element, one lane, D +def : InstRW<[V3Write_4c_2SA_4V01], (instregex "ST4i(64)$")>; +def : InstRW<[WriteAdr, V3Write_4c_2SA_4V01], (instregex "ST4i(64)_POST$")>; + +// §3.22 Cryptography extensions +// ----------------------------------------------------------------------------- + +// Crypto AES ops +def : InstRW<[V3Write_2c_1V], (instregex "^AES[DE]rr$", "^AESI?MCrr")>; + +// Crypto polynomial (64x64) multiply long +def : InstRW<[V3Write_2c_1V], (instrs PMULLv1i64, PMULLv2i64)>; + +// Crypto SHA1 hash acceleration op +// Crypto SHA1 schedule acceleration ops +def : InstRW<[V3Write_2c_1V0], (instregex "^SHA1(H|SU0|SU1)")>; + +// Crypto SHA1 hash acceleration ops +// Crypto SHA256 hash acceleration ops +def : InstRW<[V3Write_4c_1V0], (instregex "^SHA1[CMP]", "^SHA256H2?")>; + +// Crypto SHA256 schedule acceleration ops +def : InstRW<[V3Write_2c_1V0], (instregex "^SHA256SU[01]")>; + +// Crypto SHA512 hash acceleration ops +def : InstRW<[V3Write_2c_1V0], (instregex "^SHA512(H|H2|SU0|SU1)")>; + +// Crypto SHA3 ops +def : InstRW<[V3Write_2c_1V], (instrs BCAX, EOR3, RAX1, XAR)>; + +// Crypto SM3 ops +def : InstRW<[V3Write_2c_1V0], (instregex "^SM3PARTW[12]$", "^SM3SS1$", + "^SM3TT[12][AB]$")>; + +// Crypto SM4 ops +def : InstRW<[V3Write_4c_1V0], (instrs SM4E, SM4ENCKEY)>; + +// §3.23 CRC +// ----------------------------------------------------------------------------- + +def : InstRW<[V3Wr_CRC, V3Rd_CRC], (instregex "^CRC32")>; + +// §3.24 SVE Predicate instructions +// ----------------------------------------------------------------------------- + +// Loop control, based on predicate +def : InstRW<[V3Write_2or3c_1M], (instrs BRKA_PPmP, BRKA_PPzP, + BRKB_PPmP, BRKB_PPzP)>; + +// Loop control, based on predicate and flag setting +def : InstRW<[V3Write_2or3c_1M], (instrs BRKAS_PPzP, BRKBS_PPzP)>; + +// Loop control, propagating +def : InstRW<[V3Write_2or3c_1M], (instrs BRKN_PPzP, BRKPA_PPzPP, + BRKPB_PPzPP)>; + +// Loop control, propagating and flag setting +def : InstRW<[V3Write_2or3c_1M], (instrs BRKNS_PPzP, BRKPAS_PPzPP, + BRKPBS_PPzPP)>; + +// Loop control, based on GPR +def : InstRW<[V3Write_3c_2M], + (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>; +def : InstRW<[V3Write_3c_2M], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>; + +// Loop terminate +def : InstRW<[V3Write_1c_2M], (instregex "^CTERM(EQ|NE)_(WW|XX)")>; + +// Predicate counting scalar +def : InstRW<[V3Write_2c_1M], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>; +def : InstRW<[V3Write_2c_1M], + (instregex 
"^(CNT|SQDEC|SQINC|UQDEC|UQINC)[BHWD]_XPiI", + "^SQ(DEC|INC)[BHWD]_XPiWdI", + "^UQ(DEC|INC)[BHWD]_WPiI")>; + +// Predicate counting scalar, ALL, {1,2,4} +def : InstRW<[V3Write_IncDec], (instregex "^(DEC|INC)[BHWD]_XPiI")>; + +// Predicate counting scalar, active predicate +def : InstRW<[V3Write_2c_1M], + (instregex "^CNTP_XPP_[BHSD]", + "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]", + "^(UQDEC|UQINC)P_WP_[BHSD]", + "^(SQDEC|SQINC)P_XPWd_[BHSD]")>; + +// Predicate counting vector, active predicate +def : InstRW<[V3Write_7c_1M_1M0_1V], + (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>; + +// Predicate logical +def : InstRW<[V3Write_1or2c_1M], + (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>; + +// Predicate logical, flag setting +def : InstRW<[V3Write_1or2c_1M], + (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>; + +// Predicate reverse +def : InstRW<[V3Write_2c_1M], (instregex "^REV_PP_[BHSD]")>; + +// Predicate select +def : InstRW<[V3Write_1c_1M], (instrs SEL_PPPP)>; + +// Predicate set +def : InstRW<[V3Write_2c_1M], (instregex "^PFALSE", "^PTRUE_[BHSD]")>; + +// Predicate set/initialize, set flags +def : InstRW<[V3Write_2c_1M], (instregex "^PTRUES_[BHSD]")>; + +// Predicate find first/next +def : InstRW<[V3Write_2c_1M], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>; + +// Predicate test +def : InstRW<[V3Write_1c_1M], (instrs PTEST_PP)>; + +// Predicate transpose +def : InstRW<[V3Write_2c_1M], (instregex "^TRN[12]_PPP_[BHSD]")>; + +// Predicate unpack and widen +def : InstRW<[V3Write_2c_1M], (instrs PUNPKHI_PP, PUNPKLO_PP)>; + +// Predicate zip/unzip +def : InstRW<[V3Write_2c_1M], (instregex "^(ZIP|UZP)[12]_PPP_[BHSD]")>; + +// §3.25 SVE integer instructions +// ----------------------------------------------------------------------------- + +// Arithmetic, absolute diff +def : InstRW<[V3Write_2c_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]", + "^[SU]ABD_ZPZZ_[BHSD]")>; + +// Arithmetic, absolute diff accum +def : InstRW<[V3Wr_ZA, V3Rd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]")>; + +// Arithmetic, absolute diff accum long +def : InstRW<[V3Wr_ZA, V3Rd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>; + +// Arithmetic, absolute diff long +def : InstRW<[V3Write_2c_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>; + +// Arithmetic, basic +def : InstRW<[V3Write_2c_1V], + (instregex "^(ABS|ADD|CNOT|NEG|SUB|SUBR)_ZPmZ_[BHSD]", + "^(ADD|SUB)_ZZZ_[BHSD]", + "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]", + "^(ADD|SUB|SUBR)_ZI_[BHSD]", + "^ADR_[SU]XTW_ZZZ_D_[0123]", + "^ADR_LSL_ZZZ_[SD]_[0123]", + "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]", + "^SADDLBT_ZZZ_[HSD]", + "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]", + "^SSUBL(BT|TB)_ZZZ_[HSD]")>; + +// Arithmetic, complex +def : InstRW<[V3Write_2c_1V], + (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]", + "^SQ(ABS|ADD|NEG|SUB|SUBR)_ZPmZ_[BHSD]", + "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]", + "^[SU]Q(ADD|SUB)_ZI_[BHSD]", + "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]", + "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>; + +// Arithmetic, large integer +def : InstRW<[V3Write_2c_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>; + +// Arithmetic, pairwise add +def : InstRW<[V3Write_2c_1V], (instregex "^ADDP_ZPmZ_[BHSD]")>; + +// Arithmetic, pairwise add and accum long +def : InstRW<[V3Wr_ZPA, ReadDefault, V3Rd_ZPA], + (instregex "^[SU]ADALP_ZPmZ_[HSD]")>; + +// Arithmetic, shift +def : InstRW<[V3Write_2c_1V13], + (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]", + "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]", + "^(ASR|LSL|LSR)_ZPmI_[BHSD]", + "^(ASR|LSL|LSR)_ZPmZ_[BHSD]", + "^(ASR|LSL|LSR)_ZZI_[BHSD]", + "^(ASR|LSL|LSR)_ZPZ[IZ]_[BHSD]", + 
"^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>; + +// Arithmetic, shift and accumulate +def : InstRW<[V3Wr_ZSA, V3Rd_ZSA], (instregex "^[SU]R?SRA_ZZI_[BHSD]")>; + +// Arithmetic, shift by immediate +def : InstRW<[V3Write_2c_1V], (instregex "^SHRN[BT]_ZZI_[BHS]", + "^[SU]SHLL[BT]_ZZI_[HSD]")>; + +// Arithmetic, shift by immediate and insert +def : InstRW<[V3Write_2c_1V], (instregex "^(SLI|SRI)_ZZI_[BHSD]")>; + +// Arithmetic, shift complex +def : InstRW<[V3Write_4c_1V], + (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]", + "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_ZPmZ_[BHSD]", + "^[SU]QR?SHL_ZPZZ_[BHSD]", + "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]", + "^SQSHRU?N[BT]_ZZI_[BHS]", + "^UQR?SHRN[BT]_ZZI_[BHS]")>; + +// Arithmetic, shift right for divide +def : InstRW<[V3Write_4c_1V], (instregex "^ASRD_(ZPmI|ZPZI)_[BHSD]")>; + +// Arithmetic, shift rounding +def : InstRW<[V3Write_4c_1V], (instregex "^[SU]RSHLR?_ZPmZ_[BHSD]", + "^[SU]RSHL_ZPZZ_[BHSD]", + "^[SU]RSHR_(ZPmI|ZPZI)_[BHSD]")>; + +// Bit manipulation +def : InstRW<[V3Write_6c_2V1], (instregex "^(BDEP|BEXT|BGRP)_ZZZ_[BHSD]")>; + +// Bitwise select +def : InstRW<[V3Write_2c_1V], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>; + +// Count/reverse bits +def : InstRW<[V3Write_2c_1V], (instregex "^(CLS|CLZ|CNT|RBIT)_ZPmZ_[BHSD]")>; + +// Broadcast logical bitmask immediate to vector +def : InstRW<[V3Write_2c_1V], (instrs DUPM_ZI)>; + +// Compare and set flags +def : InstRW<[V3Write_2or3c_1V0], + (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]", + "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>; + +// Complex add +def : InstRW<[V3Write_2c_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]")>; + +// Complex dot product 8-bit element +def : InstRW<[V3Wr_ZDOTB, V3Rd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>; + +// Complex dot product 16-bit element +def : InstRW<[V3Wr_ZDOTH, V3Rd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>; + +// Complex multiply-add B, H, S element size +def : InstRW<[V3Wr_ZCMABHS, V3Rd_ZCMABHS], (instregex "^CMLA_ZZZ_[BHS]", + "^CMLA_ZZZI_[HS]")>; + +// Complex multiply-add D element size +def : InstRW<[V3Wr_ZCMAD, V3Rd_ZCMAD], (instrs CMLA_ZZZ_D)>; + +// Conditional extract operations, scalar form +def : InstRW<[V3Write_8c_1M0_1V01], (instregex "^CLAST[AB]_RPZ_[BHSD]")>; + +// Conditional extract operations, SIMD&FP scalar and vector forms +def : InstRW<[V3Write_3c_1V1], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]", + "^COMPACT_ZPZ_[SD]", + "^SPLICE_ZPZZ?_[BHSD]")>; + +// Convert to floating point, 64b to float or convert to double +def : InstRW<[V3Write_3c_1V02], (instregex "^[SU]CVTF_ZPmZ_Dto[HSD]", + "^[SU]CVTF_ZPmZ_StoD")>; + +// Convert to floating point, 32b to single or half +def : InstRW<[V3Write_4c_2V02], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>; + +// Convert to floating point, 16b to half +def : InstRW<[V3Write_6c_4V02], (instregex "^[SU]CVTF_ZPmZ_HtoH")>; + +// Copy, scalar +def : InstRW<[V3Write_5c_1M0_1V], (instregex "^CPY_ZPmR_[BHSD]")>; + +// Copy, scalar SIMD&FP or imm +def : InstRW<[V3Write_2c_1V], (instregex "^CPY_ZPm[IV]_[BHSD]", + "^CPY_ZPzI_[BHSD]")>; + +// Divides, 32 bit +def : InstRW<[V3Write_12c_1V0], (instregex "^[SU]DIVR?_ZPmZ_S", + "^[SU]DIV_ZPZZ_S")>; + +// Divides, 64 bit +def : InstRW<[V3Write_20c_1V0], (instregex "^[SU]DIVR?_ZPmZ_D", + "^[SU]DIV_ZPZZ_D")>; + +// Dot product, 8 bit +def : InstRW<[V3Wr_ZDOTB, V3Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS")>; + +// Dot product, 8 bit, using signed and unsigned integers +def : InstRW<[V3Wr_ZDOTB, V3Rd_ZDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; 
+
+// Dot product, 16 bit
+def : InstRW<[V3Wr_ZDOTH, V3Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD")>;
+
+// Duplicate, immediate and indexed form
+def : InstRW<[V3Write_2c_1V], (instregex "^DUP_ZI_[BHSD]",
+                                         "^DUP_ZZI_[BHSDQ]")>;
+
+// Duplicate, scalar form
+def : InstRW<[V3Write_3c_1M0], (instregex "^DUP_ZR_[BHSD]")>;
+
+// Extend, sign or zero
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU]XTB_ZPmZ_[HSD]",
+                                         "^[SU]XTH_ZPmZ_[SD]",
+                                         "^[SU]XTW_ZPmZ_[D]")>;
+
+// Extract
+def : InstRW<[V3Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>;
+
+// Extract narrow saturating
+def : InstRW<[V3Write_4c_1V], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
+                                         "^SQXTUN[BT]_ZZ_[BHS]")>;
+
+// Extract operation, SIMD and FP scalar form
+def : InstRW<[V3Write_3c_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]")>;
+
+// Extract operation, scalar
+def : InstRW<[V3Write_6c_1V1_1M0], (instregex "^LAST[AB]_RPZ_[BHSD]")>;
+
+// Histogram operations
+def : InstRW<[V3Write_2c_1V], (instregex "^HISTCNT_ZPzZZ_[SD]",
+                                         "^HISTSEG_ZZZ")>;
+
+// Horizontal operations, B, H, S form, immediate operands only
+def : InstRW<[V3Write_4c_1V02], (instregex "^INDEX_II_[BHS]")>;
+
+// Horizontal operations, B, H, S form, scalar+immediate, scalar-only and
+// immediate+scalar operands
+def : InstRW<[V3Write_7c_1M0_1V02], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>;
+
+// Horizontal operations, D form, immediate operands only
+def : InstRW<[V3Write_5c_2V02], (instrs INDEX_II_D)>;
+
+// Horizontal operations, D form, scalar+immediate, scalar-only and
+// immediate+scalar operands
+def : InstRW<[V3Write_8c_2M0_2V02], (instregex "^INDEX_(IR|RI|RR)_D")>;
+
+// Insert operation, SIMD and FP scalar form
+def : InstRW<[V3Write_2c_1V], (instregex "^INSR_ZV_[BHSD]")>;
+
+// Insert operation, scalar
+def : InstRW<[V3Write_5c_1V1_1M0], (instregex "^INSR_ZR_[BHSD]")>;
+
+// Logical
+def : InstRW<[V3Write_2c_1V],
+             (instregex "^(AND|EOR|ORR)_ZI",
+                        "^(AND|BIC|EOR|ORR)_ZZZ",
+                        "^EOR(BT|TB)_ZZZ_[BHSD]",
+                        "^(AND|BIC|EOR|NOT|ORR)_(ZPmZ|ZPZZ)_[BHSD]",
+                        "^NOT_ZPmZ_[BHSD]")>;
+
+// Max/min, basic and pairwise
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
+                                         "^[SU](MAX|MIN)P?_ZPmZ_[BHSD]",
+                                         "^[SU](MAX|MIN)_ZPZZ_[BHSD]")>;
+
+// Matching operations
+// FIXME: SOG p. 44, n. 5: If the consuming instruction has a flag source, the
+// latency for this instruction is 4 cycles.
+def : InstRW<[V3Write_2or3c_1V0_1M], (instregex "^N?MATCH_PPzZZ_[BH]")>; + +// Matrix multiply-accumulate +def : InstRW<[V3Wr_ZMMA, V3Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; + +// Move prefix +def : InstRW<[V3Write_2c_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]", + "^MOVPRFX_ZZ")>; + +// Multiply, B, H, S element size +def : InstRW<[V3Write_4c_1V02], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]", + "^MUL_ZPZZ_[BHS]", + "^[SU]MULH_(ZPmZ|ZZZ)_[BHS]", + "^[SU]MULH_ZPZZ_[BHS]")>; + +// Multiply, D element size +def : InstRW<[V3Write_5c_2V02], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D", + "^MUL_ZPZZ_D", + "^[SU]MULH_(ZPmZ|ZZZ)_D", + "^[SU]MULH_ZPZZ_D")>; + +// Multiply long +def : InstRW<[V3Write_4c_1V02], (instregex "^[SU]MULL[BT]_ZZZI_[SD]", + "^[SU]MULL[BT]_ZZZ_[HSD]")>; + +// Multiply accumulate, B, H, S element size +def : InstRW<[V3Wr_ZMABHS, V3Rd_ZMABHS], + (instregex "^ML[AS]_ZZZI_[HS]", "^ML[AS]_ZPZZZ_[BHS]")>; +def : InstRW<[V3Wr_ZMABHS, ReadDefault, V3Rd_ZMABHS], + (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>; + +// Multiply accumulate, D element size +def : InstRW<[V3Wr_ZMAD, V3Rd_ZMAD], + (instregex "^ML[AS]_ZZZI_D", "^ML[AS]_ZPZZZ_D")>; +def : InstRW<[V3Wr_ZMAD, ReadDefault, V3Rd_ZMAD], + (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>; + +// Multiply accumulate long +def : InstRW<[V3Wr_ZMAL, V3Rd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]", + "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>; + +// Multiply accumulate saturating doubling long regular +def : InstRW<[V3Wr_ZMASQL, V3Rd_ZMASQ], + (instregex "^SQDML[AS]L(B|T|BT)_ZZZ_[HSD]", + "^SQDML[AS]L[BT]_ZZZI_[SD]")>; + +// Multiply saturating doubling high, B, H, S element size +def : InstRW<[V3Write_4c_1V02], (instregex "^SQDMULH_ZZZ_[BHS]", + "^SQDMULH_ZZZI_[HS]")>; + +// Multiply saturating doubling high, D element size +def : InstRW<[V3Write_5c_2V02], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>; + +// Multiply saturating doubling long +def : InstRW<[V3Write_4c_1V02], (instregex "^SQDMULL[BT]_ZZZ_[HSD]", + "^SQDMULL[BT]_ZZZI_[SD]")>; + +// Multiply saturating rounding doubling regular/complex accumulate, B, H, S +// element size +def : InstRW<[V3Wr_ZMASQBHS, V3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]", + "^SQRDCMLAH_ZZZ_[BHS]", + "^SQRDML[AS]H_ZZZI_[HS]", + "^SQRDCMLAH_ZZZI_[HS]")>; + +// Multiply saturating rounding doubling regular/complex accumulate, D element +// size +def : InstRW<[V3Wr_ZMASQD, V3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D", + "^SQRDCMLAH_ZZZ_D")>; + +// Multiply saturating rounding doubling regular/complex, B, H, S element size +def : InstRW<[V3Write_4c_1V02], (instregex "^SQRDMULH_ZZZ_[BHS]", + "^SQRDMULH_ZZZI_[HS]")>; + +// Multiply saturating rounding doubling regular/complex, D element size +def : InstRW<[V3Write_5c_2V02], (instregex "^SQRDMULH_ZZZI?_D")>; + +// Multiply/multiply long, (8x8) polynomial +def : InstRW<[V3Write_2c_1V], (instregex "^PMUL_ZZZ_B", + "^PMULL[BT]_ZZZ_[HDQ]")>; + +// Predicate counting vector +def : InstRW<[V3Write_2c_1V], (instregex "^([SU]Q)?(DEC|INC)[HWD]_ZPiI")>; + +// Reciprocal estimate +def : InstRW<[V3Write_4c_2V02], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>; + +// Reduction, arithmetic, B form +def : InstRW<[V3Write_9c_2V_4V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>; + +// Reduction, arithmetic, H form +def : InstRW<[V3Write_8c_2V_2V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>; + +// Reduction, arithmetic, S form +def : InstRW<[V3Write_6c_2V_2V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>; + +// Reduction, arithmetic, D form +def : InstRW<[V3Write_4c_2V], (instregex 
"^[SU](ADD|MAX|MIN)V_VPZ_D")>; + +// Reduction, logical +def : InstRW<[V3Write_6c_1V_1V13], (instregex "^(AND|EOR|OR)V_VPZ_[BHSD]")>; + +// Reverse, vector +def : InstRW<[V3Write_2c_1V], (instregex "^REV_ZZ_[BHSD]", + "^REVB_ZPmZ_[HSD]", + "^REVH_ZPmZ_[SD]", + "^REVW_ZPmZ_D")>; + +// Select, vector form +def : InstRW<[V3Write_2c_1V], (instregex "^SEL_ZPZZ_[BHSD]")>; + +// Table lookup +def : InstRW<[V3Write_2c_1V], (instregex "^TBL_ZZZZ?_[BHSD]")>; + +// Table lookup extension +def : InstRW<[V3Write_2c_1V], (instregex "^TBX_ZZZ_[BHSD]")>; + +// Transpose, vector form +def : InstRW<[V3Write_2c_1V], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>; + +// Unpack and extend +def : InstRW<[V3Write_2c_1V], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>; + +// Zip/unzip +def : InstRW<[V3Write_2c_1V], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>; + +// §3.26 SVE floating-point instructions +// ----------------------------------------------------------------------------- + +// Floating point absolute value/difference +def : InstRW<[V3Write_2c_1V], (instregex "^FAB[SD]_ZPmZ_[HSD]", + "^FABD_ZPZZ_[HSD]", + "^FABS_ZPmZ_[HSD]")>; + +// Floating point arithmetic +def : InstRW<[V3Write_2c_1V], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]", + "^F(ADD|SUB)_ZPZ[IZ]_[HSD]", + "^FADDP_ZPmZZ_[HSD]", + "^FNEG_ZPmZ_[HSD]", + "^FSUBR_ZPm[IZ]_[HSD]", + "^FSUBR_(ZPZI|ZPZZ)_[HSD]")>; + +// Floating point associative add, F16 +def : InstRW<[V3Write_10c_1V1_9rc], (instrs FADDA_VPZ_H)>; + +// Floating point associative add, F32 +def : InstRW<[V3Write_6c_1V1_5rc], (instrs FADDA_VPZ_S)>; + +// Floating point associative add, F64 +def : InstRW<[V3Write_4c_1V], (instrs FADDA_VPZ_D)>; + +// Floating point compare +def : InstRW<[V3Write_2c_1V0], (instregex "^FACG[ET]_PPzZZ_[HSD]", + "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]", + "^FCM(LE|LT)_PPzZ0_[HSD]", + "^FCMUO_PPzZZ_[HSD]")>; + +// Floating point complex add +def : InstRW<[V3Write_3c_1V], (instregex "^FCADD_ZPmZ_[HSD]")>; + +// Floating point complex multiply add +def : InstRW<[V3Wr_ZFCMA, ReadDefault, V3Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>; +def : InstRW<[V3Wr_ZFCMA, V3Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>; + +// Floating point convert, long or narrow (F16 to F32 or F32 to F16) +def : InstRW<[V3Write_4c_2V02], (instregex "^FCVT_ZPmZ_(HtoS|StoH)", + "^FCVTLT_ZPmZ_HtoS", + "^FCVTNT_ZPmZ_StoH")>; + +// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32 +// or F64 to F16) +def : InstRW<[V3Write_3c_1V02], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)", + "^FCVTLT_ZPmZ_StoD", + "^FCVTNT_ZPmZ_DtoS")>; + +// Floating point convert, round to odd +def : InstRW<[V3Write_3c_1V02], (instrs FCVTX_ZPmZ_DtoS, FCVTXNT_ZPmZ_DtoS)>; + +// Floating point base2 log, F16 +def : InstRW<[V3Write_6c_4V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>; + +// Floating point base2 log, F32 +def : InstRW<[V3Write_4c_2V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>; + +// Floating point base2 log, F64 +def : InstRW<[V3Write_3c_1V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>; + +// Floating point convert to integer, F16 +def : InstRW<[V3Write_6c_4V02], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>; + +// Floating point convert to integer, F32 +def : InstRW<[V3Write_4c_2V02], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>; + +// Floating point convert to integer, F64 +def : InstRW<[V3Write_3c_1V02], + (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>; + +// Floating point copy +def : InstRW<[V3Write_2c_1V], (instregex "^FCPY_ZPmI_[HSD]", + "^FDUP_ZI_[HSD]")>; + +// Floating point divide, F16 +def : 
InstRW<[V3Write_13c_1V1_8rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>; + +// Floating point divide, F32 +def : InstRW<[V3Write_11c_1V1_4rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>; + +// Floating point divide, F64 +def : InstRW<[V3Write_14c_1V1_2rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>; + +// Floating point min/max pairwise +def : InstRW<[V3Write_2c_1V], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>; + +// Floating point min/max +def : InstRW<[V3Write_2c_1V], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]", + "^F(MAX|MIN)(NM)?_ZPZ[IZ]_[HSD]")>; + +// Floating point multiply +def : InstRW<[V3Write_3c_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]", + "^FMULX_ZPZZ_[HSD]", + "^FMUL_(ZPm[IZ]|ZZZI?)_[HSD]", + "^FMUL_ZPZ[IZ]_[HSD]")>; + +// Floating point multiply accumulate +def : InstRW<[V3Wr_ZFMA, ReadDefault, V3Rd_ZFMA], + (instregex "^FN?ML[AS]_ZPmZZ_[HSD]", + "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>; +def : InstRW<[V3Wr_ZFMA, V3Rd_ZFMA], + (instregex "^FML[AS]_ZZZI_[HSD]", + "^FN?ML[AS]_ZPZZZ_[HSD]")>; + +// Floating point multiply add/sub accumulate long +def : InstRW<[V3Wr_ZFMAL, V3Rd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>; + +// Floating point reciprocal estimate, F16 +def : InstRW<[V3Write_6c_4V02], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>; + +// Floating point reciprocal estimate, F32 +def : InstRW<[V3Write_4c_2V02], (instregex "^FR(ECP|SQRT)E_ZZ_S", "^FRECPX_ZPmZ_S")>; + +// Floating point reciprocal estimate, F64 +def : InstRW<[V3Write_3c_1V02], (instregex "^FR(ECP|SQRT)E_ZZ_D", "^FRECPX_ZPmZ_D")>; + +// Floating point reciprocal step +def : InstRW<[V3Write_4c_1V], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>; + +// Floating point reduction, F16 +def : InstRW<[V3Write_8c_4V], + (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_H")>; + +// Floating point reduction, F32 +def : InstRW<[V3Write_6c_3V], + (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_S")>; + +// Floating point reduction, F64 +def : InstRW<[V3Write_4c_2V], + (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_D")>; + +// Floating point round to integral, F16 +def : InstRW<[V3Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>; + +// Floating point round to integral, F32 +def : InstRW<[V3Write_4c_2V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>; + +// Floating point round to integral, F64 +def : InstRW<[V3Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>; + +// Floating point square root, F16 +def : InstRW<[V3Write_13c_1V1_8rc], (instregex "^FSQRT_ZPmZ_H")>; + +// Floating point square root, F32 +def : InstRW<[V3Write_11c_1V1_4rc], (instregex "^FSQRT_ZPmZ_S")>; + +// Floating point square root, F64 +def : InstRW<[V3Write_14c_1V1_2rc], (instregex "^FSQRT_ZPmZ_D")>; + +// Floating point trigonometric exponentiation +def : InstRW<[V3Write_3c_1V1], (instregex "^FEXPA_ZZ_[HSD]")>; + +// Floating point trigonometric multiply add +def : InstRW<[V3Write_4c_1V], (instregex "^FTMAD_ZZI_[HSD]")>; + +// Floating point trigonometric, miscellaneous +def : InstRW<[V3Write_3c_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]")>; + +// §3.27 SVE BFloat16 (BF16) instructions +// ----------------------------------------------------------------------------- + +// Convert, F32 to BF16 +def : InstRW<[V3Write_4c_1V02], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; + +// Dot product +def : InstRW<[V3Wr_ZBFDOT, V3Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; + +// Matrix multiply accumulate +def : InstRW<[V3Wr_ZBFMMA, V3Rd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>; + +// Multiply accumulate long +def : InstRW<[V3Wr_ZBFMAL, V3Rd_ZBFMAL], (instregex 
"^BFMLAL[BT]_ZZZI?")>; + +// §3.28 SVE Load instructions +// ----------------------------------------------------------------------------- + +// Load vector +def : InstRW<[V3Write_6c_1L], (instrs LDR_ZXI)>; + +// Load predicate +def : InstRW<[V3Write_6c_1L_1M], (instrs LDR_PXI)>; + +// Contiguous load, scalar + imm +def : InstRW<[V3Write_6c_1L], (instregex "^LD1[BHWD]_IMM$", + "^LD1S?B_[HSD]_IMM$", + "^LD1S?H_[SD]_IMM$", + "^LD1S?W_D_IMM$" )>; +// Contiguous load, scalar + scalar +def : InstRW<[V3Write_6c_1L], (instregex "^LD1[BHWD]$", + "^LD1S?B_[HSD]$", + "^LD1S?H_[SD]$", + "^LD1S?W_D$" )>; + +// Contiguous load broadcast, scalar + imm +def : InstRW<[V3Write_6c_1L], (instregex "^LD1R[BHWD]_IMM$", + "^LD1RS?B_[HSD]_IMM$", + "^LD1RS?H_[SD]_IMM$", + "^LD1RW_D_IMM$", + "^LD1RSW_IMM$", + "^LD1RQ_[BHWD]_IMM$")>; + +// Contiguous load broadcast, scalar + scalar +def : InstRW<[V3Write_6c_1L], (instregex "^LD1RQ_[BHWD]$")>; + +// Non temporal load, scalar + imm +// Non temporal load, scalar + scalar +def : InstRW<[V3Write_6c_1L], (instregex "^LDNT1[BHWD]_ZR[IR]$")>; + +// Non temporal gather load, vector + scalar 32-bit element size +def : InstRW<[V3Write_9c_2L_4V], (instregex "^LDNT1[BHW]_ZZR_S$", + "^LDNT1S[BH]_ZZR_S$")>; + +// Non temporal gather load, vector + scalar 64-bit element size +def : InstRW<[V3Write_9c_2L_2V], (instregex "^LDNT1S?[BHW]_ZZR_D$")>; +def : InstRW<[V3Write_9c_2L_2V], (instrs LDNT1D_ZZR_D)>; + +// Contiguous first faulting load, scalar + scalar +def : InstRW<[V3Write_6c_1L_1I], (instregex "^LDFF1[BHWD]$", + "^LDFF1S?B_[HSD]$", + "^LDFF1S?H_[SD]$", + "^LDFF1S?W_D$")>; + +// Contiguous non faulting load, scalar + imm +def : InstRW<[V3Write_6c_1L], (instregex "^LDNF1[BHWD]_IMM$", + "^LDNF1S?B_[HSD]_IMM$", + "^LDNF1S?H_[SD]_IMM$", + "^LDNF1S?W_D_IMM$")>; + +// Contiguous Load two structures to two vectors, scalar + imm +def : InstRW<[V3Write_8c_2L_2V], (instregex "^LD2[BHWD]_IMM$")>; + +// Contiguous Load two structures to two vectors, scalar + scalar +def : InstRW<[V3Write_9c_2L_2V_2I], (instregex "^LD2[BHWD]$")>; + +// Contiguous Load three structures to three vectors, scalar + imm +def : InstRW<[V3Write_9c_3L_3V], (instregex "^LD3[BHWD]_IMM$")>; + +// Contiguous Load three structures to three vectors, scalar + scalar +def : InstRW<[V3Write_10c_3V_3L_3I], (instregex "^LD3[BHWD]$")>; + +// Contiguous Load four structures to four vectors, scalar + imm +def : InstRW<[V3Write_9c_4L_8V], (instregex "^LD4[BHWD]_IMM$")>; + +// Contiguous Load four structures to four vectors, scalar + scalar +def : InstRW<[V3Write_10c_4L_8V_4I], (instregex "^LD4[BHWD]$")>; + +// Gather load, vector + imm, 32-bit element size +def : InstRW<[V3Write_9c_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_IMM$", + "^GLD(FF)?1W_IMM$")>; + +// Gather load, vector + imm, 64-bit element size +def : InstRW<[V3Write_9c_1L_4V], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$", + "^GLD(FF)?1D_IMM$")>; + +// Gather load, 32-bit scaled offset +def : InstRW<[V3Write_10c_1L_8V], + (instregex "^GLD(FF)?1S?H_S_[SU]XTW_SCALED$", + "^GLD(FF)?1W_[SU]XTW_SCALED")>; + +// Gather load, 64-bit scaled offset +// NOTE: These instructions are not specified in the SOG. 
+def : InstRW<[V3Write_10c_1L_4V], + (instregex "^GLD(FF)?1S?[HW]_D_([SU]XTW_)?SCALED$", + "^GLD(FF)?1D_([SU]XTW_)?SCALED$")>; + +// Gather load, 32-bit unpacked unscaled offset +def : InstRW<[V3Write_9c_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$", + "^GLD(FF)?1W_[SU]XTW$")>; + +// Gather load, 64-bit unpacked unscaled offset +// NOTE: These instructions are not specified in the SOG. +def : InstRW<[V3Write_9c_1L_2V], + (instregex "^GLD(FF)?1S?[BHW]_D(_[SU]XTW)?$", + "^GLD(FF)?1D(_[SU]XTW)?$")>; + +// §3.29 SVE Store instructions +// ----------------------------------------------------------------------------- + +// Store from predicate reg +def : InstRW<[V3Write_1c_1SA], (instrs STR_PXI)>; + +// Store from vector reg +def : InstRW<[V3Write_2c_1SA_1V01], (instrs STR_ZXI)>; + +// Contiguous store, scalar + imm +def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^ST1[BHWD]_IMM$", + "^ST1B_[HSD]_IMM$", + "^ST1H_[SD]_IMM$", + "^ST1W_D_IMM$")>; + +// Contiguous store, scalar + scalar +def : InstRW<[V3Write_2c_1SA_1I_1V01], (instregex "^ST1H(_[SD])?$")>; +def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^ST1[BWD]$", + "^ST1B_[HSD]$", + "^ST1W_D$")>; + +// Contiguous store two structures from two vectors, scalar + imm +def : InstRW<[V3Write_4c_1SA_1V01], (instregex "^ST2[BHWD]_IMM$")>; + +// Contiguous store two structures from two vectors, scalar + scalar +def : InstRW<[V3Write_4c_2SA_2I_2V01], (instrs ST2H)>; +def : InstRW<[V3Write_4c_2SA_2V01], (instregex "^ST2[BWD]$")>; + +// Contiguous store three structures from three vectors, scalar + imm +def : InstRW<[V3Write_7c_9SA_9V01], (instregex "^ST3[BHWD]_IMM$")>; + +// Contiguous store three structures from three vectors, scalar + scalar +def : InstRW<[V3Write_7c_9SA_9I_9V01], (instregex "^ST3[BHWD]$")>; + +// Contiguous store four structures from four vectors, scalar + imm +def : InstRW<[V3Write_11c_18SA_18V01], (instregex "^ST4[BHWD]_IMM$")>; + +// Contiguous store four structures from four vectors, scalar + scalar +def : InstRW<[V3Write_11c_18SA_18I_18V01], (instregex "^ST4[BHWD]$")>; + +// Non temporal store, scalar + imm +def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STNT1[BHWD]_ZRI$")>; + +// Non temporal store, scalar + scalar +def : InstRW<[V3Write_2c_1SA_1I_1V01], (instrs STNT1H_ZRR)>; +def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STNT1[BWD]_ZRR$")>; + +// Scatter non temporal store, vector + scalar 32-bit element size +def : InstRW<[V3Write_4c_6SA_6V01], (instregex "^STNT1[BHW]_ZZR_S")>; + +// Scatter non temporal store, vector + scalar 64-bit element size +def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^STNT1[BHWD]_ZZR_D")>; + +// Scatter store vector + imm 32-bit element size +def : InstRW<[V3Write_4c_6SA_6V01], (instregex "^SST1[BH]_S_IMM$", + "^SST1W_IMM$")>; + +// Scatter store vector + imm 64-bit element size +def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[BHW]_D_IMM$", + "^SST1D_IMM$")>; + +// Scatter store, 32-bit scaled offset +def : InstRW<[V3Write_4c_6SA_6V01], + (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>; + +// Scatter store, 32-bit unpacked unscaled offset +def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[BHW]_D_[SU]XTW$", + "^SST1D_[SU]XTW$")>; + +// Scatter store, 32-bit unpacked scaled offset +def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$", + "^SST1D_[SU]XTW_SCALED$")>; + +// Scatter store, 32-bit unscaled offset +def : InstRW<[V3Write_4c_6SA_6V01], (instregex "^SST1[BH]_S_[SU]XTW$", + "^SST1W_[SU]XTW$")>; + +// Scatter store, 64-bit scaled offset +def : 
InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[HW]_D_SCALED$", + "^SST1D_SCALED$")>; + +// Scatter store, 64-bit unscaled offset +def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[BHW]_D$", + "^SST1D$")>; + +// §3.30 SVE Miscellaneous instructions +// ----------------------------------------------------------------------------- + +// Read first fault register, unpredicated +def : InstRW<[V3Write_2c_1M0], (instrs RDFFR_P)>; + +// Read first fault register, predicated +def : InstRW<[V3Write_3or4c_1M0_1M], (instrs RDFFR_PPz)>; + +// Read first fault register and set flags +def : InstRW<[V3Write_3or4c_1M0_1M], (instrs RDFFRS_PPz)>; + +// Set first fault register +// Write to first fault register +def : InstRW<[V3Write_2c_1M0], (instrs SETFFR, WRFFR)>; + +// Prefetch +// NOTE: This is not specified in the SOG. +def : InstRW<[V3Write_4c_1L], (instregex "^PRF[BHWD]")>; + +// §3.31 SVE Cryptographic instructions +// ----------------------------------------------------------------------------- + +// Crypto AES ops +def : InstRW<[V3Write_2c_1V], (instregex "^AES[DE]_ZZZ_B$", + "^AESI?MC_ZZ_B$")>; + +// Crypto SHA3 ops +def : InstRW<[V3Write_2c_1V], (instregex "^(BCAX|EOR3)_ZZZZ$", + "^RAX1_ZZZ_D$", + "^XAR_ZZZI_[BHSD]$")>; + +// Crypto SM4 ops +def : InstRW<[V3Write_4c_1V0], (instregex "^SM4E(KEY)?_ZZZ_S$")>; + +} diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td new file mode 100644 index 0000000..0f1ec66 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td @@ -0,0 +1,2705 @@ +//=- AArch64SchedNeoverseV3AE.td - NeoverseV3AE Scheduling Defs --*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the scheduling model for the Arm Neoverse V3AE processors. +// All information is taken from the V3AE Software Optimisation guide: +// +// https://developer.arm.com/documentation/109703/300/?lang=en +// +//===----------------------------------------------------------------------===// + +def NeoverseV3AEModel : SchedMachineModel { + let IssueWidth = 10; // Expect best value to be slightly higher than V2 + let MicroOpBufferSize = 320; // Entries in micro-op re-order buffer. NOTE: Copied from Neoverse-V2 + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 10; // Extra cycles for mispredicted branch. NOTE: Copied from N2. + let LoopMicroOpBufferSize = 16; // NOTE: Copied from Cortex-A57. + let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F, + [HasSVE2p1, HasSVEB16B16, + HasCPA, HasCSSC]); +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Neoverse V3AE. +// Instructions are first fetched and then decoded into internal macro-ops +// (MOPs). From there, the MOPs proceed through register renaming and dispatch +// stages. A MOP can be split into two micro-ops further down the pipeline +// after the decode stage. Once dispatched, micro-ops wait for their operands +// and issue out-of-order to one of nineteen issue pipelines. Each issue +// pipeline can accept one micro-op per cycle. + +let SchedModel = NeoverseV3AEModel in { + +// Define the (19) issue ports. 
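+// (3x B + 6x S + 2x M + 2x V + 1x LS0 + 2x L12 + 1x ST1 + 2x D = 19; the
+// V3AEUnitFlg resource defined alongside them is consumed by flag-setting
+// writes and is not itself an issue port.)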
+def V3AEUnitB : ProcResource<3>; // Branch 0/1/2 +def V3AEUnitS0 : ProcResource<1>; // Integer single-cycle 0 +def V3AEUnitS1 : ProcResource<1>; // Integer single-cycle 1 +def V3AEUnitS2 : ProcResource<1>; // Integer single-cycle 2 +def V3AEUnitS3 : ProcResource<1>; // Integer single-cycle 3 +def V3AEUnitS4 : ProcResource<1>; // Integer single-cycle 4 +def V3AEUnitS5 : ProcResource<1>; // Integer single-cycle 5 +def V3AEUnitM0 : ProcResource<1>; // Integer single/multicycle 0 +def V3AEUnitM1 : ProcResource<1>; // Integer single/multicycle 1 +def V3AEUnitV0 : ProcResource<1>; // FP/ASIMD 0 +def V3AEUnitV1 : ProcResource<1>; // FP/ASIMD 1 +def V3AEUnitLS0 : ProcResource<1>; // Load/Store 0 +def V3AEUnitL12 : ProcResource<2>; // Load 1/2 +def V3AEUnitST1 : ProcResource<1>; // Store 1 +def V3AEUnitD : ProcResource<2>; // Store data 0/1 +def V3AEUnitFlg : ProcResource<4>; // Flags + +def V3AEUnitS : ProcResGroup<[V3AEUnitS0, V3AEUnitS1, V3AEUnitS2, V3AEUnitS3, V3AEUnitS4, V3AEUnitS5]>; // Integer single-cycle 0/1/2/3/4/5 +def V3AEUnitI : ProcResGroup<[V3AEUnitS0, V3AEUnitS1, V3AEUnitS2, V3AEUnitS3, V3AEUnitS4, V3AEUnitS5, V3AEUnitM0, V3AEUnitM1]>; // Integer single-cycle 0/1/2/3/4/5 and single/multicycle 0/1 +def V3AEUnitM : ProcResGroup<[V3AEUnitM0, V3AEUnitM1]>; // Integer single/multicycle 0/1 +def V3AEUnitLSA : ProcResGroup<[V3AEUnitLS0, V3AEUnitL12, V3AEUnitST1]>; // Supergroup of L+SA +def V3AEUnitL : ProcResGroup<[V3AEUnitLS0, V3AEUnitL12]>; // Load/Store 0 and Load 1/2 +def V3AEUnitSA : ProcResGroup<[V3AEUnitLS0, V3AEUnitST1]>; // Load/Store 0 and Store 1 +def V3AEUnitV : ProcResGroup<[V3AEUnitV0, V3AEUnitV1]>; // FP/ASIMD 0/1 + +// Define commonly used read types. + +// No forwarding is provided for these types. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadST, 0>; +def : ReadAdvance<ReadVLD, 0>; + +// NOTE: Copied from N2. +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } +def : WriteRes<WriteLDHi, []> { let Latency = 4; } + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Neoverse V3AE. 
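+//
+// The write types below follow a common naming scheme: V3AEWrite_<n>c_<units>
+// has a result latency of <n> cycles and consumes one micro-op on each listed
+// unit, e.g. V3AEWrite_6c_2L occupies two of the L pipelines and yields its
+// result after 6 cycles. Where ReleaseAtCycles is set, the named unit stays
+// busy for the given number of cycles, modelling a throughput limit.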
+ +//===----------------------------------------------------------------------===// + +// Define generic 0 micro-op types +def V3AEWrite_0c : SchedWriteRes<[]> { let Latency = 0; } + +// Define generic 1 micro-op types + +def V3AEWrite_1c_1B : SchedWriteRes<[V3AEUnitB]> { let Latency = 1; } +def V3AEWrite_1c_1F_1Flg : SchedWriteRes<[V3AEUnitI, V3AEUnitFlg]> { let Latency = 1; } +def V3AEWrite_1c_1I : SchedWriteRes<[V3AEUnitI]> { let Latency = 1; } +def V3AEWrite_1c_1M : SchedWriteRes<[V3AEUnitM]> { let Latency = 1; } +def V3AEWrite_1c_1SA : SchedWriteRes<[V3AEUnitSA]> { let Latency = 1; } +def V3AEWrite_2c_1M : SchedWriteRes<[V3AEUnitM]> { let Latency = 2; } +def V3AEWrite_2c_1M_1Flg : SchedWriteRes<[V3AEUnitM, V3AEUnitFlg]> { let Latency = 2; } +def V3AEWrite_3c_1M : SchedWriteRes<[V3AEUnitM]> { let Latency = 3; } +def V3AEWrite_2c_1M0 : SchedWriteRes<[V3AEUnitM0]> { let Latency = 2; } +def V3AEWrite_3c_1M0 : SchedWriteRes<[V3AEUnitM0]> { let Latency = 3; } +def V3AEWrite_4c_1M0 : SchedWriteRes<[V3AEUnitM0]> { let Latency = 4; } +def V3AEWrite_12c_1M0 : SchedWriteRes<[V3AEUnitM0]> { let Latency = 12; + let ReleaseAtCycles = [12]; } +def V3AEWrite_20c_1M0 : SchedWriteRes<[V3AEUnitM0]> { let Latency = 20; + let ReleaseAtCycles = [20]; } +def V3AEWrite_4c_1L : SchedWriteRes<[V3AEUnitL]> { let Latency = 4; } +def V3AEWrite_6c_1L : SchedWriteRes<[V3AEUnitL]> { let Latency = 6; } +def V3AEWrite_2c_1V : SchedWriteRes<[V3AEUnitV]> { let Latency = 2; } +def V3AEWrite_2c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 2; } +def V3AEWrite_3c_1V : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; } +def V3AEWrite_4c_1V : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; } +def V3AEWrite_5c_1V : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; } +def V3AEWrite_6c_1V : SchedWriteRes<[V3AEUnitV]> { let Latency = 6; } +def V3AEWrite_12c_1V : SchedWriteRes<[V3AEUnitV]> { let Latency = 12; } +def V3AEWrite_3c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 3; } +def V3AEWrite_4c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; } +def V3AEWrite_9c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 9; } +def V3AEWrite_10c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 10; } +def V3AEWrite_8c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 8; } +def V3AEWrite_12c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 12; + let ReleaseAtCycles = [11]; } +def V3AEWrite_13c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 13; } +def V3AEWrite_15c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 15; } +def V3AEWrite_13c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 13; + let ReleaseAtCycles = [8]; } +def V3AEWrite_16c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 16; } +def V3AEWrite_20c_1V0 : SchedWriteRes<[V3AEUnitV0]> { let Latency = 20; + let ReleaseAtCycles = [20]; } +def V3AEWrite_2c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 2; } +def V3AEWrite_3c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 3; } +def V3AEWrite_4c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 4; } +def V3AEWrite_6c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 6; } +def V3AEWrite_10c_1V1 : SchedWriteRes<[V3AEUnitV1]> { let Latency = 10; } +def V3AEWrite_6c_1SA : SchedWriteRes<[V3AEUnitSA]> { let Latency = 6; } + +//===----------------------------------------------------------------------===// +// Define generic 2 micro-op types + +def V3AEWrite_1c_1B_1S : SchedWriteRes<[V3AEUnitB, V3AEUnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def V3AEWrite_6c_1M0_1B : SchedWriteRes<[V3AEUnitM0, V3AEUnitB]> { + let 
Latency = 6; + let NumMicroOps = 2; +} + +def V3AEWrite_9c_1M0_1L : SchedWriteRes<[V3AEUnitM0, V3AEUnitL]> { + let Latency = 9; + let NumMicroOps = 2; +} + +def V3AEWrite_3c_1I_1M : SchedWriteRes<[V3AEUnitI, V3AEUnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def V3AEWrite_1c_2M : SchedWriteRes<[V3AEUnitM, V3AEUnitM]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def V3AEWrite_3c_2M : SchedWriteRes<[V3AEUnitM, V3AEUnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def V3AEWrite_4c_2M : SchedWriteRes<[V3AEUnitM, V3AEUnitM]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def V3AEWrite_5c_1L_1I : SchedWriteRes<[V3AEUnitL, V3AEUnitI]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def V3AEWrite_6c_1I_1L : SchedWriteRes<[V3AEUnitI, V3AEUnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def V3AEWrite_7c_1I_1L : SchedWriteRes<[V3AEUnitI, V3AEUnitL]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def V3AEWrite_1c_1SA_1D : SchedWriteRes<[V3AEUnitSA, V3AEUnitD]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def V3AEWrite_5c_1M0_1V : SchedWriteRes<[V3AEUnitM0, V3AEUnitV]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def V3AEWrite_2c_1SA_1V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def V3AEWrite_2c_2V : SchedWriteRes<[V3AEUnitV, V3AEUnitV]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def V3AEWrite_5c_1V1_1V : SchedWriteRes<[V3AEUnitV1, V3AEUnitV]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def V3AEWrite_4c_2V0 : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def V3AEWrite_4c_2V : SchedWriteRes<[V3AEUnitV, V3AEUnitV]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def V3AEWrite_6c_2V : SchedWriteRes<[V3AEUnitV, V3AEUnitV]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def V3AEWrite_6c_2L : SchedWriteRes<[V3AEUnitL, V3AEUnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def V3AEWrite_8c_1L_1V : SchedWriteRes<[V3AEUnitL, V3AEUnitV]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def V3AEWrite_4c_1SA_1V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def V3AEWrite_3c_1M0_1M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def V3AEWrite_4c_1M0_1M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def V3AEWrite_1c_1M0_1M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def V3AEWrite_2c_1M0_1M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def V3AEWrite_6c_2V1 : SchedWriteRes<[V3AEUnitV1, V3AEUnitV1]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def V3AEWrite_5c_2V0 : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def V3AEWrite_5c_1V1_1M0 : SchedWriteRes<[V3AEUnitV1, V3AEUnitM0]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def V3AEWrite_6c_1V1_1M0 : SchedWriteRes<[V3AEUnitV1, V3AEUnitM0]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def V3AEWrite_7c_1M0_1V0 : SchedWriteRes<[V3AEUnitM0, V3AEUnitV0]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def V3AEWrite_2c_1V0_1M : SchedWriteRes<[V3AEUnitV0, V3AEUnitM]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def V3AEWrite_3c_1V0_1M : SchedWriteRes<[V3AEUnitV0, V3AEUnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def V3AEWrite_6c_1V_1V1 : SchedWriteRes<[V3AEUnitV, V3AEUnitV1]> { + let Latency = 6; + let NumMicroOps = 
2; +} + +def V3AEWrite_6c_1L_1M : SchedWriteRes<[V3AEUnitL, V3AEUnitM]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def V3AEWrite_6c_1L_1I : SchedWriteRes<[V3AEUnitL, V3AEUnitI]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def V3AEWrite_8c_1M0_1V : SchedWriteRes<[V3AEUnitM0, V3AEUnitV]> { + let Latency = 8; + let NumMicroOps = 2; +} + +//===----------------------------------------------------------------------===// +// Define generic 3 micro-op types + +def V3AEWrite_1c_1SA_1D_1I : SchedWriteRes<[V3AEUnitSA, V3AEUnitD, V3AEUnitI]> { + let Latency = 1; + let NumMicroOps = 3; +} + +def V3AEWrite_2c_1SA_1V_1I : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitI]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def V3AEWrite_2c_1SA_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def V3AEWrite_4c_1SA_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def V3AEWrite_9c_1L_2V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def V3AEWrite_4c_3V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def V3AEWrite_7c_1M_1M0_1V : SchedWriteRes<[V3AEUnitM, V3AEUnitM0, V3AEUnitV]> { + let Latency = 7; + let NumMicroOps = 3; +} + +def V3AEWrite_2c_1SA_1I_1V : SchedWriteRes<[V3AEUnitSA, V3AEUnitI, V3AEUnitV]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def V3AEWrite_6c_3L : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def V3AEWrite_6c_3V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def V3AEWrite_8c_1L_2V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} + +//===----------------------------------------------------------------------===// +// Define generic 4 micro-op types + +def V3AEWrite_2c_1SA_2V_1I : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV, + V3AEUnitI]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def V3AEWrite_5c_1I_3L : SchedWriteRes<[V3AEUnitI, V3AEUnitL, V3AEUnitL, V3AEUnitL]> { + let Latency = 5; + let NumMicroOps = 4; +} + +def V3AEWrite_6c_4V0 : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0, V3AEUnitV0, V3AEUnitV0]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def V3AEWrite_8c_4V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def V3AEWrite_6c_2V_2V1 : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV1, + V3AEUnitV1]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def V3AEWrite_6c_4V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def V3AEWrite_8c_2L_2V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV, V3AEUnitV]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def V3AEWrite_9c_2L_2V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV, V3AEUnitV]> { + let Latency = 9; + let NumMicroOps = 4; +} + +def V3AEWrite_2c_2SA_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV, + V3AEUnitV]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def V3AEWrite_4c_2SA_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV, + V3AEUnitV]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def V3AEWrite_8c_2M0_2V0 : SchedWriteRes<[V3AEUnitM0, V3AEUnitM0, V3AEUnitV0, + V3AEUnitV0]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def V3AEWrite_8c_2V_2V1 : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV1, 
+ V3AEUnitV1]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def V3AEWrite_4c_2M0_2M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM0, V3AEUnitM, + V3AEUnitM]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def V3AEWrite_5c_2M0_2M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM0, V3AEUnitM, + V3AEUnitM]> { + let Latency = 5; + let NumMicroOps = 4; +} + +def V3AEWrite_6c_2I_2L : SchedWriteRes<[V3AEUnitI, V3AEUnitI, V3AEUnitL, V3AEUnitL]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def V3AEWrite_7c_4L : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, V3AEUnitL]> { + let Latency = 7; + let NumMicroOps = 4; +} + +def V3AEWrite_6c_1SA_3V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV, + V3AEUnitV]> { + let Latency = 6; + let NumMicroOps = 4; +} + +//===----------------------------------------------------------------------===// +// Define generic 5 micro-op types + +def V3AEWrite_2c_1SA_2V_2I : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV, + V3AEUnitI, V3AEUnitI]> { + let Latency = 2; + let NumMicroOps = 5; +} + +def V3AEWrite_8c_2L_3V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV, V3AEUnitV, + V3AEUnitV]> { + let Latency = 8; + let NumMicroOps = 5; +} + +def V3AEWrite_9c_1L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV]> { + let Latency = 9; + let NumMicroOps = 5; +} + +def V3AEWrite_10c_1L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV]> { + let Latency = 10; + let NumMicroOps = 5; +} + +def V3AEWrite_6c_5V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV]> { + let Latency = 6; + let NumMicroOps = 5; +} + +//===----------------------------------------------------------------------===// +// Define generic 6 micro-op types + +def V3AEWrite_8c_3L_3V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def V3AEWrite_9c_3L_3V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def V3AEWrite_9c_2L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def V3AEWrite_9c_2L_2V_2I : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV, + V3AEUnitV, V3AEUnitI, V3AEUnitI]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def V3AEWrite_9c_2V_4V1 : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV1, + V3AEUnitV1, V3AEUnitV1, V3AEUnitV1]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def V3AEWrite_2c_3SA_3V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 2; + let NumMicroOps = 6; +} + +def V3AEWrite_4c_2SA_4V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def V3AEWrite_5c_2SA_4V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def V3AEWrite_4c_2SA_2I_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitI, + V3AEUnitI, V3AEUnitV, V3AEUnitV]> { + let Latency = 4; + let NumMicroOps = 6; +} + +//===----------------------------------------------------------------------===// +// Define generic 7 micro-op types + +def V3AEWrite_8c_3L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV]> { + let Latency = 8; + let NumMicroOps = 7; +} + 
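The records above and below all follow one shape: a SchedWriteRes names the pipeline units its micro-ops occupy, Latency gives the cycle count until the result is available, NumMicroOps the number of micro-ops dispatched (one per listed unit), and, where present, ReleaseAtCycles how long each listed unit stays busy. A minimal sketch of that shape, using a hypothetical name that is not part of this model:

def V3AEWrite_Example : SchedWriteRes<[V3AEUnitL, V3AEUnitV]> {
  let Latency     = 8;  // result available 8 cycles after issue
  let NumMicroOps = 2;  // one micro-op on the L pipeline, one on V
}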
+//===----------------------------------------------------------------------===// +// Define generic 8 micro-op types + +def V3AEWrite_2c_4SA_4V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV]> { + let Latency = 2; + let NumMicroOps = 8; +} + +def V3AEWrite_4c_4SA_4V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV]> { + let Latency = 4; + let NumMicroOps = 8; +} + +def V3AEWrite_6c_2SA_6V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV]> { + let Latency = 6; + let NumMicroOps = 8; +} + +def V3AEWrite_8c_4L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, V3AEUnitL, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV]> { + let Latency = 8; + let NumMicroOps = 8; +} + +//===----------------------------------------------------------------------===// +// Define generic 9 micro-op types + +def V3AEWrite_6c_3SA_6V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 6; + let NumMicroOps = 9; +} + +def V3AEWrite_10c_1L_8V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV]> { + let Latency = 10; + let NumMicroOps = 9; +} + +def V3AEWrite_10c_3V_3L_3I : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitL, V3AEUnitL, V3AEUnitL, + V3AEUnitI, V3AEUnitI, V3AEUnitI]> { + let Latency = 10; + let NumMicroOps = 9; +} + +//===----------------------------------------------------------------------===// +// Define generic 10 micro-op types + +def V3AEWrite_9c_6L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, V3AEUnitL, + V3AEUnitL, V3AEUnitL, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV]> { + let Latency = 9; + let NumMicroOps = 10; +} + +//===----------------------------------------------------------------------===// +// Define generic 12 micro-op types + +def V3AEWrite_5c_4SA_8V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 5; + let NumMicroOps = 12; +} + +def V3AEWrite_9c_4L_8V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, + V3AEUnitL, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 9; + let NumMicroOps = 12; +} + +def V3AEWrite_10c_4L_8V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, + V3AEUnitL, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 10; + let NumMicroOps = 12; +} + +//===----------------------------------------------------------------------===// +// Define generic 16 micro-op types + +def V3AEWrite_7c_4SA_12V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV]> { + let Latency = 7; + let NumMicroOps = 16; +} + +def V3AEWrite_10c_4L_8V_4I : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, + V3AEUnitL, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitI, V3AEUnitI, V3AEUnitI, + V3AEUnitI]> { + let Latency = 10; + let NumMicroOps = 16; +} + +//===----------------------------------------------------------------------===// +// Define generic 18 micro-op types + +def 
V3AEWrite_7c_9SA_9V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 7; + let NumMicroOps = 18; +} + +//===----------------------------------------------------------------------===// +// Define generic 27 micro-op types + +def V3AEWrite_7c_9SA_9I_9V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitI, V3AEUnitI, V3AEUnitI, + V3AEUnitI, V3AEUnitI, V3AEUnitI, + V3AEUnitI, V3AEUnitI, V3AEUnitI, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 7; + let NumMicroOps = 27; +} + +//===----------------------------------------------------------------------===// +// Define generic 36 micro-op types + +def V3AEWrite_11c_18SA_18V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, V3AEUnitSA, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV]> { + let Latency = 11; + let NumMicroOps = 36; +} + +//===----------------------------------------------------------------------===// +// Define generic 54 micro-op types + +def V3AEWrite_11c_18SA_18I_18V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, + V3AEUnitSA, V3AEUnitSA, + V3AEUnitI, V3AEUnitI, V3AEUnitI, + V3AEUnitI, V3AEUnitI, V3AEUnitI, + V3AEUnitI, V3AEUnitI, V3AEUnitI, + V3AEUnitI, V3AEUnitI, V3AEUnitI, + V3AEUnitI, V3AEUnitI, V3AEUnitI, + V3AEUnitI, V3AEUnitI, V3AEUnitI, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, V3AEUnitV, + V3AEUnitV, V3AEUnitV, + V3AEUnitV]> { + let Latency = 11; + let NumMicroOps = 54; +} + +//===----------------------------------------------------------------------===// +// Define predicate-controlled types + +def V3AEWrite_ArithI : SchedWriteVariant<[ + SchedVar<IsCheapLSL, [V3AEWrite_1c_1I]>, + SchedVar<NoSchedPred, [V3AEWrite_2c_1M]>]>; + +def V3AEWrite_ArithF : SchedWriteVariant<[ + SchedVar<IsCheapLSL, [V3AEWrite_1c_1F_1Flg]>, + SchedVar<NoSchedPred, [V3AEWrite_2c_1M_1Flg]>]>; + +def V3AEWrite_Logical : SchedWriteVariant<[ + SchedVar<NeoverseNoLSL, [V3AEWrite_1c_1F_1Flg]>, + SchedVar<NoSchedPred, [V3AEWrite_2c_1M_1Flg]>]>; + +def V3AEWrite_Extr : SchedWriteVariant<[ + SchedVar<IsRORImmIdiomPred, [V3AEWrite_1c_1I]>, + SchedVar<NoSchedPred, [V3AEWrite_3c_1I_1M]>]>; + +def V3AEWrite_LdrHQ : SchedWriteVariant<[ + SchedVar<NeoverseHQForm, [V3AEWrite_7c_1I_1L]>, + SchedVar<NoSchedPred, [V3AEWrite_6c_1L]>]>; + +def V3AEWrite_StrHQ : SchedWriteVariant<[ + SchedVar<NeoverseHQForm, [V3AEWrite_2c_1SA_1V_1I]>, + SchedVar<NoSchedPred, [V3AEWrite_2c_1SA_1V]>]>; + +def V3AEWrite_0or1c_1I : SchedWriteVariant<[ + SchedVar<NeoverseZeroMove, [V3AEWrite_0c]>, + SchedVar<NoSchedPred, [V3AEWrite_1c_1I]>]>; + +def V3AEWrite_0or2c_1V : SchedWriteVariant<[ + 
SchedVar<NeoverseZeroMove, [V3AEWrite_0c]>, + SchedVar<NoSchedPred, [V3AEWrite_2c_1V]>]>; + +def V3AEWrite_0or3c_1M0 : SchedWriteVariant<[ + SchedVar<NeoverseZeroMove, [V3AEWrite_0c]>, + SchedVar<NoSchedPred, [V3AEWrite_3c_1M0]>]>; + +def V3AEWrite_2or3c_1M : SchedWriteVariant<[ + SchedVar<NeoversePdIsPg, [V3AEWrite_3c_1M]>, + SchedVar<NoSchedPred, [V3AEWrite_2c_1M]>]>; + +def V3AEWrite_1or2c_1M : SchedWriteVariant<[ + SchedVar<NeoversePdIsPg, [V3AEWrite_2c_1M]>, + SchedVar<NoSchedPred, [V3AEWrite_1c_1M]>]>; + +def V3AEWrite_3or4c_1M0_1M : SchedWriteVariant<[ + SchedVar<NeoversePdIsPg, [V3AEWrite_4c_1M0_1M]>, + SchedVar<NoSchedPred, [V3AEWrite_3c_1M0_1M]>]>; + +def V3AEWrite_2or3c_1V0 : SchedWriteVariant<[ + SchedVar<NeoversePdIsPg, [V3AEWrite_3c_1V0]>, + SchedVar<NoSchedPred, [V3AEWrite_2c_1V0]>]>; + +def V3AEWrite_2or3c_1V0_1M : SchedWriteVariant<[ + SchedVar<NeoversePdIsPg, [V3AEWrite_3c_1V0_1M]>, + SchedVar<NoSchedPred, [V3AEWrite_2c_1V0_1M]>]>; + +def V3AEWrite_IncDec : SchedWriteVariant<[ + SchedVar<NeoverseCheapIncDec, [V3AEWrite_1c_1I]>, + SchedVar<NoSchedPred, [V3AEWrite_2c_1M]>]>; + +//===----------------------------------------------------------------------===// +// Define forwarded types + +// NOTE: SOG, p. 16, n. 2: Accumulator forwarding is not supported for +// consumers of 64 bit multiply high operations? +def V3AEWr_IM : SchedWriteRes<[V3AEUnitM]> { let Latency = 2; } + +def V3AEWr_FMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; } +def V3AERd_FMA : SchedReadAdvance<2, [WriteFMul, V3AEWr_FMA]>; + +def V3AEWr_VA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; } +def V3AERd_VA : SchedReadAdvance<3, [V3AEWr_VA]>; + +def V3AEWr_VDOT : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; } +def V3AERd_VDOT : SchedReadAdvance<2, [V3AEWr_VDOT]>; + +def V3AEWr_VMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; } +def V3AERd_VMMA : SchedReadAdvance<2, [V3AEWr_VMMA]>; + +def V3AEWr_VMA : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; } +def V3AERd_VMA : SchedReadAdvance<3, [V3AEWr_VMA]>; + +def V3AEWr_VMAH : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 4; } +def V3AERd_VMAH : SchedReadAdvance<2, [V3AEWr_VMAH]>; + +def V3AEWr_VMAL : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; } +def V3AERd_VMAL : SchedReadAdvance<3, [V3AEWr_VMAL]>; + +def V3AEWr_VPA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; } +def V3AERd_VPA : SchedReadAdvance<3, [V3AEWr_VPA]>; + +def V3AEWr_VSA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; } +def V3AERd_VSA : SchedReadAdvance<3, [V3AEWr_VSA]>; + +def V3AEWr_VFCMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; } +def V3AERd_VFCMA : SchedReadAdvance<2, [V3AEWr_VFCMA]>; + +def V3AEWr_VFM : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; } +def V3AEWr_VFMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; } +def V3AERd_VFMA : SchedReadAdvance<2, [V3AEWr_VFM, V3AEWr_VFMA]>; + +def V3AEWr_VFMAL : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; } +def V3AERd_VFMAL : SchedReadAdvance<2, [V3AEWr_VFMAL]>; + +def V3AEWr_VBFDOT : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; } +def V3AERd_VBFDOT : SchedReadAdvance<2, [V3AEWr_VBFDOT]>; +def V3AEWr_VBFMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 6; } +def V3AERd_VBFMMA : SchedReadAdvance<2, [V3AEWr_VBFMMA]>; +def V3AEWr_VBFMAL : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; } +def V3AERd_VBFMAL : SchedReadAdvance<3, [V3AEWr_VBFMAL]>; + +def V3AEWr_CRC : SchedWriteRes<[V3AEUnitM0]> { let Latency = 2; } +def V3AERd_CRC : SchedReadAdvance<1, [V3AEWr_CRC]>; + +def V3AEWr_ZA : SchedWriteRes<[V3AEUnitV]> { let 
Latency = 4; } +def V3AERd_ZA : SchedReadAdvance<3, [V3AEWr_ZA]>; +def V3AEWr_ZPA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; } +def V3AERd_ZPA : SchedReadAdvance<3, [V3AEWr_ZPA]>; +def V3AEWr_ZSA : SchedWriteRes<[V3AEUnitV1]> { let Latency = 4; } +def V3AERd_ZSA : SchedReadAdvance<3, [V3AEWr_ZSA]>; + +def V3AEWr_ZDOTB : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; } +def V3AERd_ZDOTB : SchedReadAdvance<2, [V3AEWr_ZDOTB]>; +def V3AEWr_ZDOTH : SchedWriteRes<[V3AEUnitV0]> { let Latency = 3; } +def V3AERd_ZDOTH : SchedReadAdvance<2, [V3AEWr_ZDOTH]>; + +// NOTE: SOG p. 43: Complex multiply-add B, H, S element size: How to reduce +// throughput to 1 in case of forwarding? +def V3AEWr_ZCMABHS : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; } +def V3AERd_ZCMABHS : SchedReadAdvance<3, [V3AEWr_ZCMABHS]>; +def V3AEWr_ZCMAD : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 5; } +def V3AERd_ZCMAD : SchedReadAdvance<2, [V3AEWr_ZCMAD]>; + +def V3AEWr_ZMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; } +def V3AERd_ZMMA : SchedReadAdvance<2, [V3AEWr_ZMMA]>; + +def V3AEWr_ZMABHS : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; } +def V3AERd_ZMABHS : SchedReadAdvance<3, [V3AEWr_ZMABHS]>; +def V3AEWr_ZMAD : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 5; } +def V3AERd_ZMAD : SchedReadAdvance<2, [V3AEWr_ZMAD]>; + +def V3AEWr_ZMAL : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; } +def V3AERd_ZMAL : SchedReadAdvance<3, [V3AEWr_ZMAL]>; + +def V3AEWr_ZMASQL : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; } +def V3AEWr_ZMASQBHS : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; } +def V3AEWr_ZMASQD : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 5; } +def V3AERd_ZMASQ : SchedReadAdvance<2, [V3AEWr_ZMASQL, V3AEWr_ZMASQBHS, + V3AEWr_ZMASQD]>; + +def V3AEWr_ZFCMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; } +def V3AERd_ZFCMA : SchedReadAdvance<3, [V3AEWr_ZFCMA]>; + +def V3AEWr_ZFMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; } +def V3AERd_ZFMA : SchedReadAdvance<2, [V3AEWr_ZFMA]>; + +def V3AEWr_ZFMAL : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; } +def V3AERd_ZFMAL : SchedReadAdvance<2, [V3AEWr_ZFMAL]>; + +def V3AEWr_ZBFDOT : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; } +def V3AERd_ZBFDOT : SchedReadAdvance<2, [V3AEWr_ZBFDOT]>; +def V3AEWr_ZBFMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 6; } +def V3AERd_ZBFMMA : SchedReadAdvance<2, [V3AEWr_ZBFMMA]>; +def V3AEWr_ZBFMAL : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; } +def V3AERd_ZBFMAL : SchedReadAdvance<3, [V3AEWr_ZBFMAL]>; + +//===----------------------------------------------------------------------===// +// Define types with long resource cycles (rc) + +def V3AEWrite_6c_1V1_5rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 6; let ReleaseAtCycles = [ 5]; } +def V3AEWrite_9c_1V1_2rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 9; let ReleaseAtCycles = [ 2]; } +def V3AEWrite_9c_1V1_4rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 9; let ReleaseAtCycles = [ 4]; } +def V3AEWrite_10c_1V1_9rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 10; let ReleaseAtCycles = [ 9]; } +def V3AEWrite_11c_1V1_4rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 11; let ReleaseAtCycles = [ 4]; } +def V3AEWrite_13c_1V1_8rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 13; let ReleaseAtCycles = [8]; } +def V3AEWrite_14c_1V1_2rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 14; let ReleaseAtCycles = [2]; } + +// Miscellaneous +// ----------------------------------------------------------------------------- + +def : InstRW<[WriteI], 
(instrs COPY)>; + +// §3.3 Branch instructions +// ----------------------------------------------------------------------------- + +// Branch, immed +// Compare and branch +def : SchedAlias<WriteBr, V3AEWrite_1c_1B>; + +// Branch, register +def : SchedAlias<WriteBrReg, V3AEWrite_1c_1B>; + +// Branch and link, immed +// Branch and link, register +def : InstRW<[V3AEWrite_1c_1B_1S], (instrs BL, BLR)>; + +// §3.4 Arithmetic and Logical Instructions +// ----------------------------------------------------------------------------- + +// ALU, basic +def : SchedAlias<WriteI, V3AEWrite_1c_1I>; + +// ALU, basic, flagset +def : InstRW<[V3AEWrite_1c_1F_1Flg], + (instregex "^(ADD|SUB)S[WX]r[ir]$", + "^(ADC|SBC)S[WX]r$", + "^ANDS[WX]ri$", + "^(AND|BIC)S[WX]rr$")>; +def : InstRW<[V3AEWrite_0or1c_1I], (instregex "^MOVZ[WX]i$")>; + +// ALU, extend and shift +def : SchedAlias<WriteIEReg, V3AEWrite_2c_1M>; + +// Arithmetic, LSL shift, shift <= 4 +// Arithmetic, flagset, LSL shift, shift <= 4 +// Arithmetic, LSR/ASR/ROR shift or LSL shift > 4 +def : SchedAlias<WriteISReg, V3AEWrite_ArithI>; +def : InstRW<[V3AEWrite_ArithF], + (instregex "^(ADD|SUB)S[WX]rs$")>; + +// Arithmetic, immediate to logical address tag +def : InstRW<[V3AEWrite_2c_1M], (instrs ADDG, SUBG)>; + +// Conditional compare +def : InstRW<[V3AEWrite_1c_1F_1Flg], (instregex "^CCM[NP][WX][ir]")>; + +// Convert floating-point condition flags +// Flag manipulation instructions +def : WriteRes<WriteSys, []> { let Latency = 1; } + +// Insert Random Tags +def : InstRW<[V3AEWrite_2c_1M], (instrs IRG, IRGstack)>; + +// Insert Tag Mask +// Subtract Pointer +def : InstRW<[V3AEWrite_1c_1I], (instrs GMI, SUBP)>; + +// Subtract Pointer, flagset +def : InstRW<[V3AEWrite_1c_1F_1Flg], (instrs SUBPS)>; + +// Logical, shift, no flagset +def : InstRW<[V3AEWrite_1c_1I], (instregex "^(AND|BIC|EON|EOR|ORN)[WX]rs$")>; +def : InstRW<[V3AEWrite_0or1c_1I], (instregex "^ORR[WX]rs$")>; + +// Logical, shift, flagset +def : InstRW<[V3AEWrite_Logical], (instregex "^(AND|BIC)S[WX]rs$")>; + +// Move and shift instructions +// ----------------------------------------------------------------------------- + +def : SchedAlias<WriteImm, V3AEWrite_1c_1I>; + +// §3.5 Divide and multiply instructions +// ----------------------------------------------------------------------------- + +// SDIV, UDIV +def : SchedAlias<WriteID32, V3AEWrite_12c_1M0>; +def : SchedAlias<WriteID64, V3AEWrite_20c_1M0>; + +def : SchedAlias<WriteIM32, V3AEWrite_2c_1M>; +def : SchedAlias<WriteIM64, V3AEWrite_2c_1M>; + +// Multiply +// Multiply accumulate, W-form +// Multiply accumulate, X-form +def : InstRW<[V3AEWr_IM], (instregex "^M(ADD|SUB)[WX]rrr$")>; + +// Multiply accumulate long +// Multiply long +def : InstRW<[V3AEWr_IM], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; + +// Multiply high +def : InstRW<[V3AEWrite_3c_1M], (instrs SMULHrr, UMULHrr)>; + +// §3.6 Pointer Authentication Instructions (v8.3 PAC) +// ----------------------------------------------------------------------------- + +// Authenticate data address +// Authenticate instruction address +// Compute pointer authentication code for data address +// Compute pointer authentication code, using generic key +// Compute pointer authentication code for instruction address +def : InstRW<[V3AEWrite_4c_1M0], (instregex "^AUT", "^PAC")>; + +// Branch and link, register, with pointer authentication +// Branch, register, with pointer authentication +// Branch, return, with pointer authentication +def : InstRW<[V3AEWrite_6c_1M0_1B], (instrs BLRAA, BLRAAZ, BLRAB, 
BLRABZ, BRAA, + BRAAZ, BRAB, BRABZ, RETAA, RETAB, + ERETAA, ERETAB)>; + + +// Load register, with pointer authentication +def : InstRW<[V3AEWrite_9c_1M0_1L], (instregex "^LDRA[AB](indexed|writeback)")>; + +// Strip pointer authentication code +def : InstRW<[V3AEWrite_2c_1M0], (instrs XPACD, XPACI, XPACLRI)>; + +// §3.7 Miscellaneous data-processing instructions +// ----------------------------------------------------------------------------- + +// Address generation +def : InstRW<[V3AEWrite_1c_1I], (instrs ADR, ADRP)>; + +// Bitfield extract, one reg +// Bitfield extract, two regs +def : SchedAlias<WriteExtr, V3AEWrite_Extr>; +def : InstRW<[V3AEWrite_Extr], (instrs EXTRWrri, EXTRXrri)>; + +// Bitfield move, basic +def : SchedAlias<WriteIS, V3AEWrite_1c_1I>; + +// Bitfield move, insert +def : InstRW<[V3AEWrite_2c_1M], (instregex "^BFM[WX]ri$")>; + +// §3.8 Load instructions +// ----------------------------------------------------------------------------- + +// NOTE: SOG p. 19: Throughput of LDN?P X-form should be 2, but reported as 3. + +def : SchedAlias<WriteLD, V3AEWrite_4c_1L>; +def : SchedAlias<WriteLDIdx, V3AEWrite_4c_1L>; + +// Load register, literal +def : InstRW<[V3AEWrite_5c_1L_1I], (instrs LDRWl, LDRXl, LDRSWl, PRFMl)>; + +// Load pair, signed immed offset, signed words +def : InstRW<[V3AEWrite_5c_1I_3L, WriteLDHi], (instrs LDPSWi)>; + +// Load pair, immed post-index or immed pre-index, signed words +def : InstRW<[WriteAdr, V3AEWrite_5c_1I_3L, WriteLDHi], + (instregex "^LDPSW(post|pre)$")>; + +// §3.9 Store instructions +// ----------------------------------------------------------------------------- + +// NOTE: SOG, p. 20: Unsure if STRH uses pipeline I. + +def : SchedAlias<WriteST, V3AEWrite_1c_1SA_1D>; +def : SchedAlias<WriteSTIdx, V3AEWrite_1c_1SA_1D>; +def : SchedAlias<WriteSTP, V3AEWrite_1c_1SA_1D>; +def : SchedAlias<WriteAdr, V3AEWrite_1c_1I>; + +// §3.10 Tag load instructions +// ----------------------------------------------------------------------------- + +// Load allocation tag +// Load multiple allocation tags +def : InstRW<[V3AEWrite_4c_1L], (instrs LDG, LDGM)>; + +// §3.11 Tag store instructions +// ----------------------------------------------------------------------------- + +// Store allocation tags to one or two granules, post-index +// Store allocation tags to one or two granules, pre-index +// Store allocation tag to one or two granules, zeroing, post-index +// Store Allocation Tag to one or two granules, zeroing, pre-index +// Store allocation tag and reg pair to memory, post-Index +// Store allocation tag and reg pair to memory, pre-Index +def : InstRW<[V3AEWrite_1c_1SA_1D_1I], (instrs STGPreIndex, STGPostIndex, + ST2GPreIndex, ST2GPostIndex, + STZGPreIndex, STZGPostIndex, + STZ2GPreIndex, STZ2GPostIndex, + STGPpre, STGPpost)>; + +// Store allocation tags to one or two granules, signed offset +// Store allocation tag to two granules, zeroing, signed offset +// Store allocation tag and reg pair to memory, signed offset +// Store multiple allocation tags +def : InstRW<[V3AEWrite_1c_1SA_1D], (instrs STGi, ST2Gi, STZGi, + STZ2Gi, STGPi, STGM, STZGM)>; + +// §3.12 FP data processing instructions +// ----------------------------------------------------------------------------- + +// FP absolute value +// FP arithmetic +// FP min/max +// FP negate +// FP select +def : SchedAlias<WriteF, V3AEWrite_2c_1V>; + +// FP compare +def : SchedAlias<WriteFCmp, V3AEWrite_2c_1V0>; + +// FP divide, square root +def : SchedAlias<WriteFDiv, V3AEWrite_6c_1V1>; + +// FP divide, 
H-form +def : InstRW<[V3AEWrite_6c_1V1], (instrs FDIVHrr)>; +// FP divide, S-form +def : InstRW<[V3AEWrite_8c_1V1], (instrs FDIVSrr)>; +// FP divide, D-form +def : InstRW<[V3AEWrite_13c_1V1], (instrs FDIVDrr)>; + +// FP square root, H-form +def : InstRW<[V3AEWrite_6c_1V1], (instrs FSQRTHr)>; +// FP square root, S-form +def : InstRW<[V3AEWrite_8c_1V1], (instrs FSQRTSr)>; +// FP square root, D-form +def : InstRW<[V3AEWrite_13c_1V1], (instrs FSQRTDr)>; + +// FP multiply +def : WriteRes<WriteFMul, [V3AEUnitV]> { let Latency = 3; } + +// FP multiply accumulate +def : InstRW<[V3AEWr_FMA, ReadDefault, ReadDefault, V3AERd_FMA], + (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>; + +// FP round to integral +def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FRINT[AIMNPXZ][HSD]r$", + "^FRINT(32|64)[XZ][SD]r$")>; + +// §3.13 FP miscellaneous instructions +// ----------------------------------------------------------------------------- + +// FP convert, from gen to vec reg +def : InstRW<[V3AEWrite_3c_1M0], (instregex "^[SU]CVTF[SU][WX][HSD]ri$")>; + +// FP convert, from vec to gen reg +def : InstRW<[V3AEWrite_3c_1V0], + (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]ri?$")>; + +// FP convert, Javascript from vec to gen reg +def : SchedAlias<WriteFCvt, V3AEWrite_3c_1V0>; + +// FP convert, from vec to vec reg +def : InstRW<[V3AEWrite_3c_1V], (instrs FCVTSHr, FCVTDHr, FCVTHSr, FCVTDSr, + FCVTHDr, FCVTSDr, FCVTXNv1i64)>; + +// FP move, immed +// FP move, register +def : SchedAlias<WriteFImm, V3AEWrite_2c_1V>; + +// FP transfer, from gen to low half of vec reg +def : InstRW<[V3AEWrite_0or3c_1M0], + (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr)>; + +// FP transfer, from gen to high half of vec reg +def : InstRW<[V3AEWrite_5c_1M0_1V], (instrs FMOVXDHighr)>; + +// FP transfer, from vec to gen reg +def : SchedAlias<WriteFCopy, V3AEWrite_2c_2V>; + +// §3.14 FP load instructions +// ----------------------------------------------------------------------------- + +// Load vector reg, literal, S/D/Q forms +def : InstRW<[V3AEWrite_7c_1I_1L], (instregex "^LDR[SDQ]l$")>; + +// Load vector reg, unscaled immed +def : InstRW<[V3AEWrite_6c_1L], (instregex "^LDUR[BHSDQ]i$")>; + +// Load vector reg, immed post-index +// Load vector reg, immed pre-index +def : InstRW<[WriteAdr, V3AEWrite_6c_1I_1L], + (instregex "^LDR[BHSDQ](pre|post)$")>; + +// Load vector reg, unsigned immed +def : InstRW<[V3AEWrite_6c_1L], (instregex "^LDR[BHSDQ]ui$")>; + +// Load vector reg, register offset, basic +// Load vector reg, register offset, scale, S/D-form +// Load vector reg, register offset, scale, H/Q-form +// Load vector reg, register offset, extend +// Load vector reg, register offset, extend, scale, S/D-form +// Load vector reg, register offset, extend, scale, H/Q-form +def : InstRW<[V3AEWrite_LdrHQ, ReadAdrBase], (instregex "^LDR[BHSDQ]ro[WX]$")>; + +// Load vector pair, immed offset, S/D-form +def : InstRW<[V3AEWrite_6c_1L, WriteLDHi], (instregex "^LDN?P[SD]i$")>; + +// Load vector pair, immed offset, Q-form +def : InstRW<[V3AEWrite_6c_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>; + +// Load vector pair, immed post-index, S/D-form +// Load vector pair, immed pre-index, S/D-form +def : InstRW<[WriteAdr, V3AEWrite_6c_1I_1L, WriteLDHi], + (instregex "^LDP[SD](pre|post)$")>; + +// Load vector pair, immed post-index, Q-form +// Load vector pair, immed pre-index, Q-form +def : InstRW<[WriteAdr, V3AEWrite_6c_2I_2L, WriteLDHi], (instrs LDPQpost, + LDPQpre)>; + +// §3.15 FP store instructions +// ----------------------------------------------------------------------------- 
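The pre/post-indexed stores in this section also define their updated base register, so each InstRW lists one write per result: WriteAdr first, lining up with the base-register writeback def, followed by the write that supplies the store's unit usage. A sketch of the shape only; the opcode regex is a placeholder, not a real instruction:

// WriteAdr models the writeback def; the second write contributes the
// store micro-ops on the SA, V and I pipelines.
def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V_1I],
             (instregex "^EXAMPLEST(pre|post)$")>;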
+ +// Store vector reg, unscaled immed, B/H/S/D-form +// Store vector reg, unscaled immed, Q-form +def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STUR[BHSDQ]i$")>; + +// Store vector reg, immed post-index, B/H/S/D-form +// Store vector reg, immed post-index, Q-form +// Store vector reg, immed pre-index, B/H/S/D-form +// Store vector reg, immed pre-index, Q-form +def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V_1I], + (instregex "^STR[BHSDQ](pre|post)$")>; + +// Store vector reg, unsigned immed, B/H/S/D-form +// Store vector reg, unsigned immed, Q-form +def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STR[BHSDQ]ui$")>; + +// Store vector reg, register offset, basic, B/H/S/D-form +// Store vector reg, register offset, basic, Q-form +// Store vector reg, register offset, scale, H-form +// Store vector reg, register offset, scale, S/D-form +// Store vector reg, register offset, scale, Q-form +// Store vector reg, register offset, extend, B/H/S/D-form +// Store vector reg, register offset, extend, Q-form +// Store vector reg, register offset, extend, scale, H-form +// Store vector reg, register offset, extend, scale, S/D-form +// Store vector reg, register offset, extend, scale, Q-form +def : InstRW<[V3AEWrite_StrHQ, ReadAdrBase], + (instregex "^STR[BHSDQ]ro[WX]$")>; + +// Store vector pair, immed offset, S-form +// Store vector pair, immed offset, D-form +def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STN?P[SD]i$")>; + +// Store vector pair, immed offset, Q-form +def : InstRW<[V3AEWrite_2c_1SA_2V], (instrs STPQi, STNPQi)>; + +// Store vector pair, immed post-index, S-form +// Store vector pair, immed post-index, D-form +// Store vector pair, immed pre-index, S-form +// Store vector pair, immed pre-index, D-form +def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V_1I], + (instregex "^STP[SD](pre|post)$")>; + +// Store vector pair, immed post-index, Q-form +def : InstRW<[V3AEWrite_2c_1SA_2V_1I], (instrs STPQpost)>; + +// Store vector pair, immed pre-index, Q-form +def : InstRW<[V3AEWrite_2c_1SA_2V_2I], (instrs STPQpre)>; + +// §3.16 ASIMD integer instructions +// ----------------------------------------------------------------------------- + +// ASIMD absolute diff +// ASIMD absolute diff long +// ASIMD arith, basic +// ASIMD arith, complex +// ASIMD arith, pair-wise +// ASIMD compare +// ASIMD logical +// ASIMD max/min, basic and pair-wise +def : SchedAlias<WriteVd, V3AEWrite_2c_1V>; +def : SchedAlias<WriteVq, V3AEWrite_2c_1V>; + +// ASIMD absolute diff accum +// ASIMD absolute diff accum long +def : InstRW<[V3AEWr_VA, V3AERd_VA], (instregex "^[SU]ABAL?v")>; + +// ASIMD arith, reduce, 4H/4S +def : InstRW<[V3AEWrite_3c_1V1], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>; + +// ASIMD arith, reduce, 8B/8H +def : InstRW<[V3AEWrite_5c_1V1_1V], + (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>; + +// ASIMD arith, reduce, 16B +def : InstRW<[V3AEWrite_6c_2V1], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>; + +// ASIMD dot product +// ASIMD dot product using signed and unsigned integers +def : InstRW<[V3AEWr_VDOT, V3AERd_VDOT], + (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>; + +// ASIMD matrix multiply-accumulate +def : InstRW<[V3AEWr_VMMA, V3AERd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>; + +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[V3AEWrite_3c_1V1], (instregex "^[SU](MAX|MIN)Vv4i16v$", + "^[SU](MAX|MIN)Vv4i32v$")>; + +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[V3AEWrite_5c_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$", + "^[SU](MAX|MIN)Vv8i16v$")>; + +// ASIMD max/min, reduce, 16B +def : 
InstRW<[V3AEWrite_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>; + +// ASIMD multiply +def : InstRW<[V3AEWrite_4c_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>; + +// ASIMD multiply accumulate +def : InstRW<[V3AEWr_VMA, V3AERd_VMA], (instregex "^MLAv", "^MLSv")>; + +// ASIMD multiply accumulate high +def : InstRW<[V3AEWr_VMAH, V3AERd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>; + +// ASIMD multiply accumulate long +def : InstRW<[V3AEWr_VMAL, V3AERd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>; + +// ASIMD multiply accumulate saturating long +def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SQDML[AS]L[iv]")>; + +// ASIMD multiply/multiply long (8x8) polynomial, D-form +// ASIMD multiply/multiply long (8x8) polynomial, Q-form +def : InstRW<[V3AEWrite_3c_1V], (instregex "^PMULL?(v8i8|v16i8)$")>; + +// ASIMD multiply long +def : InstRW<[V3AEWrite_3c_1V0], (instregex "^[SU]MULLv", "^SQDMULL[iv]")>; + +// ASIMD pairwise add and accumulate long +def : InstRW<[V3AEWr_VPA, V3AERd_VPA], (instregex "^[SU]ADALPv")>; + +// ASIMD shift accumulate +def : InstRW<[V3AEWr_VSA, V3AERd_VSA], (instregex "^[SU]SRA[dv]", "^[SU]RSRA[dv]")>; + +// ASIMD shift by immed, basic +def : InstRW<[V3AEWrite_2c_1V], (instregex "^SHL[dv]", "^SHLLv", "^SHRNv", + "^SSHLLv", "^SSHR[dv]", "^USHLLv", + "^USHR[dv]")>; + +// ASIMD shift by immed and insert, basic +def : InstRW<[V3AEWrite_2c_1V], (instregex "^SLI[dv]", "^SRI[dv]")>; + +// ASIMD shift by immed, complex +def : InstRW<[V3AEWrite_4c_1V], + (instregex "^RSHRNv", "^SQRSHRU?N[bhsv]", "^(SQSHLU?|UQSHL)[bhsd]$", + "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$", + "^SQSHRU?N[bhsv]", "^SRSHR[dv]", "^UQRSHRN[bhsv]", + "^UQSHRN[bhsv]", "^URSHR[dv]")>; + +// ASIMD shift by register, basic +def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]SHLv")>; + +// ASIMD shift by register, complex +def : InstRW<[V3AEWrite_4c_1V], + (instregex "^[SU]RSHLv", "^[SU]QRSHLv", + "^[SU]QSHL(v1i8|v1i16|v1i32|v1i64|v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)$")>; + +// §3.17 ASIMD floating-point instructions +// ----------------------------------------------------------------------------- + +// ASIMD FP absolute value/difference +// ASIMD FP arith, normal +// ASIMD FP compare +// ASIMD FP complex add +// ASIMD FP max/min, normal +// ASIMD FP max/min, pairwise +// ASIMD FP negate +// Handled by SchedAlias<WriteV[dq], ...> + +// ASIMD FP complex multiply add +def : InstRW<[V3AEWr_VFCMA, V3AERd_VFCMA], (instregex "^FCMLAv")>; + +// ASIMD FP convert, long (F16 to F32) +def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVTL(v4|v8)i16")>; + +// ASIMD FP convert, long (F32 to F64) +def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FCVTL(v2|v4)i32")>; + +// ASIMD FP convert, narrow (F32 to F16) +def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVTN(v4|v8)i16")>; + +// ASIMD FP convert, narrow (F64 to F32) +def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FCVTN(v2|v4)i32", + "^FCVTXN(v2|v4)f32")>; + +// ASIMD FP convert, other, D-form F32 and Q-form F64 +def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FCVT[AMNPZ][SU]v2f(32|64)$", + "^FCVT[AMNPZ][SU]v2i(32|64)_shift$", + "^FCVT[AMNPZ][SU]v1i64$", + "^FCVTZ[SU]d$", + "^[SU]CVTFv2f(32|64)$", + "^[SU]CVTFv2i(32|64)_shift$", + "^[SU]CVTFv1i64$", + "^[SU]CVTFd$")>; + +// ASIMD FP convert, other, D-form F16 and Q-form F32 +def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVT[AMNPZ][SU]v4f(16|32)$", + "^FCVT[AMNPZ][SU]v4i(16|32)_shift$", + "^FCVT[AMNPZ][SU]v1i32$", + "^FCVTZ[SU]s$", + "^[SU]CVTFv4f(16|32)$", + "^[SU]CVTFv4i(16|32)_shift$", + "^[SU]CVTFv1i32$", + 
"^[SU]CVTFs$")>; + +// ASIMD FP convert, other, Q-form F16 +def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FCVT[AMNPZ][SU]v8f16$", + "^FCVT[AMNPZ][SU]v8i16_shift$", + "^FCVT[AMNPZ][SU]v1f16$", + "^FCVTZ[SU]h$", + "^[SU]CVTFv8f16$", + "^[SU]CVTFv8i16_shift$", + "^[SU]CVTFv1i16$", + "^[SU]CVTFh$")>; + +// ASIMD FP divide, D-form, F16 +def : InstRW<[V3AEWrite_9c_1V1_4rc], (instrs FDIVv4f16)>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[V3AEWrite_9c_1V1_2rc], (instrs FDIVv2f32)>; + +// ASIMD FP divide, Q-form, F16 +def : InstRW<[V3AEWrite_13c_1V1_8rc], (instrs FDIVv8f16)>; + +// ASIMD FP divide, Q-form, F32 +def : InstRW<[V3AEWrite_11c_1V1_4rc], (instrs FDIVv4f32)>; + +// ASIMD FP divide, Q-form, F64 +def : InstRW<[V3AEWrite_14c_1V1_2rc], (instrs FDIVv2f64)>; + +// ASIMD FP max/min, reduce, F32 and D-form F16 +def : InstRW<[V3AEWrite_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>; + +// ASIMD FP max/min, reduce, Q-form F16 +def : InstRW<[V3AEWrite_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>; + +// ASIMD FP multiply +def : InstRW<[V3AEWr_VFM], (instregex "^FMULv", "^FMULXv")>; + +// ASIMD FP multiply accumulate +def : InstRW<[V3AEWr_VFMA, V3AERd_VFMA], (instregex "^FMLAv", "^FMLSv")>; + +// ASIMD FP multiply accumulate long +def : InstRW<[V3AEWr_VFMAL, V3AERd_VFMAL], (instregex "^FML[AS]L2?(lane)?v")>; + +// ASIMD FP round, D-form F32 and Q-form F64 +def : InstRW<[V3AEWrite_3c_1V0], + (instregex "^FRINT[AIMNPXZ]v2f(32|64)$", + "^FRINT(32|64)[XZ]v2f(32|64)$")>; + +// ASIMD FP round, D-form F16 and Q-form F32 +def : InstRW<[V3AEWrite_4c_2V0], + (instregex "^FRINT[AIMNPXZ]v4f(16|32)$", + "^FRINT(32|64)[XZ]v4f32$")>; + +// ASIMD FP round, Q-form F16 +def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FRINT[AIMNPXZ]v8f16$")>; + +// ASIMD FP square root, D-form, F16 +def : InstRW<[V3AEWrite_9c_1V1_4rc], (instrs FSQRTv4f16)>; + +// ASIMD FP square root, D-form, F32 +def : InstRW<[V3AEWrite_9c_1V1_2rc], (instrs FSQRTv2f32)>; + +// ASIMD FP square root, Q-form, F16 +def : InstRW<[V3AEWrite_13c_1V1_8rc], (instrs FSQRTv8f16)>; + +// ASIMD FP square root, Q-form, F32 +def : InstRW<[V3AEWrite_11c_1V1_4rc], (instrs FSQRTv4f32)>; + +// ASIMD FP square root, Q-form, F64 +def : InstRW<[V3AEWrite_14c_1V1_2rc], (instrs FSQRTv2f64)>; + +// §3.18 ASIMD BFloat16 (BF16) instructions +// ----------------------------------------------------------------------------- + +// ASIMD convert, F32 to BF16 +def : InstRW<[V3AEWrite_4c_2V0], (instrs BFCVTN, BFCVTN2)>; + +// ASIMD dot product +def : InstRW<[V3AEWr_VBFDOT, V3AERd_VBFDOT], (instrs BFDOTv4bf16, BFDOTv8bf16)>; + +// ASIMD matrix multiply accumulate +def : InstRW<[V3AEWr_VBFMMA, V3AERd_VBFMMA], (instrs BFMMLA)>; + +// ASIMD multiply accumulate long +def : InstRW<[V3AEWr_VBFMAL, V3AERd_VBFMAL], (instrs BFMLALB, BFMLALBIdx, BFMLALT, + BFMLALTIdx)>; + +// Scalar convert, F32 to BF16 +def : InstRW<[V3AEWrite_3c_1V0], (instrs BFCVT)>; + +// §3.19 ASIMD miscellaneous instructions +// ----------------------------------------------------------------------------- + +// ASIMD bit reverse +// ASIMD bitwise insert +// ASIMD count +// ASIMD duplicate, element +// ASIMD extract +// ASIMD extract narrow +// ASIMD insert, element to element +// ASIMD move, FP immed +// ASIMD move, integer immed +// ASIMD reverse +// ASIMD table lookup extension, 1 table reg +// ASIMD transpose +// ASIMD unzip/zip +// Handled by SchedAlias<WriteV[dq], ...> +def : InstRW<[V3AEWrite_0or2c_1V], (instrs MOVID, MOVIv2d_ns)>; + +// ASIMD duplicate, gen reg +def : InstRW<[V3AEWrite_3c_1M0], 
(instregex "^DUPv.+gpr")>; + +// ASIMD extract narrow, saturating +def : InstRW<[V3AEWrite_4c_1V], (instregex "^[SU]QXTNv", "^SQXTUNv")>; + +// ASIMD reciprocal and square root estimate, D-form U32 +def : InstRW<[V3AEWrite_3c_1V0], (instrs URECPEv2i32, URSQRTEv2i32)>; + +// ASIMD reciprocal and square root estimate, Q-form U32 +def : InstRW<[V3AEWrite_4c_2V0], (instrs URECPEv4i32, URSQRTEv4i32)>; + +// ASIMD reciprocal and square root estimate, D-form F32 and scalar forms +def : InstRW<[V3AEWrite_3c_1V0], (instrs FRECPEv1f16, FRECPEv1i32, + FRECPEv1i64, FRECPEv2f32, + FRSQRTEv1f16, FRSQRTEv1i32, + FRSQRTEv1i64, FRSQRTEv2f32)>; + +// ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32 +def : InstRW<[V3AEWrite_4c_2V0], (instrs FRECPEv4f16, FRECPEv4f32, + FRSQRTEv4f16, FRSQRTEv4f32)>; + +// ASIMD reciprocal and square root estimate, Q-form F16 +def : InstRW<[V3AEWrite_6c_4V0], (instrs FRECPEv8f16, FRSQRTEv8f16)>; + +// ASIMD reciprocal exponent +def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FRECPXv")>; + +// ASIMD reciprocal step +def : InstRW<[V3AEWrite_4c_1V], (instregex "^FRECPS(32|64|v)", + "^FRSQRTS(32|64|v)")>; + +// ASIMD table lookup, 1 or 2 table regs +def : InstRW<[V3AEWrite_2c_1V], (instrs TBLv8i8One, TBLv16i8One, + TBLv8i8Two, TBLv16i8Two)>; + +// ASIMD table lookup, 3 table regs +def : InstRW<[V3AEWrite_4c_2V], (instrs TBLv8i8Three, TBLv16i8Three)>; + +// ASIMD table lookup, 4 table regs +def : InstRW<[V3AEWrite_4c_3V], (instrs TBLv8i8Four, TBLv16i8Four)>; + +// ASIMD table lookup extension, 2 table reg +def : InstRW<[V3AEWrite_4c_2V], (instrs TBXv8i8Two, TBXv16i8Two)>; + +// ASIMD table lookup extension, 3 table reg +def : InstRW<[V3AEWrite_6c_3V], (instrs TBXv8i8Three, TBXv16i8Three)>; + +// ASIMD table lookup extension, 4 table reg +def : InstRW<[V3AEWrite_6c_5V], (instrs TBXv8i8Four, TBXv16i8Four)>; + +// ASIMD transfer, element to gen reg +def : InstRW<[V3AEWrite_2c_2V], (instregex "^[SU]MOVv")>; + +// ASIMD transfer, gen reg to element +def : InstRW<[V3AEWrite_5c_1M0_1V], (instregex "^INSvi(8|16|32|64)gpr$")>; + +// §3.20 ASIMD load instructions +// ----------------------------------------------------------------------------- + +// ASIMD load, 1 element, multiple, 1 reg, D-form +def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_6c_1L], + (instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, multiple, 1 reg, Q-form +def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_6c_1L], + (instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 2 reg, D-form +def : InstRW<[V3AEWrite_6c_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_6c_2L], + (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, multiple, 2 reg, Q-form +def : InstRW<[V3AEWrite_6c_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_6c_2L], + (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form +def : InstRW<[V3AEWrite_6c_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_6c_3L], + (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, Q-form +def : InstRW<[V3AEWrite_6c_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_6c_3L], + (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form +def : 
InstRW<[V3AEWrite_7c_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_7c_4L], + (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, Q-form +def : InstRW<[V3AEWrite_7c_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_7c_4L], + (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S +// ASIMD load, 1 element, one lane, D +def : InstRW<[V3AEWrite_8c_1L_1V], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_1L_1V], (instregex "LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S +// ASIMD load, 1 element, all lanes, D-form, D +def : InstRW<[V3AEWrite_8c_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, all lanes, Q-form +def : InstRW<[V3AEWrite_8c_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S +def : InstRW<[V3AEWrite_8c_1L_2V], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_1L_2V], (instregex "LD2Twov(8b|4h|2s)_POST$")>; + +// ASIMD load, 2 element, multiple, Q-form, B/H/S +// ASIMD load, 2 element, multiple, Q-form, D +def : InstRW<[V3AEWrite_8c_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H +// ASIMD load, 2 element, one lane, S +// ASIMD load, 2 element, one lane, D +def : InstRW<[V3AEWrite_8c_1L_2V], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_1L_2V], (instregex "LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +def : InstRW<[V3AEWrite_8c_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[V3AEWrite_8c_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +def : InstRW<[V3AEWrite_8c_2L_3V], (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_2L_3V], (instregex "LD3Threev(8b|4h|2s)_POST$")>; + +// ASIMD load, 3 element, multiple, Q-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[V3AEWrite_8c_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lane, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[V3AEWrite_8c_2L_3V], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_2L_3V], (instregex "LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, D-form, D +def : InstRW<[V3AEWrite_8c_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[V3AEWrite_8c_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_3L_3V], (instregex 
"LD3Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +def : InstRW<[V3AEWrite_8c_3L_4V], (instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_3L_4V], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; + +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[V3AEWrite_9c_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_9c_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[V3AEWrite_8c_3L_4V], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_3L_4V], (instregex "LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +def : InstRW<[V3AEWrite_8c_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[V3AEWrite_8c_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_8c_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>; + +// §3.21 ASIMD store instructions +// ----------------------------------------------------------------------------- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "ST1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "ST1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_2c_2SA_2V], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_2c_2SA_2V], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[V3AEWrite_2c_3SA_3V], (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_2c_3SA_3V], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_2c_2SA_2V], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, Q-form +def : InstRW<[V3AEWrite_2c_4SA_4V], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_2c_4SA_4V], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[V3AEWrite_4c_1SA_2V], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3AEWrite_4c_1SA_2V], (instregex "ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +def : InstRW<[V3AEWrite_4c_1SA_2V], (instregex 
"ST2Twov(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, V3AEWrite_4c_1SA_2V], (instregex "ST2Twov(8b|4h|2s)_POST$")>; + +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[V3AEWrite_4c_2SA_4V], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_4c_2SA_4V], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[V3AEWrite_4c_1SA_2V], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3AEWrite_4c_1SA_2V], (instregex "ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +def : InstRW<[V3AEWrite_5c_2SA_4V], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, V3AEWrite_5c_2SA_4V], (instregex "ST3Threev(8b|4h|2s)_POST$")>; + +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[V3AEWrite_6c_3SA_6V], (instregex "ST3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_6c_3SA_6V], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D +def : InstRW<[V3AEWrite_5c_2SA_4V], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[WriteAdr, V3AEWrite_5c_2SA_4V], (instregex "ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +def : InstRW<[V3AEWrite_6c_2SA_6V], (instregex "ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, V3AEWrite_6c_2SA_6V], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; + +// ASIMD store, 4 element, multiple, Q-form, B/H/S +def : InstRW<[V3AEWrite_7c_4SA_12V], (instregex "ST4Fourv(16b|8h|4s)$")>; +def : InstRW<[WriteAdr, V3AEWrite_7c_4SA_12V], (instregex "ST4Fourv(16b|8h|4s)_POST$")>; + +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[V3AEWrite_5c_4SA_8V], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[WriteAdr, V3AEWrite_5c_4SA_8V], (instregex "ST4Fourv(2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H/S +def : InstRW<[V3AEWrite_6c_1SA_3V], (instregex "ST4i(8|16|32)$")>; +def : InstRW<[WriteAdr, V3AEWrite_6c_1SA_3V], (instregex "ST4i(8|16|32)_POST$")>; + +// ASIMD store, 4 element, one lane, D +def : InstRW<[V3AEWrite_4c_2SA_4V], (instregex "ST4i(64)$")>; +def : InstRW<[WriteAdr, V3AEWrite_4c_2SA_4V], (instregex "ST4i(64)_POST$")>; + +// §3.22 Cryptography extensions +// ----------------------------------------------------------------------------- + +// Crypto AES ops +def : InstRW<[V3AEWrite_2c_1V], (instregex "^AES[DE]rr$", "^AESI?MCrr")>; + +// Crypto polynomial (64x64) multiply long +def : InstRW<[V3AEWrite_2c_1V], (instrs PMULLv1i64, PMULLv2i64)>; + +// Crypto SHA1 hash acceleration op +// Crypto SHA1 schedule acceleration ops +def : InstRW<[V3AEWrite_2c_1V0], (instregex "^SHA1(H|SU0|SU1)")>; + +// Crypto SHA1 hash acceleration ops +// Crypto SHA256 hash acceleration ops +def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SHA1[CMP]", "^SHA256H2?")>; + +// Crypto SHA256 schedule acceleration ops +def : InstRW<[V3AEWrite_2c_1V0], (instregex "^SHA256SU[01]")>; + +// Crypto SHA512 hash acceleration ops +def : InstRW<[V3AEWrite_2c_1V0], (instregex "^SHA512(H|H2|SU0|SU1)")>; + +// Crypto SHA3 ops +def : InstRW<[V3AEWrite_2c_1V], (instrs BCAX, EOR3, RAX1, XAR)>; + +// Crypto SM3 ops +def : InstRW<[V3AEWrite_2c_1V0], (instregex "^SM3PARTW[12]$", "^SM3SS1$", + "^SM3TT[12][AB]$")>; + +// Crypto SM4 ops +def : InstRW<[V3AEWrite_4c_1V0], (instrs SM4E, SM4ENCKEY)>; + 
+// §3.23 CRC +// ----------------------------------------------------------------------------- + +def : InstRW<[V3AEWr_CRC, V3AERd_CRC], (instregex "^CRC32")>; + +// §3.24 SVE Predicate instructions +// ----------------------------------------------------------------------------- + +// Loop control, based on predicate +def : InstRW<[V3AEWrite_2or3c_1M], (instrs BRKA_PPmP, BRKA_PPzP, + BRKB_PPmP, BRKB_PPzP)>; + +// Loop control, based on predicate and flag setting +def : InstRW<[V3AEWrite_2or3c_1M], (instrs BRKAS_PPzP, BRKBS_PPzP)>; + +// Loop control, propagating +def : InstRW<[V3AEWrite_2or3c_1M], (instrs BRKN_PPzP, BRKPA_PPzPP, + BRKPB_PPzPP)>; + +// Loop control, propagating and flag setting +def : InstRW<[V3AEWrite_2or3c_1M], (instrs BRKNS_PPzP, BRKPAS_PPzPP, + BRKPBS_PPzPP)>; + +// Loop control, based on GPR +def : InstRW<[V3AEWrite_3c_2M], + (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>; +def : InstRW<[V3AEWrite_3c_2M], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>; + +// Loop terminate +def : InstRW<[V3AEWrite_1c_2M], (instregex "^CTERM(EQ|NE)_(WW|XX)")>; + +// Predicate counting scalar +def : InstRW<[V3AEWrite_2c_1M], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>; +def : InstRW<[V3AEWrite_2c_1M], + (instregex "^(CNT|SQDEC|SQINC|UQDEC|UQINC)[BHWD]_XPiI", + "^SQ(DEC|INC)[BHWD]_XPiWdI", + "^UQ(DEC|INC)[BHWD]_WPiI")>; + +// Predicate counting scalar, ALL, {1,2,4} +def : InstRW<[V3AEWrite_IncDec], (instregex "^(DEC|INC)[BHWD]_XPiI")>; + +// Predicate counting scalar, active predicate +def : InstRW<[V3AEWrite_2c_1M], + (instregex "^CNTP_XPP_[BHSD]", + "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]", + "^(UQDEC|UQINC)P_WP_[BHSD]", + "^(SQDEC|SQINC)P_XPWd_[BHSD]")>; + +// Predicate counting vector, active predicate +def : InstRW<[V3AEWrite_7c_1M_1M0_1V], + (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>; + +// Predicate logical +def : InstRW<[V3AEWrite_1or2c_1M], + (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>; + +// Predicate logical, flag setting +def : InstRW<[V3AEWrite_1or2c_1M], + (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>; + +// Predicate reverse +def : InstRW<[V3AEWrite_2c_1M], (instregex "^REV_PP_[BHSD]")>; + +// Predicate select +def : InstRW<[V3AEWrite_1c_1M], (instrs SEL_PPPP)>; + +// Predicate set +def : InstRW<[V3AEWrite_2c_1M], (instregex "^PFALSE", "^PTRUE_[BHSD]")>; + +// Predicate set/initialize, set flags +def : InstRW<[V3AEWrite_2c_1M], (instregex "^PTRUES_[BHSD]")>; + +// Predicate find first/next +def : InstRW<[V3AEWrite_2c_1M], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>; + +// Predicate test +def : InstRW<[V3AEWrite_1c_1M], (instrs PTEST_PP)>; + +// Predicate transpose +def : InstRW<[V3AEWrite_2c_1M], (instregex "^TRN[12]_PPP_[BHSD]")>; + +// Predicate unpack and widen +def : InstRW<[V3AEWrite_2c_1M], (instrs PUNPKHI_PP, PUNPKLO_PP)>; + +// Predicate zip/unzip +def : InstRW<[V3AEWrite_2c_1M], (instregex "^(ZIP|UZP)[12]_PPP_[BHSD]")>; + +// §3.25 SVE integer instructions +// ----------------------------------------------------------------------------- + +// Arithmetic, absolute diff +def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]", + "^[SU]ABD_ZPZZ_[BHSD]")>; + +// Arithmetic, absolute diff accum +def : InstRW<[V3AEWr_ZA, V3AERd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]")>; + +// Arithmetic, absolute diff accum long +def : InstRW<[V3AEWr_ZA, V3AERd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>; + +// Arithmetic, absolute diff long +def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>; + +// 
Arithmetic, basic +def : InstRW<[V3AEWrite_2c_1V], + (instregex "^(ABS|ADD|CNOT|NEG|SUB|SUBR)_ZPmZ_[BHSD]", + "^(ADD|SUB)_ZZZ_[BHSD]", + "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]", + "^(ADD|SUB|SUBR)_ZI_[BHSD]", + "^ADR_[SU]XTW_ZZZ_D_[0123]", + "^ADR_LSL_ZZZ_[SD]_[0123]", + "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]", + "^SADDLBT_ZZZ_[HSD]", + "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]", + "^SSUBL(BT|TB)_ZZZ_[HSD]")>; + +// Arithmetic, complex +def : InstRW<[V3AEWrite_2c_1V], + (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]", + "^SQ(ABS|ADD|NEG|SUB|SUBR)_ZPmZ_[BHSD]", + "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]", + "^[SU]Q(ADD|SUB)_ZI_[BHSD]", + "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]", + "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>; + +// Arithmetic, large integer +def : InstRW<[V3AEWrite_2c_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>; + +// Arithmetic, pairwise add +def : InstRW<[V3AEWrite_2c_1V], (instregex "^ADDP_ZPmZ_[BHSD]")>; + +// Arithmetic, pairwise add and accum long +def : InstRW<[V3AEWr_ZPA, ReadDefault, V3AERd_ZPA], + (instregex "^[SU]ADALP_ZPmZ_[HSD]")>; + +// Arithmetic, shift +def : InstRW<[V3AEWrite_2c_1V1], + (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]", + "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]", + "^(ASR|LSL|LSR)_ZPmI_[BHSD]", + "^(ASR|LSL|LSR)_ZPmZ_[BHSD]", + "^(ASR|LSL|LSR)_ZZI_[BHSD]", + "^(ASR|LSL|LSR)_ZPZ[IZ]_[BHSD]", + "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>; + +// Arithmetic, shift and accumulate +def : InstRW<[V3AEWr_ZSA, V3AERd_ZSA], (instregex "^[SU]R?SRA_ZZI_[BHSD]")>; + +// Arithmetic, shift by immediate +def : InstRW<[V3AEWrite_2c_1V], (instregex "^SHRN[BT]_ZZI_[BHS]", + "^[SU]SHLL[BT]_ZZI_[HSD]")>; + +// Arithmetic, shift by immediate and insert +def : InstRW<[V3AEWrite_2c_1V], (instregex "^(SLI|SRI)_ZZI_[BHSD]")>; + +// Arithmetic, shift complex +def : InstRW<[V3AEWrite_4c_1V], + (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]", + "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_ZPmZ_[BHSD]", + "^[SU]QR?SHL_ZPZZ_[BHSD]", + "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]", + "^SQSHRU?N[BT]_ZZI_[BHS]", + "^UQR?SHRN[BT]_ZZI_[BHS]")>; + +// Arithmetic, shift right for divide +def : InstRW<[V3AEWrite_4c_1V], (instregex "^ASRD_(ZPmI|ZPZI)_[BHSD]")>; + +// Arithmetic, shift rounding +def : InstRW<[V3AEWrite_4c_1V], (instregex "^[SU]RSHLR?_ZPmZ_[BHSD]", + "^[SU]RSHL_ZPZZ_[BHSD]", + "^[SU]RSHR_(ZPmI|ZPZI)_[BHSD]")>; + +// Bit manipulation +def : InstRW<[V3AEWrite_6c_2V1], (instregex "^(BDEP|BEXT|BGRP)_ZZZ_[BHSD]")>; + +// Bitwise select +def : InstRW<[V3AEWrite_2c_1V], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>; + +// Count/reverse bits +def : InstRW<[V3AEWrite_2c_1V], (instregex "^(CLS|CLZ|CNT|RBIT)_ZPmZ_[BHSD]")>; + +// Broadcast logical bitmask immediate to vector +def : InstRW<[V3AEWrite_2c_1V], (instrs DUPM_ZI)>; + +// Compare and set flags +def : InstRW<[V3AEWrite_2or3c_1V0], + (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]", + "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>; + +// Complex add +def : InstRW<[V3AEWrite_2c_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]")>; + +// Complex dot product 8-bit element +def : InstRW<[V3AEWr_ZDOTB, V3AERd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>; + +// Complex dot product 16-bit element +def : InstRW<[V3AEWr_ZDOTH, V3AERd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>; + +// Complex multiply-add B, H, S element size +def : InstRW<[V3AEWr_ZCMABHS, V3AERd_ZCMABHS], (instregex "^CMLA_ZZZ_[BHS]", + "^CMLA_ZZZI_[HS]")>; + +// Complex multiply-add D element size +def : InstRW<[V3AEWr_ZCMAD, V3AERd_ZCMAD], (instrs CMLA_ZZZ_D)>; + +// Conditional extract operations, scalar form +def : 
InstRW<[V3AEWrite_8c_1M0_1V], (instregex "^CLAST[AB]_RPZ_[BHSD]")>; + +// Conditional extract operations, SIMD&FP scalar and vector forms +def : InstRW<[V3AEWrite_3c_1V1], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]", + "^COMPACT_ZPZ_[SD]", + "^SPLICE_ZPZZ?_[BHSD]")>; + +// Convert to floating point, 64b to float or convert to double +def : InstRW<[V3AEWrite_3c_1V0], (instregex "^[SU]CVTF_ZPmZ_Dto[HSD]", + "^[SU]CVTF_ZPmZ_StoD")>; + +// Convert to floating point, 32b to single or half +def : InstRW<[V3AEWrite_4c_2V0], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>; + +// Convert to floating point, 16b to half +def : InstRW<[V3AEWrite_6c_4V0], (instregex "^[SU]CVTF_ZPmZ_HtoH")>; + +// Copy, scalar +def : InstRW<[V3AEWrite_5c_1M0_1V], (instregex "^CPY_ZPmR_[BHSD]")>; + +// Copy, scalar SIMD&FP or imm +def : InstRW<[V3AEWrite_2c_1V], (instregex "^CPY_ZPm[IV]_[BHSD]", + "^CPY_ZPzI_[BHSD]")>; + +// Divides, 32 bit +def : InstRW<[V3AEWrite_12c_1V0], (instregex "^[SU]DIVR?_ZPmZ_S", + "^[SU]DIV_ZPZZ_S")>; + +// Divides, 64 bit +def : InstRW<[V3AEWrite_20c_1V0], (instregex "^[SU]DIVR?_ZPmZ_D", + "^[SU]DIV_ZPZZ_D")>; + +// Dot product, 8 bit +def : InstRW<[V3AEWr_ZDOTB, V3AERd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS")>; + +// Dot product, 8 bit, using signed and unsigned integers +def : InstRW<[V3AEWr_ZDOTB, V3AERd_ZDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; + +// Dot product, 16 bit +def : InstRW<[V3AEWr_ZDOTH, V3AERd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD")>; + +// Duplicate, immediate and indexed form +def : InstRW<[V3AEWrite_2c_1V], (instregex "^DUP_ZI_[BHSD]", + "^DUP_ZZI_[BHSDQ]")>; + +// Duplicate, scalar form +def : InstRW<[V3AEWrite_3c_1M0], (instregex "^DUP_ZR_[BHSD]")>; + +// Extend, sign or zero +def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]XTB_ZPmZ_[HSD]", + "^[SU]XTH_ZPmZ_[SD]", + "^[SU]XTW_ZPmZ_[D]")>; + +// Extract +def : InstRW<[V3AEWrite_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>; + +// Extract narrow saturating +def : InstRW<[V3AEWrite_4c_1V], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]", + "^SQXTUN[BT]_ZZ_[BHS]")>; + +// Extract operation, SIMD and FP scalar form +def : InstRW<[V3AEWrite_3c_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]")>; + +// Extract operation, scalar +def : InstRW<[V3AEWrite_6c_1V1_1M0], (instregex "^LAST[AB]_RPZ_[BHSD]")>; + +// Histogram operations +def : InstRW<[V3AEWrite_2c_1V], (instregex "^HISTCNT_ZPzZZ_[SD]", + "^HISTSEG_ZZZ")>; + +// Horizontal operations, B, H, S form, immediate operands only +def : InstRW<[V3AEWrite_4c_1V0], (instregex "^INDEX_II_[BHS]")>; + +// Horizontal operations, B, H, S form, scalar and/or immediate operands +def : InstRW<[V3AEWrite_7c_1M0_1V0], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>; + +// Horizontal operations, D form, immediate operands only +def : InstRW<[V3AEWrite_5c_2V0], (instrs INDEX_II_D)>; + +// Horizontal operations, D form, scalar and/or immediate operands +def : InstRW<[V3AEWrite_8c_2M0_2V0], (instregex "^INDEX_(IR|RI|RR)_D")>; + +// Insert operation, SIMD and FP scalar form +def : InstRW<[V3AEWrite_2c_1V], (instregex "^INSR_ZV_[BHSD]")>; + +// Insert operation, scalar +def : InstRW<[V3AEWrite_5c_1V1_1M0], (instregex "^INSR_ZR_[BHSD]")>; + +// Logical +def : InstRW<[V3AEWrite_2c_1V], + (instregex "^(AND|EOR|ORR)_ZI", + "^(AND|BIC|EOR|ORR)_ZZZ", + "^EOR(BT|TB)_ZZZ_[BHSD]", + "^(AND|BIC|EOR|NOT|ORR)_(ZPmZ|ZPZZ)_[BHSD]", + "^NOT_ZPmZ_[BHSD]")>; + +// Max/min, basic and pairwise +def : InstRW<[V3AEWrite_2c_1V], (instregex
"^[SU](MAX|MIN)_ZI_[BHSD]", + "^[SU](MAX|MIN)P?_ZPmZ_[BHSD]", + "^[SU](MAX|MIN)_ZPZZ_[BHSD]")>; + +// Matching operations +// FIXME: SOG p. 44, n. 5: If the consuming instruction has a flag source, the +// latency for this instruction is 4 cycles. +def : InstRW<[V3AEWrite_2or3c_1V0_1M], (instregex "^N?MATCH_PPzZZ_[BH]")>; + +// Matrix multiply-accumulate +def : InstRW<[V3AEWr_ZMMA, V3AERd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; + +// Move prefix +def : InstRW<[V3AEWrite_2c_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]", + "^MOVPRFX_ZZ")>; + +// Multiply, B, H, S element size +def : InstRW<[V3AEWrite_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]", + "^MUL_ZPZZ_[BHS]", + "^[SU]MULH_(ZPmZ|ZZZ)_[BHS]", + "^[SU]MULH_ZPZZ_[BHS]")>; + +// Multiply, D element size +def : InstRW<[V3AEWrite_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D", + "^MUL_ZPZZ_D", + "^[SU]MULH_(ZPmZ|ZZZ)_D", + "^[SU]MULH_ZPZZ_D")>; + +// Multiply long +def : InstRW<[V3AEWrite_4c_1V0], (instregex "^[SU]MULL[BT]_ZZZI_[SD]", + "^[SU]MULL[BT]_ZZZ_[HSD]")>; + +// Multiply accumulate, B, H, S element size +def : InstRW<[V3AEWr_ZMABHS, V3AERd_ZMABHS], + (instregex "^ML[AS]_ZZZI_[HS]", "^ML[AS]_ZPZZZ_[BHS]")>; +def : InstRW<[V3AEWr_ZMABHS, ReadDefault, V3AERd_ZMABHS], + (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>; + +// Multiply accumulate, D element size +def : InstRW<[V3AEWr_ZMAD, V3AERd_ZMAD], + (instregex "^ML[AS]_ZZZI_D", "^ML[AS]_ZPZZZ_D")>; +def : InstRW<[V3AEWr_ZMAD, ReadDefault, V3AERd_ZMAD], + (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>; + +// Multiply accumulate long +def : InstRW<[V3AEWr_ZMAL, V3AERd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]", + "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>; + +// Multiply accumulate saturating doubling long regular +def : InstRW<[V3AEWr_ZMASQL, V3AERd_ZMASQ], + (instregex "^SQDML[AS]L(B|T|BT)_ZZZ_[HSD]", + "^SQDML[AS]L[BT]_ZZZI_[SD]")>; + +// Multiply saturating doubling high, B, H, S element size +def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SQDMULH_ZZZ_[BHS]", + "^SQDMULH_ZZZI_[HS]")>; + +// Multiply saturating doubling high, D element size +def : InstRW<[V3AEWrite_5c_2V0], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>; + +// Multiply saturating doubling long +def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SQDMULL[BT]_ZZZ_[HSD]", + "^SQDMULL[BT]_ZZZI_[SD]")>; + +// Multiply saturating rounding doubling regular/complex accumulate, B, H, S +// element size +def : InstRW<[V3AEWr_ZMASQBHS, V3AERd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]", + "^SQRDCMLAH_ZZZ_[BHS]", + "^SQRDML[AS]H_ZZZI_[HS]", + "^SQRDCMLAH_ZZZI_[HS]")>; + +// Multiply saturating rounding doubling regular/complex accumulate, D element +// size +def : InstRW<[V3AEWr_ZMASQD, V3AERd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D", + "^SQRDCMLAH_ZZZ_D")>; + +// Multiply saturating rounding doubling regular/complex, B, H, S element size +def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SQRDMULH_ZZZ_[BHS]", + "^SQRDMULH_ZZZI_[HS]")>; + +// Multiply saturating rounding doubling regular/complex, D element size +def : InstRW<[V3AEWrite_5c_2V0], (instregex "^SQRDMULH_ZZZI?_D")>; + +// Multiply/multiply long, (8x8) polynomial +def : InstRW<[V3AEWrite_2c_1V], (instregex "^PMUL_ZZZ_B", + "^PMULL[BT]_ZZZ_[HDQ]")>; + +// Predicate counting vector +def : InstRW<[V3AEWrite_2c_1V], (instregex "^([SU]Q)?(DEC|INC)[HWD]_ZPiI")>; + +// Reciprocal estimate +def : InstRW<[V3AEWrite_4c_2V0], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>; + +// Reduction, arithmetic, B form +def : InstRW<[V3AEWrite_9c_2V_4V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>; + +// 
Reduction, arithmetic, H form +def : InstRW<[V3AEWrite_8c_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>; + +// Reduction, arithmetic, S form +def : InstRW<[V3AEWrite_6c_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>; + +// Reduction, arithmetic, D form +def : InstRW<[V3AEWrite_4c_2V], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>; + +// Reduction, logical +def : InstRW<[V3AEWrite_6c_1V_1V1], (instregex "^(AND|EOR|OR)V_VPZ_[BHSD]")>; + +// Reverse, vector +def : InstRW<[V3AEWrite_2c_1V], (instregex "^REV_ZZ_[BHSD]", + "^REVB_ZPmZ_[HSD]", + "^REVH_ZPmZ_[SD]", + "^REVW_ZPmZ_D")>; + +// Select, vector form +def : InstRW<[V3AEWrite_2c_1V], (instregex "^SEL_ZPZZ_[BHSD]")>; + +// Table lookup +def : InstRW<[V3AEWrite_2c_1V], (instregex "^TBL_ZZZZ?_[BHSD]")>; + +// Table lookup extension +def : InstRW<[V3AEWrite_2c_1V], (instregex "^TBX_ZZZ_[BHSD]")>; + +// Transpose, vector form +def : InstRW<[V3AEWrite_2c_1V], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>; + +// Unpack and extend +def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>; + +// Zip/unzip +def : InstRW<[V3AEWrite_2c_1V], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>; + +// §3.26 SVE floating-point instructions +// ----------------------------------------------------------------------------- + +// Floating point absolute value/difference +def : InstRW<[V3AEWrite_2c_1V], (instregex "^FAB[SD]_ZPmZ_[HSD]", + "^FABD_ZPZZ_[HSD]", + "^FABS_ZPmZ_[HSD]")>; + +// Floating point arithmetic +def : InstRW<[V3AEWrite_2c_1V], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]", + "^F(ADD|SUB)_ZPZ[IZ]_[HSD]", + "^FADDP_ZPmZZ_[HSD]", + "^FNEG_ZPmZ_[HSD]", + "^FSUBR_ZPm[IZ]_[HSD]", + "^FSUBR_(ZPZI|ZPZZ)_[HSD]")>; + +// Floating point associative add, F16 +def : InstRW<[V3AEWrite_10c_1V1_9rc], (instrs FADDA_VPZ_H)>; + +// Floating point associative add, F32 +def : InstRW<[V3AEWrite_6c_1V1_5rc], (instrs FADDA_VPZ_S)>; + +// Floating point associative add, F64 +def : InstRW<[V3AEWrite_4c_1V], (instrs FADDA_VPZ_D)>; + +// Floating point compare +def : InstRW<[V3AEWrite_2c_1V0], (instregex "^FACG[ET]_PPzZZ_[HSD]", + "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]", + "^FCM(LE|LT)_PPzZ0_[HSD]", + "^FCMUO_PPzZZ_[HSD]")>; + +// Floating point complex add +def : InstRW<[V3AEWrite_3c_1V], (instregex "^FCADD_ZPmZ_[HSD]")>; + +// Floating point complex multiply add +def : InstRW<[V3AEWr_ZFCMA, ReadDefault, V3AERd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>; +def : InstRW<[V3AEWr_ZFCMA, V3AERd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>; + +// Floating point convert, long or narrow (F16 to F32 or F32 to F16) +def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)", + "^FCVTLT_ZPmZ_HtoS", + "^FCVTNT_ZPmZ_StoH")>; + +// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32 +// or F64 to F16) +def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)", + "^FCVTLT_ZPmZ_StoD", + "^FCVTNT_ZPmZ_DtoS")>; + +// Floating point convert, round to odd +def : InstRW<[V3AEWrite_3c_1V0], (instrs FCVTX_ZPmZ_DtoS, FCVTXNT_ZPmZ_DtoS)>; + +// Floating point base2 log, F16 +def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>; + +// Floating point base2 log, F32 +def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>; + +// Floating point base2 log, F64 +def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>; + +// Floating point convert to integer, F16 +def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>; + +// Floating point convert to integer, F32 +def : InstRW<[V3AEWrite_4c_2V0], 
(instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>; + +// Floating point convert to integer, F64 +def : InstRW<[V3AEWrite_3c_1V0], + (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>; + +// Floating point copy +def : InstRW<[V3AEWrite_2c_1V], (instregex "^FCPY_ZPmI_[HSD]", + "^FDUP_ZI_[HSD]")>; + +// Floating point divide, F16 +def : InstRW<[V3AEWrite_13c_1V1_8rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>; + +// Floating point divide, F32 +def : InstRW<[V3AEWrite_11c_1V1_4rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>; + +// Floating point divide, F64 +def : InstRW<[V3AEWrite_14c_1V1_2rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>; + +// Floating point min/max pairwise +def : InstRW<[V3AEWrite_2c_1V], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>; + +// Floating point min/max +def : InstRW<[V3AEWrite_2c_1V], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]", + "^F(MAX|MIN)(NM)?_ZPZ[IZ]_[HSD]")>; + +// Floating point multiply +def : InstRW<[V3AEWrite_3c_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]", + "^FMULX_ZPZZ_[HSD]", + "^FMUL_(ZPm[IZ]|ZZZI?)_[HSD]", + "^FMUL_ZPZ[IZ]_[HSD]")>; + +// Floating point multiply accumulate +def : InstRW<[V3AEWr_ZFMA, ReadDefault, V3AERd_ZFMA], + (instregex "^FN?ML[AS]_ZPmZZ_[HSD]", + "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>; +def : InstRW<[V3AEWr_ZFMA, V3AERd_ZFMA], + (instregex "^FML[AS]_ZZZI_[HSD]", + "^FN?ML[AS]_ZPZZZ_[HSD]")>; + +// Floating point multiply add/sub accumulate long +def : InstRW<[V3AEWr_ZFMAL, V3AERd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>; + +// Floating point reciprocal estimate, F16 +def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>; + +// Floating point reciprocal estimate, F32 +def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FR(ECP|SQRT)E_ZZ_S", "^FRECPX_ZPmZ_S")>; + +// Floating point reciprocal estimate, F64 +def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FR(ECP|SQRT)E_ZZ_D", "^FRECPX_ZPmZ_D")>; + +// Floating point reciprocal step +def : InstRW<[V3AEWrite_4c_1V], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>; + +// Floating point reduction, F16 +def : InstRW<[V3AEWrite_8c_4V], + (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_H")>; + +// Floating point reduction, F32 +def : InstRW<[V3AEWrite_6c_3V], + (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_S")>; + +// Floating point reduction, F64 +def : InstRW<[V3AEWrite_4c_2V], + (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_D")>; + +// Floating point round to integral, F16 +def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>; + +// Floating point round to integral, F32 +def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>; + +// Floating point round to integral, F64 +def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>; + +// Floating point square root, F16 +def : InstRW<[V3AEWrite_13c_1V1_8rc], (instregex "^FSQRT_ZPmZ_H")>; + +// Floating point square root, F32 +def : InstRW<[V3AEWrite_11c_1V1_4rc], (instregex "^FSQRT_ZPmZ_S")>; + +// Floating point square root, F64 +def : InstRW<[V3AEWrite_14c_1V1_2rc], (instregex "^FSQRT_ZPmZ_D")>; + +// Floating point trigonometric exponentiation +def : InstRW<[V3AEWrite_3c_1V1], (instregex "^FEXPA_ZZ_[HSD]")>; + +// Floating point trigonometric multiply add +def : InstRW<[V3AEWrite_4c_1V], (instregex "^FTMAD_ZZI_[HSD]")>; + +// Floating point trigonometric, miscellaneous +def : InstRW<[V3AEWrite_3c_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]")>; + +// §3.27 SVE BFloat16 (BF16) instructions +// ----------------------------------------------------------------------------- + +// 
Convert, F32 to BF16 +def : InstRW<[V3AEWrite_4c_1V], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; + +// Dot product +def : InstRW<[V3AEWr_ZBFDOT, V3AERd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; + +// Matrix multiply accumulate +def : InstRW<[V3AEWr_ZBFMMA, V3AERd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>; + +// Multiply accumulate long +def : InstRW<[V3AEWr_ZBFMAL, V3AERd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZI?")>; + +// §3.28 SVE Load instructions +// ----------------------------------------------------------------------------- + +// Load vector +def : InstRW<[V3AEWrite_6c_1L], (instrs LDR_ZXI)>; + +// Load predicate +def : InstRW<[V3AEWrite_6c_1L_1M], (instrs LDR_PXI)>; + +// Contiguous load, scalar + imm +def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1[BHWD]_IMM$", + "^LD1S?B_[HSD]_IMM$", + "^LD1S?H_[SD]_IMM$", + "^LD1S?W_D_IMM$" )>; +// Contiguous load, scalar + scalar +def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1[BHWD]$", + "^LD1S?B_[HSD]$", + "^LD1S?H_[SD]$", + "^LD1S?W_D$" )>; + +// Contiguous load broadcast, scalar + imm +def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1R[BHWD]_IMM$", + "^LD1RS?B_[HSD]_IMM$", + "^LD1RS?H_[SD]_IMM$", + "^LD1RW_D_IMM$", + "^LD1RSW_IMM$", + "^LD1RQ_[BHWD]_IMM$")>; + +// Contiguous load broadcast, scalar + scalar +def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1RQ_[BHWD]$")>; + +// Non temporal load, scalar + imm +// Non temporal load, scalar + scalar +def : InstRW<[V3AEWrite_6c_1L], (instregex "^LDNT1[BHWD]_ZR[IR]$")>; + +// Non temporal gather load, vector + scalar 32-bit element size +def : InstRW<[V3AEWrite_9c_2L_4V], (instregex "^LDNT1[BHW]_ZZR_S$", + "^LDNT1S[BH]_ZZR_S$")>; + +// Non temporal gather load, vector + scalar 64-bit element size +def : InstRW<[V3AEWrite_9c_2L_2V], (instregex "^LDNT1S?[BHW]_ZZR_D$")>; +def : InstRW<[V3AEWrite_9c_2L_2V], (instrs LDNT1D_ZZR_D)>; + +// Contiguous first faulting load, scalar + scalar +def : InstRW<[V3AEWrite_6c_1L_1I], (instregex "^LDFF1[BHWD]$", + "^LDFF1S?B_[HSD]$", + "^LDFF1S?H_[SD]$", + "^LDFF1S?W_D$")>; + +// Contiguous non faulting load, scalar + imm +def : InstRW<[V3AEWrite_6c_1L], (instregex "^LDNF1[BHWD]_IMM$", + "^LDNF1S?B_[HSD]_IMM$", + "^LDNF1S?H_[SD]_IMM$", + "^LDNF1S?W_D_IMM$")>; + +// Contiguous Load two structures to two vectors, scalar + imm +def : InstRW<[V3AEWrite_8c_2L_2V], (instregex "^LD2[BHWD]_IMM$")>; + +// Contiguous Load two structures to two vectors, scalar + scalar +def : InstRW<[V3AEWrite_9c_2L_2V_2I], (instregex "^LD2[BHWD]$")>; + +// Contiguous Load three structures to three vectors, scalar + imm +def : InstRW<[V3AEWrite_9c_3L_3V], (instregex "^LD3[BHWD]_IMM$")>; + +// Contiguous Load three structures to three vectors, scalar + scalar +def : InstRW<[V3AEWrite_10c_3V_3L_3I], (instregex "^LD3[BHWD]$")>; + +// Contiguous Load four structures to four vectors, scalar + imm +def : InstRW<[V3AEWrite_9c_4L_8V], (instregex "^LD4[BHWD]_IMM$")>; + +// Contiguous Load four structures to four vectors, scalar + scalar +def : InstRW<[V3AEWrite_10c_4L_8V_4I], (instregex "^LD4[BHWD]$")>; + +// Gather load, vector + imm, 32-bit element size +def : InstRW<[V3AEWrite_9c_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_IMM$", + "^GLD(FF)?1W_IMM$")>; + +// Gather load, vector + imm, 64-bit element size +def : InstRW<[V3AEWrite_9c_1L_4V], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$", + "^GLD(FF)?1D_IMM$")>; + +// Gather load, 32-bit scaled offset +def : InstRW<[V3AEWrite_10c_1L_8V], + (instregex "^GLD(FF)?1S?H_S_[SU]XTW_SCALED$", + "^GLD(FF)?1W_[SU]XTW_SCALED")>; + +// Gather load, 64-bit scaled offset +// NOTE: These 
instructions are not specified in the SOG. +def : InstRW<[V3AEWrite_10c_1L_4V], + (instregex "^GLD(FF)?1S?[HW]_D_([SU]XTW_)?SCALED$", + "^GLD(FF)?1D_([SU]XTW_)?SCALED$")>; + +// Gather load, 32-bit unpacked unscaled offset +def : InstRW<[V3AEWrite_9c_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$", + "^GLD(FF)?1W_[SU]XTW$")>; + +// Gather load, 64-bit unpacked unscaled offset +// NOTE: These instructions are not specified in the SOG. +def : InstRW<[V3AEWrite_9c_1L_2V], + (instregex "^GLD(FF)?1S?[BHW]_D(_[SU]XTW)?$", + "^GLD(FF)?1D(_[SU]XTW)?$")>; + +// §3.29 SVE Store instructions +// ----------------------------------------------------------------------------- + +// Store from predicate reg +def : InstRW<[V3AEWrite_1c_1SA], (instrs STR_PXI)>; + +// Store from vector reg +def : InstRW<[V3AEWrite_2c_1SA_1V], (instrs STR_ZXI)>; + +// Contiguous store, scalar + imm +def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^ST1[BHWD]_IMM$", + "^ST1B_[HSD]_IMM$", + "^ST1H_[SD]_IMM$", + "^ST1W_D_IMM$")>; + +// Contiguous store, scalar + scalar +def : InstRW<[V3AEWrite_2c_1SA_1I_1V], (instregex "^ST1H(_[SD])?$")>; +def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^ST1[BWD]$", + "^ST1B_[HSD]$", + "^ST1W_D$")>; + +// Contiguous store two structures from two vectors, scalar + imm +def : InstRW<[V3AEWrite_4c_1SA_1V], (instregex "^ST2[BHWD]_IMM$")>; + +// Contiguous store two structures from two vectors, scalar + scalar +def : InstRW<[V3AEWrite_4c_2SA_2I_2V], (instrs ST2H)>; +def : InstRW<[V3AEWrite_4c_2SA_2V], (instregex "^ST2[BWD]$")>; + +// Contiguous store three structures from three vectors, scalar + imm +def : InstRW<[V3AEWrite_7c_9SA_9V], (instregex "^ST3[BHWD]_IMM$")>; + +// Contiguous store three structures from three vectors, scalar + scalar +def : InstRW<[V3AEWrite_7c_9SA_9I_9V], (instregex "^ST3[BHWD]$")>; + +// Contiguous store four structures from four vectors, scalar + imm +def : InstRW<[V3AEWrite_11c_18SA_18V], (instregex "^ST4[BHWD]_IMM$")>; + +// Contiguous store four structures from four vectors, scalar + scalar +def : InstRW<[V3AEWrite_11c_18SA_18I_18V], (instregex "^ST4[BHWD]$")>; + +// Non temporal store, scalar + imm +def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STNT1[BHWD]_ZRI$")>; + +// Non temporal store, scalar + scalar +def : InstRW<[V3AEWrite_2c_1SA_1I_1V], (instrs STNT1H_ZRR)>; +def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STNT1[BWD]_ZRR$")>; + +// Scatter non temporal store, vector + scalar 32-bit element size +def : InstRW<[V3AEWrite_4c_4SA_4V], (instregex "^STNT1[BHW]_ZZR_S")>; + +// Scatter non temporal store, vector + scalar 64-bit element size +def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^STNT1[BHWD]_ZZR_D")>; + +// Scatter store vector + imm 32-bit element size +def : InstRW<[V3AEWrite_4c_4SA_4V], (instregex "^SST1[BH]_S_IMM$", + "^SST1W_IMM$")>; + +// Scatter store vector + imm 64-bit element size +def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[BHW]_D_IMM$", + "^SST1D_IMM$")>; + +// Scatter store, 32-bit scaled offset +def : InstRW<[V3AEWrite_4c_4SA_4V], + (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>; + +// Scatter store, 32-bit unpacked unscaled offset +def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[BHW]_D_[SU]XTW$", + "^SST1D_[SU]XTW$")>; + +// Scatter store, 32-bit unpacked scaled offset +def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$", + "^SST1D_[SU]XTW_SCALED$")>; + +// Scatter store, 32-bit unscaled offset +def : InstRW<[V3AEWrite_4c_4SA_4V], (instregex "^SST1[BH]_S_[SU]XTW$", + "^SST1W_[SU]XTW$")>; + +// Scatter 
store, 64-bit scaled offset +def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[HW]_D_SCALED$", + "^SST1D_SCALED$")>; + +// Scatter store, 64-bit unscaled offset +def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[BHW]_D$", + "^SST1D$")>; + +// §3.30 SVE Miscellaneous instructions +// ----------------------------------------------------------------------------- + +// Read first fault register, unpredicated +def : InstRW<[V3AEWrite_2c_1M0], (instrs RDFFR_P)>; + +// Read first fault register, predicated +def : InstRW<[V3AEWrite_3or4c_1M0_1M], (instrs RDFFR_PPz)>; + +// Read first fault register and set flags +def : InstRW<[V3AEWrite_3or4c_1M0_1M], (instrs RDFFRS_PPz)>; + +// Set first fault register +// Write to first fault register +def : InstRW<[V3AEWrite_2c_1M0], (instrs SETFFR, WRFFR)>; + +// Prefetch +// NOTE: This is not specified in the SOG. +def : InstRW<[V3AEWrite_4c_1L], (instregex "^PRF[BHWD]")>; + +// §3.31 SVE Cryptographic instructions +// ----------------------------------------------------------------------------- + +// Crypto AES ops +def : InstRW<[V3AEWrite_2c_1V], (instregex "^AES[DE]_ZZZ_B$", + "^AESI?MC_ZZ_B$")>; + +// Crypto SHA3 ops +def : InstRW<[V3AEWrite_2c_1V], (instregex "^(BCAX|EOR3)_ZZZZ$", + "^RAX1_ZZZ_D$", + "^XAR_ZZZI_[BHSD]$")>; + +// Crypto SM4 ops +def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SM4E(KEY)?_ZZZ_S$")>; + +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 5b80b08..068954f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -764,8 +764,8 @@ bool AArch64PassConfig::addGlobalInstructionSelect() { } void AArch64PassConfig::addMachineSSAOptimization() { - if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None) - addPass(createMachineSMEABIPass()); + if (TM->getOptLevel() != CodeGenOptLevel::None && EnableNewSMEABILowering) + addPass(createMachineSMEABIPass(TM->getOptLevel())); if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt) addPass(createSMEPeepholeOptPass()); @@ -798,7 +798,7 @@ bool AArch64PassConfig::addILPOpts() { void AArch64PassConfig::addPreRegAlloc() { if (TM->getOptLevel() == CodeGenOptLevel::None && EnableNewSMEABILowering) - addPass(createMachineSMEABIPass()); + addPass(createMachineSMEABIPass(CodeGenOptLevel::None)); // Change dead register definitions to refer to the zero register. if (TM->getOptLevel() != CodeGenOptLevel::None && diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index 434ea67..7cb5003 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -121,8 +121,10 @@ struct InstInfo { /// Contains the needed ZA state for each instruction in a block. Instructions /// that do not require a ZA state are not recorded. struct BlockInfo { - ZAState FixedEntryState{ZAState::ANY}; SmallVector<InstInfo> Insts; + ZAState FixedEntryState{ZAState::ANY}; + ZAState DesiredIncomingState{ZAState::ANY}; + ZAState DesiredOutgoingState{ZAState::ANY}; LiveRegs PhysLiveRegsAtEntry = LiveRegs::None; LiveRegs PhysLiveRegsAtExit = LiveRegs::None; }; @@ -175,10 +177,15 @@ private: Register AgnosticZABufferPtr = AArch64::NoRegister; }; +/// Checks if \p State is a legal edge bundle state. For a state to be a legal +/// bundle state, it must be possible to transition from it to any other bundle +/// state without losing any ZA state. 
This is the case for ACTIVE/LOCAL_SAVED, +/// as you can transition between those states by saving/restoring ZA. The OFF +/// state would not be legal, as transitioning to it drops the content of ZA. static bool isLegalEdgeBundleZAState(ZAState State) { switch (State) { - case ZAState::ACTIVE: - case ZAState::LOCAL_SAVED: + case ZAState::ACTIVE: // ZA state is within the accumulator/ZT0. + case ZAState::LOCAL_SAVED: // ZA state is saved on the stack. return true; default: return false; @@ -238,7 +245,8 @@ getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI, struct MachineSMEABI : public MachineFunctionPass { inline static char ID = 0; - MachineSMEABI() : MachineFunctionPass(ID) {} + MachineSMEABI(CodeGenOptLevel OptLevel = CodeGenOptLevel::Default) + : MachineFunctionPass(ID), OptLevel(OptLevel) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -267,6 +275,11 @@ struct MachineSMEABI : public MachineFunctionPass { const EdgeBundles &Bundles, ArrayRef<ZAState> BundleStates); + /// Propagates desired states forwards (from predecessors -> successors) if + /// \p Forwards; otherwise, propagates backwards (from successors -> + /// predecessors). + void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true); + // Emission routines for private and shared ZA functions (using lazy saves). void emitNewZAPrologue(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); @@ -335,12 +348,15 @@ struct MachineSMEABI : public MachineFunctionPass { MachineBasicBlock::iterator MBBI, DebugLoc DL); private: + CodeGenOptLevel OptLevel = CodeGenOptLevel::Default; + MachineFunction *MF = nullptr; const AArch64Subtarget *Subtarget = nullptr; const AArch64RegisterInfo *TRI = nullptr; const AArch64FunctionInfo *AFI = nullptr; const TargetInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; + MachineLoopInfo *MLI = nullptr; }; static LiveRegs getPhysLiveRegs(LiveRegUnits const &LiveUnits) { @@ -422,12 +438,69 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { // Reverse vector (as we had to iterate backwards for liveness). std::reverse(Block.Insts.begin(), Block.Insts.end()); + + // Record the desired states on entry/exit of this block. These are the + // states that would not incur a state transition. + if (!Block.Insts.empty()) { + Block.DesiredIncomingState = Block.Insts.front().NeededState; + Block.DesiredOutgoingState = Block.Insts.back().NeededState; + } } return FunctionInfo{std::move(Blocks), AfterSMEProloguePt, PhysLiveRegsAfterSMEPrologue}; } +void MachineSMEABI::propagateDesiredStates(FunctionInfo &FnInfo, + bool Forwards) { + // If `Forwards`, this propagates desired states from predecessors to + // successors; otherwise, this propagates states from successors to + // predecessors. + auto GetBlockState = [](BlockInfo &Block, bool Incoming) -> ZAState & { + return Incoming ? Block.DesiredIncomingState : Block.DesiredOutgoingState; + }; + + SmallVector<MachineBasicBlock *> Worklist; + for (auto [BlockID, BlockInfo] : enumerate(FnInfo.Blocks)) { + if (!isLegalEdgeBundleZAState(GetBlockState(BlockInfo, Forwards))) + Worklist.push_back(MF->getBlockNumbered(BlockID)); + } + + while (!Worklist.empty()) { + MachineBasicBlock *MBB = Worklist.pop_back_val(); + BlockInfo &Block = FnInfo.Blocks[MBB->getNumber()]; + + // Pick a legal edge bundle state that matches the majority of + // predecessors/successors. + int StateCounts[ZAState::NUM_ZA_STATE] = {0}; + for (MachineBasicBlock *PredOrSucc : + Forwards ?
predecessors(MBB) : successors(MBB)) { + BlockInfo &PredOrSuccBlock = FnInfo.Blocks[PredOrSucc->getNumber()]; + ZAState ZAState = GetBlockState(PredOrSuccBlock, !Forwards); + if (isLegalEdgeBundleZAState(ZAState)) + StateCounts[ZAState]++; + } + + ZAState PropagatedState = ZAState(max_element(StateCounts) - StateCounts); + ZAState &CurrentState = GetBlockState(Block, Forwards); + if (PropagatedState != CurrentState) { + CurrentState = PropagatedState; + ZAState &OtherState = GetBlockState(Block, !Forwards); + // Propagate to the incoming/outgoing state if that is also "ANY". + if (OtherState == ZAState::ANY) + OtherState = PropagatedState; + // Push any successors/predecessors that may need updating to the + // worklist. + for (MachineBasicBlock *SuccOrPred : + Forwards ? successors(MBB) : predecessors(MBB)) { + BlockInfo &SuccOrPredBlock = FnInfo.Blocks[SuccOrPred->getNumber()]; + if (!isLegalEdgeBundleZAState(GetBlockState(SuccOrPredBlock, Forwards))) + Worklist.push_back(SuccOrPred); + } + } + } +} + /// Assigns each edge bundle a ZA state based on the needed states of blocks /// that have incoming or outgoing edges in that bundle. SmallVector<ZAState> @@ -440,40 +513,36 @@ MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles, // Attempt to assign a ZA state for this bundle that minimizes state // transitions. Edges within loops are given a higher weight as we assume // they will be executed more than once. - // TODO: We should propagate desired incoming/outgoing states through blocks - // that have the "ANY" state first to make better global decisions. int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0}; for (unsigned BlockID : Bundles.getBlocks(I)) { LLVM_DEBUG(dbgs() << "- bb." << BlockID); const BlockInfo &Block = FnInfo.Blocks[BlockID]; - if (Block.Insts.empty()) { - LLVM_DEBUG(dbgs() << " (no state preference)\n"); - continue; - } bool InEdge = Bundles.getBundle(BlockID, /*Out=*/false) == I; bool OutEdge = Bundles.getBundle(BlockID, /*Out=*/true) == I; - ZAState DesiredIncomingState = Block.Insts.front().NeededState; - if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) { - EdgeStateCounts[DesiredIncomingState]++; + bool LegalInEdge = + InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState); + bool LegalOutEdge = + OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState); + if (LegalInEdge) { LLVM_DEBUG(dbgs() << " DesiredIncomingState: " - << getZAStateString(DesiredIncomingState)); + << getZAStateString(Block.DesiredIncomingState)); + EdgeStateCounts[Block.DesiredIncomingState]++; } - ZAState DesiredOutgoingState = Block.Insts.back().NeededState; - if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) { - EdgeStateCounts[DesiredOutgoingState]++; + if (LegalOutEdge) { LLVM_DEBUG(dbgs() << " DesiredOutgoingState: " - << getZAStateString(DesiredOutgoingState)); + << getZAStateString(Block.DesiredOutgoingState)); + EdgeStateCounts[Block.DesiredOutgoingState]++; } + if (!LegalInEdge && !LegalOutEdge) + LLVM_DEBUG(dbgs() << " (no state preference)"); LLVM_DEBUG(dbgs() << '\n'); } ZAState BundleState = ZAState(max_element(EdgeStateCounts) - EdgeStateCounts); - // Force ZA to be active in bundles that don't have a preferred state. - // TODO: Something better here (to avoid extra mode switches).
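+ // Note on the selection idiom above (explanatory comment only): + // llvm::max_element returns an iterator into EdgeStateCounts, so subtracting + // the array base yields the index of the most frequent legal state, i.e. an + // argmax. For example, with counts {ACTIVE: 3, LOCAL_SAVED: 1} the bundle is + // assigned ZAState::ACTIVE.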
if (BundleState == ZAState::ANY) BundleState = ZAState::ACTIVE; @@ -918,6 +987,43 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) { getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles(); FunctionInfo FnInfo = collectNeededZAStates(SMEFnAttrs); + + if (OptLevel != CodeGenOptLevel::None) { + // Propagate desired states forwards, then backwards. Most of the propagation + // should be done in the forward step, and backwards propagation is then + // used to fill in the gaps. Note: Doing both in one step can give poor + // results. For example, consider this subgraph: + // + // ┌─────┐ + // ┌─┤ BB0 ◄───┐ + // │ └─┬───┘ │ + // │ ┌─▼───◄──┐│ + // │ │ BB1 │ ││ + // │ └─┬┬──┘ ││ + // │ │└─────┘│ + // │ ┌─▼───┐ │ + // │ │ BB2 ├───┘ + // │ └─┬───┘ + // │ ┌─▼───┐ + // └─► BB3 │ + // └─────┘ + // + // If: + // - "BB0" and "BB2" (outer loop) have no state preference + // - "BB1" (inner loop) desires the ACTIVE state on entry/exit + // - "BB3" desires the LOCAL_SAVED state on entry + // + // If we propagate forwards first, ACTIVE is propagated from BB1 to BB2, + // then from BB2 to BB0, which results in the inner and outer loops having + // the "ACTIVE" state. This avoids any state changes in the loops. + // + // If we propagate backwards first, we _could_ propagate LOCAL_SAVED from + // BB3 to BB0, which would result in a transition from ACTIVE -> LOCAL_SAVED + // in the outer loop. + for (bool Forwards : {true, false}) + propagateDesiredStates(FnInfo, Forwards); + } + SmallVector<ZAState> BundleStates = assignBundleZAStates(Bundles, FnInfo); EmitContext Context; @@ -941,4 +1047,6 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) { return true; } -FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); } +FunctionPass *llvm::createMachineSMEABIPass(CodeGenOptLevel OptLevel) { + return new MachineSMEABI(OptLevel); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index ce2b4a5..cd8b249 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -562,9 +562,13 @@ public: void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &); +extern char &AMDGPUUniformIntrinsicCombineLegacyPassID; +FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass(); + struct AMDGPUUniformIntrinsicCombinePass : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 0eb00cb..529da8d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr); struct ImageDimIntrinsicInfo { unsigned Intr; unsigned BaseOpcode; + unsigned AtomicNoRetBaseOpcode; MIMGDim Dim; uint8_t NumOffsetArgs; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 97c2c9c..9ce1224 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2006,19 +2006,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { MachineBasicBlock *MBB =
MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); + unsigned IntrOpcode = Intr->BaseOpcode; + + // For image atomic: use no-return opcode if result is unused. + if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) { + Register ResultDef = MI.getOperand(0).getReg(); + if (MRI->use_nodbg_empty(ResultDef)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; + } const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; - Register VDataIn, VDataOut; + Register VDataIn = AMDGPU::NoRegister; + Register VDataOut = AMDGPU::NoRegister; LLT VDataTy; int NumVDataDwords = -1; bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || @@ -2049,7 +2057,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned DMaskLanes = 0; if (BaseOpcode->Atomic) { - VDataOut = MI.getOperand(0).getReg(); + if (!BaseOpcode->NoReturn) + VDataOut = MI.getOperand(0).getReg(); VDataIn = MI.getOperand(2).getReg(); LLT Ty = MRI->getType(VDataIn); @@ -2099,8 +2108,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? 
AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index a6074ea..bf6f1a9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -30,7 +30,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) -MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS @@ -69,6 +68,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes", AMDGPUUnifyDivergentExitNodesPass()) FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass()) FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this))) +FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef FUNCTION_PASS #ifndef FUNCTION_ANALYSIS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6214f4d..75a94ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -619,6 +619,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR); + initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -887,9 +888,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); - - if (EnableUniformIntrinsicCombine) - PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( @@ -900,6 +898,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FPM.addPass(AMDGPUUseNativeCallsPass()); if (EnableLibCallSimplify) FPM.addPass(AMDGPUSimplifyLibCallsPass()); + + if (EnableUniformIntrinsicCombine) + FPM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerCGSCCOptimizerLateEPCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 50c78d8..65e6ed9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -16,12 +16,6 @@ /// uniformity. And every instruction that's downstream and cares about dynamic /// uniformity must be convergent (and isel will introduce v_readfirstlane for /// them if their operands can't be proven statically uniform). -/// -/// This pass is implemented as a ModulePass because intrinsic declarations -/// exist at the module scope, allowing us to skip processing entirely if no -/// declarations are present and to traverse their user lists directly when -/// they are. A FunctionPass would instead require scanning every instruction -/// in every function to find relevant intrinsics, which is far less efficient. 
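+/// +/// This pass now runs per function: it scans the function's instructions for +/// the handled intrinsic calls and rewrites them using that function's +/// UniformityInfo, which the pass preserves.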
//===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -97,14 +91,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, Tracker[NotOp] = true; // NOT preserves uniformity LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); ICmp->replaceAllUsesWith(NotOp); - ICmp->eraseFromParent(); Changed = true; } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { // Case: (icmp ne %ballot, 0) -> %ballot_arg LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " << *Src << '\n'); ICmp->replaceAllUsesWith(Src); - ICmp->eraseFromParent(); Changed = true; } } @@ -120,15 +112,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, return false; } -/// Iterates over intrinsic declarations in the module to optimize their uses. -static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { +/// Iterates over intrinsic calls in the Function to optimize. +static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) { bool IsChanged = false; ValueMap<const Value *, bool> Tracker; - FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - for (Function &F : M) { - switch (F.getIntrinsicID()) { + for (Instruction &I : make_early_inc_range(instructions(F))) { + auto *II = dyn_cast<IntrinsicInst>(&I); + if (!II) + continue; + + switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_permlane64: case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: @@ -137,23 +131,61 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { default: continue; } - - for (User *U : make_early_inc_range(F.users())) { - auto *II = cast<IntrinsicInst>(U); - Function *ParentF = II->getFunction(); - const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF); - IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); - } + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); } return IsChanged; } PreservedAnalyses -AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { - if (!runUniformIntrinsicCombine(M, AM)) +AMDGPUUniformIntrinsicCombinePass::run(Function &F, + FunctionAnalysisManager &AM) { + const auto &UI = AM.getResult<UniformityInfoAnalysis>(F); + if (!runUniformIntrinsicCombine(F, UI)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve<UniformityInfoAnalysis>(); return PA; } + +namespace { +class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass { +public: + static char ID; + AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) { + initializeAMDGPUUniformIntrinsicCombineLegacyPass( + *PassRegistry::getPassRegistry()); + } + +private: + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<UniformityInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); + } +}; +} // namespace + +char AMDGPUUniformIntrinsicCombineLegacy::ID = 0; +char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID = + AMDGPUUniformIntrinsicCombineLegacy::ID; + +bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + const UniformityInfo &UI = + getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); + return runUniformIntrinsicCombine(F, UI); +} + +INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) 
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) + +FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() { + return new AMDGPUUniformIntrinsicCombineLegacy(); +} diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a911e7e..52cc4ca 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -3267,29 +3267,103 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { return false; assert(!ST.hasExtendedWaitCounts()); - if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) + if (!ST.isWave64()) + return false; + + const bool IsSALU = SIInstrInfo::isSALU(*MI); + const bool IsVALU = SIInstrInfo::isVALU(*MI); + if (!IsSALU && !IsVALU) return false; // The hazard sequence is three instructions: // 1. VALU reads SGPR as mask - // 2. SALU writes SGPR - // 3. SALU reads SGPR - // The hazard can expire if the distance between 2 and 3 is sufficient. - // In practice this happens <10% of the time, hence this always assumes - // the hazard exists if 1 and 2 are present to avoid searching. + // 2. VALU/SALU writes SGPR + // 3. VALU/SALU reads SGPR + // The hazard can expire if the distance between 2 and 3 is sufficient, + // or (2) is VALU and (3) is SALU. + // In practice this happens <10% of the time, hence always assume the hazard + // exists if (1) and (2) are present to avoid searching all SGPR reads. - const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); - if (!SDSTOp || !SDSTOp->isReg()) - return false; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + auto IgnoreableSGPR = [](const Register Reg) { + switch (Reg) { + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::M0: + case AMDGPU::SGPR_NULL: + case AMDGPU::SGPR_NULL64: + case AMDGPU::SCC: + return true; + default: + return false; + } + }; + auto IsVCC = [](const Register Reg) { + return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI; + }; + + struct StateType { + SmallSet<Register, 2> HazardSGPRs; + + static unsigned getHashValue(const StateType &State) { + return hash_combine_range(State.HazardSGPRs); + } + static bool isEqual(const StateType &LHS, const StateType &RHS) { + return LHS.HazardSGPRs == RHS.HazardSGPRs; + } + }; + + SmallVector<const MachineInstr *> WaitInstrs; + bool HasSGPRRead = false; + StateType InitialState; + + // Look for SGPR write. + MachineOperand *HazardDef = nullptr; + for (MachineOperand &Op : MI->operands()) { + if (!Op.isReg()) + continue; + if (Op.isDef() && HazardDef) + continue; + + Register Reg = Op.getReg(); + if (IgnoreableSGPR(Reg)) + continue; + if (!IsVCC(Reg)) { + if (Op.isImplicit()) + continue; + if (!TRI->isSGPRReg(MRI, Reg)) + continue; + } + // Also check for SGPR reads. 
+ if (Op.isUse()) { + HasSGPRRead = true; + continue; + } + + assert(!HazardDef); + HazardDef = &Op; + } - const Register HazardReg = SDSTOp->getReg(); - if (HazardReg == AMDGPU::EXEC || - HazardReg == AMDGPU::EXEC_LO || - HazardReg == AMDGPU::EXEC_HI || - HazardReg == AMDGPU::M0) + if (!HazardDef) return false; - auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { + // Setup to track writes to individual SGPRs + const Register HazardReg = HazardDef->getReg(); + if (AMDGPU::SReg_32RegClass.contains(HazardReg)) { + InitialState.HazardSGPRs.insert(HazardReg); + } else { + assert(AMDGPU::SReg_64RegClass.contains(HazardReg)); + InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0)); + InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1)); + } + + auto IsHazardFn = [&](StateType &State, const MachineInstr &I) { + if (State.HazardSGPRs.empty()) + return HazardExpired; + switch (I.getOpcode()) { case AMDGPU::V_ADDC_U32_e32: case AMDGPU::V_ADDC_U32_dpp: @@ -3304,11 +3378,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { case AMDGPU::V_SUBB_U32_e32: case AMDGPU::V_SUBB_U32_dpp: case AMDGPU::V_SUBBREV_U32_e32: - case AMDGPU::V_SUBBREV_U32_dpp: + case AMDGPU::V_SUBBREV_U32_dpp: { // These implicitly read VCC as mask source. - return HazardReg == AMDGPU::VCC || - HazardReg == AMDGPU::VCC_LO || - HazardReg == AMDGPU::VCC_HI; + return IsVCC(HazardReg) ? HazardFound : NoHazardFound; + } case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_ADDC_U32_e64_dpp: case AMDGPU::V_CNDMASK_B16_t16_e64: @@ -3324,68 +3397,109 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { // Only check mask register overlaps. const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); assert(SSRCOp); - return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); + bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg); + return Result ? HazardFound : NoHazardFound; } default: - return false; + return NoHazardFound; } }; - const MachineRegisterInfo &MRI = MF.getRegInfo(); - auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { - // s_waitcnt_depctr sa_sdst(0) mitigates hazard. - if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) - return true; - - // VALU access to any SGPR or literal constant other than HazardReg - // mitigates hazard. No need to check HazardReg here as this will - // only be called when !IsHazardFn. - if (!SIInstrInfo::isVALU(I)) - return false; - for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { - const MachineOperand &Op = I.getOperand(OpNo); - if (Op.isReg()) { - Register OpReg = Op.getReg(); - // Only consider uses - if (!Op.isUse()) + const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst( + AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0), + 0); + auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) { + switch (I.getOpcode()) { + case AMDGPU::S_WAITCNT_DEPCTR: + // Record mergable waits within region of instructions free of SGPR reads. + if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() && + (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits) + WaitInstrs.push_back(&I); + break; + default: + // Update tracking of SGPR reads and writes. 
+ for (auto &Op : I.operands()) { + if (!Op.isReg()) continue; - // Ignore EXEC - if (OpReg == AMDGPU::EXEC || - OpReg == AMDGPU::EXEC_LO || - OpReg == AMDGPU::EXEC_HI) + + Register Reg = Op.getReg(); + if (IgnoreableSGPR(Reg)) continue; - // Ignore all implicit uses except VCC - if (Op.isImplicit()) { - if (OpReg == AMDGPU::VCC || - OpReg == AMDGPU::VCC_LO || - OpReg == AMDGPU::VCC_HI) - return true; + if (!IsVCC(Reg)) { + if (Op.isImplicit()) + continue; + if (!TRI->isSGPRReg(MRI, Reg)) + continue; + } + if (Op.isUse()) { + HasSGPRRead = true; continue; } - if (TRI.isSGPRReg(MRI, OpReg)) - return true; - } else { - const MCInstrDesc &InstDesc = I.getDesc(); - const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; - if (!TII.isInlineConstant(Op, OpInfo)) - return true; + + // Stop tracking any SGPRs with writes on the basis that they will + // already have an appropriate wait inserted afterwards. + SmallVector<Register, 2> Found; + for (Register SGPR : State.HazardSGPRs) { + if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR)) + Found.push_back(SGPR); + } + for (Register SGPR : Found) + State.HazardSGPRs.erase(SGPR); } + break; } - return false; }; // Check for hazard - if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == - std::numeric_limits<int>::max()) + if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn, + MI->getParent(), + std::next(MI->getReverseIterator()))) return false; - auto NextMI = std::next(MI->getIterator()); + // Compute counter mask + unsigned DepCtr = + IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0) + : AMDGPU::DepCtr::encodeFieldVaSdst(0)) + : AMDGPU::DepCtr::encodeFieldSaSdst(0); + + // Try to merge previous waits into this one for regions with no SGPR reads. + if (!WaitInstrs.empty()) { + // Note: WaitInstrs contains const pointers, so walk backward from MI to + // obtain a mutable pointer to each instruction to be merged. + // This is expected to be a very short walk within the same block. + SmallVector<MachineInstr *> ToErase; + unsigned Found = 0; + for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(), + End = MI->getParent()->rend(); + Found < WaitInstrs.size() && It != End; ++It) { + MachineInstr *WaitMI = &*It; + // Find next wait instruction. + if (std::as_const(WaitMI) != WaitInstrs[Found]) + continue; + Found++; + unsigned WaitMask = WaitMI->getOperand(0).getImm(); + assert((WaitMask & ConstantMaskBits) == ConstantMaskBits); + DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask), + AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr))); + DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask), + AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr))); + DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask), + AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr))); + ToErase.push_back(WaitMI); + } + assert(Found == WaitInstrs.size()); + for (MachineInstr *WaitMI : ToErase) + WaitMI->eraseFromParent(); + } - // Add s_waitcnt_depctr sa_sdst(0) after SALU write. + // Add s_waitcnt_depctr after SGPR write. + auto NextMI = std::next(MI->getIterator()); auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + .addImm(DepCtr); // SALU write may be s_getpc in a bundle. 
updateGetPCBundle(NewMI); diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 5f6d742..d950131 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -877,69 +877,69 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> { } class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx6789 <op, !if(noRtn, (outs), (outs data_rc:$vdst)), dns> { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; } class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx90a <op, !if(noRtn, (outs), (outs getAlign2RegOp<data_rc>.ret:$vdst)), dns> { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; } class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX6GFX7", "")> { let AssemblerPredicate = isGFX6GFX7; } class MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX8", "")> { let AssemblerPredicate = isGFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx8; } class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX90A", "")> { let AssemblerPredicate = isGFX90APlus; let MIMGEncoding = MIMGEncGfx90a; } class MIMG_Atomic_gfx10<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX10", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let 
isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX10", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -950,24 +950,24 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, class MIMG_Atomic_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx11<op.GFX11, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -977,11 +977,11 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, } class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC, - int num_addrs, string renamed, bit enableDisasm = 0> - : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs, + int num_addrs, string renamed, bit noRtn = 0, bit enableDisasm = 0> + : VIMAGE_gfx12<op.GFX12, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX12", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, @@ -994,95 +994,96 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, RegisterOperand data_rc, bit enableDasm = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { let hasSideEffects = 1, // FIXME: remove this mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1, - FPAtomic = isFP in { + FPAtomic = isFP, IsAtomicNoRet = noRtn in { let VAddrDwords = 1 in { let ssamp = 0 in { if op.HAS_SI then { - def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_si : 
MIMG_Atomic_si <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_VI then { - def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; let hasPostISelHook = 1 in - def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX10M then { - def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed>; + def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed, noRtn>; } } let VAddrDwords = 2 in { let ssamp = 0 in { if op.HAS_SI then { - def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>; + def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, noRtn, 0>; } if op.HAS_VI then { - def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; - def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>; + def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; + def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, noRtn, 0>; } if op.HAS_GFX11 then { - def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>; + def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed>; + def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed, noRtn>; } } let VAddrDwords = 3 in { let ssamp = 0 in { if op.HAS_SI then { - def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>; + def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, noRtn, 0>; } if op.HAS_VI then { - def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; - def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>; + def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; + def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, noRtn, 0>; } if op.HAS_GFX11 then { - def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>; + def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed>; + def _V3_gfx12 : 
VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed, noRtn>; } } let VAddrDwords = 4 in { let ssamp = 0 in { if op.HAS_SI then { - def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>; + def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, noRtn, 0>; } if op.HAS_VI then { - def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; - def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>; + def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>; + def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>; + def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, enableDasm>; + def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, noRtn, enableDasm>; } } } @@ -1095,12 +1096,13 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, } } -multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, - string renamed = ""> { // 64-bit atomics - let IsAtomicRet = 1 in { +multiclass MIMG_Atomic_Base <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { // 64-bit atomics + let IsAtomicRet = !not(noRtn) in { def "" : MIMGBaseOpcode { let Atomic = 1; let AtomicX2 = isCmpSwap; + let NoReturn = noRtn; } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { @@ -1109,22 +1111,28 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, // Other variants are reconstructed by disassembler using dmask and tfe. 
if !not(isCmpSwap) then { let VDataDwords = 1 in - defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>; + defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, noRtn, renamed>; } let VDataDwords = 2 in - defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>; + defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, noRtn, renamed>; let VDataDwords = 3 in - defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>; + defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, noRtn, renamed>; if isCmpSwap then { let VDataDwords = 4 in - defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>; + defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, noRtn, renamed>; let VDataDwords = 5 in - defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>; + defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, noRtn, renamed>; } } - } // End IsAtomicRet = 1 + } +} + +multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + string renamed = ""> { + defm "" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/0, renamed>; + defm "_NORTN" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/1, renamed>; } multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed, @@ -1820,6 +1828,7 @@ let SubtargetPredicate = isGFX12Plus in { class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { Intrinsic Intr = I; MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod)); + MIMGBaseOpcode AtomicNoRetBaseOpcode = BaseOpcode; AMDGPUDimProps Dim = I.P.Dim; AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>; @@ -1855,13 +1864,20 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0)); } +class ImageDimAtomicIntrinsicInfo<AMDGPUImageDimIntrinsic I> + : ImageDimIntrinsicInfo<I> { + MIMGBaseOpcode AtomicNoRetBaseOpcode = + !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod, "_NORTN")); +} + def ImageDimIntrinsicTable : GenericTable { let FilterClass = "ImageDimIntrinsicInfo"; - let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs", - "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd", - "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", + let Fields = ["Intr", "BaseOpcode", "AtomicNoRetBaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", + "NumVAddrs", "NumArgs", "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", + "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", "BiasTyArg", "GradientTyArg", "CoordTyArg"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; + string TypeOf_AtomicNoRetBaseOpcode = "MIMGBaseOpcode"; string TypeOf_Dim = "MIMGDim"; let PrimaryKey = ["Intr"]; @@ -1874,11 +1890,14 @@ def getImageDimIntrinsicByBaseOpcode : SearchIndex { let Key = ["BaseOpcode", "Dim"]; } -foreach intr = !listconcat(AMDGPUImageDimIntrinsics, - AMDGPUImageDimAtomicIntrinsics) in { +foreach intr = AMDGPUImageDimIntrinsics in { def : ImageDimIntrinsicInfo<intr>; } +foreach intr = AMDGPUImageDimAtomicIntrinsics in { 
+ def : ImageDimAtomicIntrinsicInfo<intr>; +} + // L to LZ Optimization Mapping def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>; def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index be42291..b34ab2a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9134,16 +9134,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDLoc DL(Op); MachineFunction &MF = DAG.getMachineFunction(); const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>(); + unsigned IntrOpcode = Intr->BaseOpcode; + // For image atomic: use no-return opcode if result is unused. + if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode && + !Op.getNode()->hasAnyUseOfValue(0)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); SmallVector<EVT, 3> ResultTypes(Op->values()); SmallVector<EVT, 3> OrigResultTypes(Op->values()); + if (BaseOpcode->NoReturn && BaseOpcode->Atomic) + ResultTypes.erase(&ResultTypes[0]); + bool IsD16 = false; bool IsG16 = false; bool IsA16 = false; @@ -9162,8 +9169,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, VData = Op.getOperand(2); IsAtomicPacked16Bit = - (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || - Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN); bool Is64Bit = VData.getValueSizeInBits() == 64; if (BaseOpcode->AtomicX2) { @@ -9173,7 +9182,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (Is64Bit) VData = DAG.getBitcast(MVT::v4i32, VData); - ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + if (!BaseOpcode->NoReturn) + ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + DMask = Is64Bit ? 0xf : 0x3; NumVDataDwords = Is64Bit ? 4 : 2; } else { @@ -9399,8 +9410,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? 
AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return Op; @@ -9512,13 +9524,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); } + if (BaseOpcode->NoReturn) { + if (BaseOpcode->Atomic) + return DAG.getMergeValues( + {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL); + + return SDValue(NewNode, 0); + } + if (BaseOpcode->AtomicX2) { SmallVector<SDValue, 1> Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); } - if (BaseOpcode->NoReturn) - return SDValue(NewNode, 0); + return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, NumVDataDwords, IsAtomicPacked16Bit, DL); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index d80a6f3..a6c1af2 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1823,6 +1823,16 @@ void SIRegisterInfo::buildSpillLoadStore( } } + Register FinalValueReg = ValueReg; + if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) { + // If we are loading a 16-bit value with SRAMECC enabled, we need a temp + // 32-bit VGPR to load and extract 16 bits into the final register. + ValueReg = + RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0); + SubReg = ValueReg; + IsKill = false; + } + MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); MachineMemOperand *NewMMO = MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, @@ -1863,6 +1873,17 @@ void SIRegisterInfo::buildSpillLoadStore( MIB.addImm(0); // swz MIB.addMemOperand(NewMMO); + if (FinalValueReg != ValueReg) { + // Extract the 16-bit value from the loaded 32-bit value. + ValueReg = getSubReg(ValueReg, AMDGPU::lo16); + MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64)) + .addReg(FinalValueReg, getDefRegState(true)) + .addImm(0) + .addReg(ValueReg, getKillRegState(true)) + .addImm(0); + ValueReg = FinalValueReg; + } + if (!IsAGPR && NeedSuperRegDef) MIB.addReg(ValueReg, RegState::ImplicitDefine); @@ -2505,7 +2526,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc; if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) { assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); - Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16; + Opc = ST.d16PreservesUnusedBits() + ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16 + : AMDGPU::SCRATCH_LOAD_USHORT_SADDR; } else { Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE ?
AMDGPU::SCRATCH_LOAD_BLOCK_SADDR diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 313ae3d..a4d3d62 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -601,10 +601,20 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); - if (!Subtarget->hasVFP2Base()) + if (!Subtarget->hasVFP2Base()) { setAllExpand(MVT::f32); - if (!Subtarget->hasFP64()) + } else { + for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, + ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) + setOperationAction(Op, MVT::f32, Legal); + } + if (!Subtarget->hasFP64()) { setAllExpand(MVT::f64); + } else { + for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, + ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) + setOperationAction(Op, MVT::f64, Legal); + } } if (Subtarget->hasFullFP16()) { @@ -1281,12 +1291,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, LibCall); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, LibCall); } // fp16 is a special v7 extension that adds f16 <-> f32 conversions. if (!Subtarget->hasFP16()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, LibCall); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, LibCall); } // Strict floating-point comparisons need custom lowering. @@ -1298,12 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - // Use __sincos_stret if available. - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); // FP-ARMv8 implements a lot of rounding-like FP operations. 
if (Subtarget->hasFPARMv8Base()) { @@ -1337,31 +1347,42 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, } // FP16 often need to be promoted to call lib functions + // clang-format off if (Subtarget->hasFullFP16()) { - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); - setOperationAction(ISD::FSIN, MVT::f16, Promote); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FTAN, MVT::f16, Promote); - setOperationAction(ISD::FSINCOS, MVT::f16, Promote); - setOperationAction(ISD::FPOWI, MVT::f16, Promote); - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FEXP, MVT::f16, Promote); - setOperationAction(ISD::FEXP2, MVT::f16, Promote); - setOperationAction(ISD::FEXP10, MVT::f16, Promote); - setOperationAction(ISD::FLOG, MVT::f16, Promote); - setOperationAction(ISD::FLOG10, MVT::f16, Promote); - setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::LRINT, MVT::f16, Expand); setOperationAction(ISD::LROUND, MVT::f16, Expand); - - setOperationAction(ISD::FROUND, MVT::f16, Legal); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); - setOperationAction(ISD::FTRUNC, MVT::f16, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); - setOperationAction(ISD::FRINT, MVT::f16, Legal); - setOperationAction(ISD::FFLOOR, MVT::f16, Legal); - setOperationAction(ISD::FCEIL, MVT::f16, Legal); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); + + for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, + ISD::FCOS, ISD::FSIN, ISD::FSINCOS, + ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS, + ISD::FASIN, ISD::FATAN, ISD::FATAN2, + ISD::FCOSH, ISD::FSINH, ISD::FTANH, + ISD::FTAN, ISD::FEXP, ISD::FEXP2, + ISD::FEXP10, ISD::FLOG, ISD::FLOG2, + ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW, + ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN, + ISD::STRICT_FACOS, ISD::STRICT_FASIN, ISD::STRICT_FATAN, + ISD::STRICT_FATAN2, ISD::STRICT_FCOSH, ISD::STRICT_FSINH, + ISD::STRICT_FTANH, ISD::STRICT_FEXP, ISD::STRICT_FEXP2, + ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, + ISD::STRICT_FTAN}) { + setOperationAction(Op, MVT::f16, Promote); + } + + // Round-to-integer ops need custom lowering for fp16, as Promote doesn't work + // because the result type is integer. + for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) + setOperationAction(Op, MVT::f16, Custom); + + for (auto Op : {ISD::FROUND, ISD::FROUNDEVEN, ISD::FTRUNC, + ISD::FNEARBYINT, ISD::FRINT, ISD::FFLOOR, + ISD::FCEIL, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN, + ISD::STRICT_FTRUNC, ISD::STRICT_FNEARBYINT, ISD::STRICT_FRINT, + ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL}) { + setOperationAction(Op, MVT::f16, Legal); + } + // clang-format on } if (Subtarget->hasNEON()) { @@ -9835,13 +9856,18 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { } SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin()); - // For iOS, we want to call an alternative entry point: __sincos_stret, // return values are passed via sret.
SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + assert(Subtarget->isTargetDarwin()); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -9871,11 +9897,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { Args.emplace_back(Arg, ArgTy); - RTLIB::Libcall LC = - (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - CallingConv::ID CC = getLibcallCallingConv(LC); - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); + StringRef LibcallName = getLibcallImplName(SincosStret); + CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); + SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) @@ -10726,6 +10750,19 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerCMP(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: { + assert((Op.getOperand(1).getValueType() == MVT::f16 || + Op.getOperand(1).getValueType() == MVT::bf16) && + "Expected custom lowering of rounding operations only for f16"); + SDLoc DL(Op); + SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } } } diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 10d4cd5..f7176a6 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -473,15 +473,15 @@ def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs)>; // An 'fmul' node with a single use. let HasOneUse = 1 in -def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs)>; +def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (any_fmul node:$lhs, node:$rhs)>; // An 'fadd' node which checks for single non-hazardous use. -def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{ +def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(any_fadd node:$lhs, node:$rhs),[{ return hasNoVMLxHazardUse(N); }]>; // An 'fsub' node which checks for single non-hazardous use. 
-def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{ +def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(any_fsub node:$lhs, node:$rhs),[{ return hasNoVMLxHazardUse(N); }]>; diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index 6771106..e2cc97b 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -439,14 +439,14 @@ let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FP def VADDD : ADbI<0b11100, 0b11, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fadd DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPALU64]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VADDS : ASbIn<0b11100, 0b11, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fadd SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPALU32]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. @@ -457,21 +457,21 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VADDH : AHbI<0b11100, 0b11, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBD : ADbI<0b11100, 0b11, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fsub DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPALU64]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBS : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fsub SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPALU32]>{ // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
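The fadd/fsub/fmul/fdiv rewrites in these hunks all follow one pattern: the any_* selectors are PatFrags (declared in llvm/include/llvm/Target/TargetSelectionDAG.td) that match both the relaxed DAG node and its chained STRICT_ counterpart, so a single VFP instruction pattern now also covers the constrained FP operation. As a minimal sketch of where those STRICT_ nodes originate (the helper name below is illustrative, not part of this patch):

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // With the builder in FP-constrained mode, an ordinary CreateFAdd emits
  // llvm.experimental.constrained.fadd, which SelectionDAG lowers to an
  // ISD::STRICT_FADD node -- the form the any_fadd patterns above now match.
  static Value *emitStrictFAdd(IRBuilder<> &B, Value *L, Value *R) {
    B.setIsFPConstrained(true);
    B.setDefaultConstrainedRounding(RoundingMode::Dynamic);
    B.setDefaultConstrainedExcept(fp::ebStrict);
    return B.CreateFAdd(L, R);
  }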
@@ -482,42 +482,42 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VSUBH : AHbI<0b11100, 0b11, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVD : ADbI<0b11101, 0b00, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fdiv DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPDIV64]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVS : ASbI<0b11101, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fdiv SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPDIV32]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVH : AHbI<0b11101, 0b00, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPDIV32]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMULD : ADbI<0b11100, 0b10, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fmul DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMULS : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fmul SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. @@ -528,21 +528,21 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VMULH : AHbI<0b11100, 0b10, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULD : ADbI<0b11100, 0b10, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>, + [(set DPR:$Dd, (fneg (any_fmul DPR:$Dn, (f64 DPR:$Dm))))]>, Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULS : ASbI<0b11100, 0b10, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]>, + [(set SPR:$Sd, (fneg (any_fmul SPR:$Sn, SPR:$Sm)))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -553,7 +553,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VNMULH : AHbI<0b11100, 0b10, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fneg (fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>, + [(set (f16 HPR:$Sd), (fneg (any_fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; multiclass vsel_inst<string op, bits<2> opc, int CC> { @@ -587,7 +587,7 @@ defm VSELGE : vsel_inst<"ge", 0b10, 10>; defm VSELEQ : vsel_inst<"eq", 0b00, 0>; defm VSELVS : vsel_inst<"vs", 0b01, 6>; -multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { +multiclass vmaxmin_inst<string op, bit opc, PatFrags SD> { let DecoderNamespace = "VFPV8", PostEncoderMethod = "", isUnpredicable = 1, mayRaiseFPException = 1 in { def H : AHbInp<0b11101, 0b00, opc, @@ -610,8 +610,8 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { } } -defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>; -defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>; +defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, any_fmaxnum>; +defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, any_fminnum>; // Match reassociated forms only if not sign dependent rounding. def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), @@ -746,7 +746,7 @@ let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", "", - [(set DPR:$Dd, (fpextend SPR:$Sm))]>, + [(set DPR:$Dd, (any_fpextend SPR:$Sm))]>, Sched<[WriteFPCVT]> { // Instruction operands. bits<5> Dd; @@ -766,7 +766,7 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", "", - [(set SPR:$Sd, (fpround DPR:$Dm))]>, + [(set SPR:$Sd, (any_fpround DPR:$Dm))]>, Sched<[WriteFPCVT]> { // Instruction operands. 
bits<5> Sd; @@ -796,7 +796,7 @@ def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f32 (fpextend (f16 HPR:$Sm))), +def : FP16Pat<(f32 (any_fpextend (f16 HPR:$Sm))), (VCVTBHS (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>; def : FP16Pat<(f16_to_fp GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; @@ -808,16 +808,16 @@ def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f16 (fpround SPR:$Sm)), +def : FP16Pat<(f16 (any_fpround SPR:$Sm)), (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$Sm), HPR)>; def : FP16Pat<(fp_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$a), GPR))>; -def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), +def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane), (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTBSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; -def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), +def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane), (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTBSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), @@ -830,9 +830,9 @@ def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f32 (fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))), +def : FP16Pat<(f32 (any_fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))), (VCVTTHS (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane)))>; -def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))), +def : FP16Pat<(f32 (any_fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))), (VCVTTHS (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)), (SSubReg_f16_reg imm_odd:$lane)))>; @@ -844,12 +844,12 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sda, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), +def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_odd:$lane), (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; -def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), +def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_odd:$lane), (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTTSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), @@ -872,7 +872,7 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, let hasSideEffects = 0; } -def : FullFP16Pat<(f64 (fpextend (f16 HPR:$Sm))), +def : FullFP16Pat<(f64 (any_fpextend (f16 HPR:$Sm))), (VCVTBHD (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>, Requires<[HasFPARMv8, HasDPVFP]>; def : FP16Pat<(f64 (f16_to_fp GPR:$a)), @@ -898,7 +898,7 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, let hasSideEffects = 0; } -def : FullFP16Pat<(f16 (fpround DPR:$Dm)), +def : FullFP16Pat<(f16 (any_fpround DPR:$Dm)), (COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$Dm), HPR)>, Requires<[HasFPARMv8, HasDPVFP]>; def : FP16Pat<(fp_to_f16 
(f64 DPR:$a)), @@ -1007,41 +1007,41 @@ multiclass vcvt_inst<string opc, bits<2> rm, let Predicates = [HasFPARMv8] in { let Predicates = [HasFullFP16] in { - def : Pat<(i32 (fp_to_sint (node (f16 HPR:$a)))), + def : Pat<(i32 (any_fp_to_sint (node (f16 HPR:$a)))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"SH") (f16 HPR:$a)), GPR)>; - def : Pat<(i32 (fp_to_uint (node (f16 HPR:$a)))), + def : Pat<(i32 (any_fp_to_uint (node (f16 HPR:$a)))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"UH") (f16 HPR:$a)), GPR)>; } - def : Pat<(i32 (fp_to_sint (node SPR:$a))), + def : Pat<(i32 (any_fp_to_sint (node SPR:$a))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"SS") SPR:$a), GPR)>; - def : Pat<(i32 (fp_to_uint (node SPR:$a))), + def : Pat<(i32 (any_fp_to_uint (node SPR:$a))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"US") SPR:$a), GPR)>; } let Predicates = [HasFPARMv8, HasDPVFP] in { - def : Pat<(i32 (fp_to_sint (node (f64 DPR:$a)))), + def : Pat<(i32 (any_fp_to_sint (node (f64 DPR:$a)))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"SD") DPR:$a), GPR)>; - def : Pat<(i32 (fp_to_uint (node (f64 DPR:$a)))), + def : Pat<(i32 (any_fp_to_uint (node (f64 DPR:$a)))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"UD") DPR:$a), GPR)>; } } -defm VCVTA : vcvt_inst<"a", 0b00, fround>; +defm VCVTA : vcvt_inst<"a", 0b00, any_fround>; defm VCVTN : vcvt_inst<"n", 0b01>; -defm VCVTP : vcvt_inst<"p", 0b10, fceil>; -defm VCVTM : vcvt_inst<"m", 0b11, ffloor>; +defm VCVTP : vcvt_inst<"p", 0b10, any_fceil>; +defm VCVTM : vcvt_inst<"m", 0b11, any_ffloor>; def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, (outs DPR:$Dd), (ins DPR:$Dm), @@ -1103,9 +1103,9 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node, Requires<[HasFPARMv8,HasDPVFP]>; } -defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc, [], 0>; -defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint, [FPSCR_RM], 0>; -defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint, [FPSCR_RM], 1>; +defm VRINTZ : vrint_inst_zrx<"z", 0, 1, any_ftrunc, [], 0>; +defm VRINTR : vrint_inst_zrx<"r", 0, 0, any_fnearbyint, [FPSCR_RM], 0>; +defm VRINTX : vrint_inst_zrx<"x", 1, 0, any_frint, [FPSCR_RM], 1>; multiclass vrint_inst_anpm<string opc, bits<2> rm, SDPatternOperator node = null_frag> { @@ -1145,30 +1145,31 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm, Requires<[HasFPARMv8,HasDPVFP]>; } -defm VRINTA : vrint_inst_anpm<"a", 0b00, fround>; -defm VRINTN : vrint_inst_anpm<"n", 0b01, froundeven>; -defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>; -defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>; +defm VRINTA : vrint_inst_anpm<"a", 0b00, any_fround>; +defm VRINTN : vrint_inst_anpm<"n", 0b01, any_froundeven>; +defm VRINTP : vrint_inst_anpm<"p", 0b10, any_fceil>; +defm VRINTM : vrint_inst_anpm<"m", 0b11, any_ffloor>; + let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs DPR:$Dd), (ins DPR:$Dm), IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", "", - [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fsqrt (f64 DPR:$Dm)))]>, Sched<[WriteFPSQRT64]>; let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", "", - [(set SPR:$Sd, (fsqrt SPR:$Sm))]>, + [(set SPR:$Sd, (any_fsqrt SPR:$Sm))]>, Sched<[WriteFPSQRT32]>; let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs HPR:$Sd), (ins HPR:$Sm), IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm", - [(set (f16 HPR:$Sd), (fsqrt 
(f16 HPR:$Sm)))]>; + [(set (f16 HPR:$Sd), (any_fsqrt (f16 HPR:$Sm)))]>; let hasSideEffects = 0 in { let isMoveReg = 1 in { @@ -1509,10 +1510,10 @@ def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(f64 (sint_to_fp GPR:$a)), + def : VFPPat<(f64 (any_sint_to_fp GPR:$a)), (VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), + def : VFPPat<(f64 (any_sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOD (VLDRS addrmode5:$a))>; } @@ -1529,10 +1530,10 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f32 (any_sint_to_fp GPR:$a)), (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (any_sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOS (VLDRS addrmode5:$a))>; let mayRaiseFPException = 1 in @@ -1545,7 +1546,7 @@ def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f16 (any_sint_to_fp GPR:$a)), (VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>; let mayRaiseFPException = 1 in @@ -1558,10 +1559,10 @@ def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(f64 (uint_to_fp GPR:$a)), + def : VFPPat<(f64 (any_uint_to_fp GPR:$a)), (VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), + def : VFPPat<(f64 (any_uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOD (VLDRS addrmode5:$a))>; } @@ -1578,10 +1579,10 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f32 (any_uint_to_fp GPR:$a)), (VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (any_uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOS (VLDRS addrmode5:$a))>; let mayRaiseFPException = 1 in @@ -1594,7 +1595,7 @@ def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f16 (any_uint_to_fp GPR:$a)), (VUITOH (COPY_TO_REGCLASS GPR:$a, SPR))>; // FP -> Int: @@ -1669,12 +1670,12 @@ def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))), + def : VFPPat<(i32 (any_fp_to_sint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_sint_sat (f64 DPR:$a), i32)), (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; - def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (any_fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f64 DPR:$a), i32)), addrmode5:$ptr), (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; @@ -1693,12 +1694,12 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)), +def : VFPNoNEONPat<(i32 (any_fp_to_sint SPR:$a)), (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_sint_sat SPR:$a, i32)), (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; -def : 
VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))), +def : VFPNoNEONPat<(alignedstore32 (i32 (any_fp_to_sint (f32 SPR:$a))), addrmode5:$ptr), (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f32 SPR:$a), i32)), @@ -1715,7 +1716,7 @@ def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))), +def : VFPNoNEONPat<(i32 (any_fp_to_sint (f16 HPR:$a))), (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>; def : VFPPat<(i32 (fp_to_sint_sat (f16 HPR:$a), i32)), (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>; @@ -1730,12 +1731,12 @@ def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))), + def : VFPPat<(i32 (any_fp_to_uint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_uint_sat (f64 DPR:$a), i32)), (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; - def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (any_fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f64 DPR:$a), i32)), addrmode5:$ptr), (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; @@ -1754,12 +1755,12 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)), +def : VFPNoNEONPat<(i32 (any_fp_to_uint SPR:$a)), (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_uint_sat SPR:$a, i32)), (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; -def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))), +def : VFPNoNEONPat<(alignedstore32 (i32 (any_fp_to_uint (f32 SPR:$a))), addrmode5:$ptr), (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f32 SPR:$a), i32)), @@ -1776,7 +1777,7 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(i32 (fp_to_uint (f16 HPR:$a))), +def : VFPNoNEONPat<(i32 (any_fp_to_uint (f16 HPR:$a))), (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>; def : VFPPat<(i32 (fp_to_uint_sat (f16 HPR:$a), i32)), (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>; @@ -2320,13 +2321,13 @@ def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), // Match @llvm.fma.* intrinsics // (fma x, y, z) -> (vfms z, x, y) -def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)), +def : Pat<(f64 (any_fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)), (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)), +def : Pat<(f32 (any_fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)), (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))), +def : Pat<(f16 (any_fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))), (VFMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; @@ -2375,13 +2376,13 @@ def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), // Match @llvm.fma.* intrinsics // (fma (fneg x), y, z) -> (vfms z, x, y) -def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)), +def : Pat<(f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)), (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)), +def : Pat<(f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)), (VFMSS SPR:$Sdin, SPR:$Sn, 
SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))), +def : Pat<(f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))), (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; @@ -2427,23 +2428,23 @@ def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), // Match @llvm.fma.* intrinsics // (fneg (fma x, y, z)) -> (vfnma z, x, y) -def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), +def : Pat<(fneg (any_fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))), +def : Pat<(fneg (any_fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))), +def : Pat<(fneg (any_fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))), (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; // (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y) -def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), +def : Pat<(f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), +def : Pat<(f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), +def : Pat<(f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; @@ -2488,23 +2489,23 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), // Match @llvm.fma.* intrinsics // (fma x, y, (fneg z)) -> (vfnms z, x, y)) -def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), +def : Pat<(f64 (any_fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), +def : Pat<(f32 (any_fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), +def : Pat<(f16 (any_fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; // (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y) -def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), +def : Pat<(fneg (f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), +def : Pat<(fneg (f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))), +def : Pat<(fneg (f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))), (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 44c4830..7ae500a 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1058,6 +1058,16 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { IntrinArgIndex<0>, 
IntrinArgI8<WaveOpKind_Max>, IntrinArgI8<SignedOpKind_Unsigned> ]>, + IntrinSelect<int_dx_wave_reduce_min, + [ + IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Min>, + IntrinArgI8<SignedOpKind_Signed> + ]>, + IntrinSelect<int_dx_wave_reduce_umin, + [ + IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Min>, + IntrinArgI8<SignedOpKind_Unsigned> + ]>, ]; let arguments = [OverloadTy, Int8Ty, Int8Ty]; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index e46a393..8720460 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -904,6 +904,8 @@ public: case Intrinsic::dx_resource_casthandle: // NOTE: llvm.dbg.value is supported as is in DXIL. case Intrinsic::dbg_value: + // NOTE: llvm.assume is supported as is in DXIL. + case Intrinsic::assume: case Intrinsic::not_intrinsic: if (F.use_empty()) F.eraseFromParent(); diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index e7e7f2c..ce6e812 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -94,6 +94,8 @@ static bool checkWaveOps(Intrinsic::ID IID) { case Intrinsic::dx_wave_reduce_usum: case Intrinsic::dx_wave_reduce_max: case Intrinsic::dx_wave_reduce_umax: + case Intrinsic::dx_wave_reduce_min: + case Intrinsic::dx_wave_reduce_umin: return true; } } diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 1e4797b..cf8b833 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -36,9 +36,10 @@ using namespace llvm; using namespace llvm::dxil; namespace { -/// A simple Wrapper DiagnosticInfo that generates Module-level diagnostic -/// for TranslateMetadata pass -class DiagnosticInfoTranslateMD : public DiagnosticInfo { + +/// A simple wrapper of DiagnosticInfo that generates a module-level diagnostic +/// for the DXILValidateMetadata pass +class DiagnosticInfoValidateMD : public DiagnosticInfo { private: const Twine &Msg; const Module &Mod; @@ -47,9 +48,9 @@ public: /// \p M is the module for which the diagnostic is being emitted. \p Msg is /// the message to show. Note that this class does not copy this message, so /// this reference must be valid for the whole life time of the diagnostic.
- DiagnosticInfoTranslateMD(const Module &M, - const Twine &Msg LLVM_LIFETIME_BOUND, - DiagnosticSeverity Severity = DS_Error) + DiagnosticInfoValidateMD(const Module &M, + const Twine &Msg LLVM_LIFETIME_BOUND, + DiagnosticSeverity Severity = DS_Error) : DiagnosticInfo(DK_Unsupported, Severity), Msg(Msg), Mod(M) {} void print(DiagnosticPrinter &DP) const override { @@ -57,6 +58,16 @@ public: } }; +static void reportError(Module &M, Twine Message, + DiagnosticSeverity Severity = DS_Error) { + M.getContext().diagnose(DiagnosticInfoValidateMD(M, Message, Severity)); +} + +static void reportLoopError(Module &M, Twine Message, + DiagnosticSeverity Severity = DS_Error) { + reportError(M, Twine("Invalid \"llvm.loop\" metadata: ") + Message, Severity); +} + enum class EntryPropsTag { ShaderFlags = 0, GSState, @@ -314,25 +325,122 @@ static void translateBranchMetadata(Module &M, Instruction *BBTerminatorInst) { BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); } -static std::array<unsigned, 6> getCompatibleInstructionMDs(llvm::Module &M) { +// Determines if the metadata node will be compatible with DXIL's loop metadata +// representation. +// +// Reports an error for compatible metadata that is ill-formed. +static bool isLoopMDCompatible(Module &M, Metadata *MD) { + // DXIL only accepts the following loop hints: + std::array<StringLiteral, 3> ValidHintNames = {"llvm.loop.unroll.count", + "llvm.loop.unroll.disable", + "llvm.loop.unroll.full"}; + + MDNode *HintMD = dyn_cast<MDNode>(MD); + if (!HintMD || HintMD->getNumOperands() == 0) + return false; + + auto *HintStr = dyn_cast<MDString>(HintMD->getOperand(0)); + if (!HintStr) + return false; + + if (!llvm::is_contained(ValidHintNames, HintStr->getString())) + return false; + + auto ValidCountNode = [](MDNode *CountMD) -> bool { + if (CountMD->getNumOperands() == 2) + if (auto *Count = dyn_cast<ConstantAsMetadata>(CountMD->getOperand(1))) + if (isa<ConstantInt>(Count->getValue())) + return true; + return false; + }; + + if (HintStr->getString() == "llvm.loop.unroll.count") { + if (!ValidCountNode(HintMD)) { + reportLoopError(M, "\"llvm.loop.unroll.count\" must have 2 operands and " + "the second must be a constant integer"); + return false; + } + } else if (HintMD->getNumOperands() != 1) { + reportLoopError( + M, "\"llvm.loop.unroll.disable\" and \"llvm.loop.unroll.full\" " + "must be provided as a single-operand node"); + return false; + } + + return true; +} + +static void translateLoopMetadata(Module &M, Instruction *I, MDNode *BaseMD) { + // A distinct node has the self-referential form: !0 = !{ !0, ... } + auto IsDistinctNode = [](MDNode *Node) -> bool { + return Node && Node->getNumOperands() != 0 && Node == Node->getOperand(0); + }; + + // Set metadata to null to remove empty/ill-formed metadata from the instruction + if (BaseMD->getNumOperands() == 0 || !IsDistinctNode(BaseMD)) + return I->setMetadata("llvm.loop", nullptr); + + // It is valid to have a chain of self-referential loop metadata nodes, as + // below. We will collapse these into just one when we reconstruct the + // metadata. + // + // E.g.: + // !0 = !{!0, !1} + // !1 = !{!1, !2} + // !2 = !{!"llvm.loop.unroll.disable"} + // + // So, traverse down a potential self-referential chain + while (1 < BaseMD->getNumOperands() && + IsDistinctNode(dyn_cast<MDNode>(BaseMD->getOperand(1)))) + BaseMD = dyn_cast<MDNode>(BaseMD->getOperand(1)); + + // To reconstruct a distinct node we create a temporary node that we will + // then update to create a self-reference.
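(For illustration, a hand-written IR sketch that is not taken from the patch or its tests: the chain walk and rebuild turn a terminator carrying the chained loop ID

    br i1 %cond, label %header, label %exit, !llvm.loop !0
    ...
    !0 = distinct !{!0, !1}
    !1 = distinct !{!1, !2}
    !2 = !{!"llvm.loop.unroll.count", i32 4}

into one carrying a single fresh self-referential node, !3 = distinct !{!3, !2}; any chained operand that fails isLoopMDCompatible is dropped along the way.)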
+ llvm::TempMDTuple TempNode = llvm::MDNode::getTemporary(M.getContext(), {}); + SmallVector<Metadata *> CompatibleOperands = {TempNode.get()}; + + // Iterate and reconstruct the metadata nodes that contain any hints, + // stripping any unrecognized metadata. + ArrayRef<MDOperand> Operands = BaseMD->operands(); + for (auto &Op : Operands.drop_front()) + if (isLoopMDCompatible(M, Op.get())) + CompatibleOperands.push_back(Op.get()); + + if (2 < CompatibleOperands.size()) + reportLoopError(M, "Provided conflicting hints"); + + MDNode *CompatibleLoopMD = MDNode::get(M.getContext(), CompatibleOperands); + TempNode->replaceAllUsesWith(CompatibleLoopMD); + + I->setMetadata("llvm.loop", CompatibleLoopMD); +} + +using InstructionMDList = std::array<unsigned, 7>; + +static InstructionMDList getCompatibleInstructionMDs(llvm::Module &M) { return { M.getMDKindID("dx.nonuniform"), M.getMDKindID("dx.controlflow.hints"), M.getMDKindID("dx.precise"), llvm::LLVMContext::MD_range, - llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias}; + llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias, + M.getMDKindID("llvm.loop")}; } static void translateInstructionMetadata(Module &M) { // construct allowlist of valid metadata node kinds - std::array<unsigned, 6> DXILCompatibleMDs = getCompatibleInstructionMDs(M); + InstructionMDList DXILCompatibleMDs = getCompatibleInstructionMDs(M); + unsigned MDLoopKind = M.getContext().getMDKindID("llvm.loop"); for (Function &F : M) { for (BasicBlock &BB : F) { // This needs to be done first so that "hlsl.controlflow.hints" isn't - // removed in the whitelist below + // removed in the allow-list below if (auto *I = BB.getTerminator()) translateBranchMetadata(M, I); for (auto &I : make_early_inc_range(BB)) { + if (isa<BranchInst>(I)) + if (MDNode *LoopMD = I.getMetadata(MDLoopKind)) + translateLoopMetadata(M, &I, LoopMD); I.dropUnknownNonDebugMetadata(DXILCompatibleMDs); } } @@ -364,6 +472,16 @@ static void cleanModuleFlags(Module &M) { M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val); } +using GlobalMDList = std::array<StringLiteral, 7>; + +// The following are compatible with DXIL but not emitted by clang; they can +// be added when applicable: +// dx.typeAnnotations, dx.viewIDState, dx.dxrPayloadAnnotations +static GlobalMDList CompatibleNamedModuleMDs = { + "llvm.ident", "llvm.module.flags", "dx.resources", "dx.valver", + "dx.shaderModel", "dx.version", "dx.entryPoints", +}; + static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, DXILResourceTypeMap &DRTM, const ModuleShaderFlags &ShaderFlags, @@ -389,31 +507,23 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, uint64_t CombinedMask = ShaderFlags.getCombinedFlags(); EntryFnMDNodes.emplace_back( emitTopLevelLibraryNode(M, ResourceMD, CombinedMask)); - } else if (MMDI.EntryPropertyVec.size() > 1) { - M.getContext().diagnose(DiagnosticInfoTranslateMD( - M, "Non-library shader: One and only one entry expected")); - } + } else if (1 < MMDI.EntryPropertyVec.size()) + reportError(M, "Non-library shader: One and only one entry expected"); for (const EntryProperties &EntryProp : MMDI.EntryPropertyVec) { - const ComputedShaderFlags &EntrySFMask = - ShaderFlags.getFunctionFlags(EntryProp.Entry); - - // If ShaderProfile is Library, mask is already consolidated in the - // top-level library node. Hence it is not emitted.
uint64_t EntryShaderFlags = 0; if (MMDI.ShaderProfile != Triple::EnvironmentType::Library) { - EntryShaderFlags = EntrySFMask; - if (EntryProp.ShaderStage != MMDI.ShaderProfile) { - M.getContext().diagnose(DiagnosticInfoTranslateMD( - M, - "Shader stage '" + - Twine(getShortShaderStage(EntryProp.ShaderStage) + - "' for entry '" + Twine(EntryProp.Entry->getName()) + - "' different from specified target profile '" + - Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) + - "'")))); - } + EntryShaderFlags = ShaderFlags.getFunctionFlags(EntryProp.Entry); + if (EntryProp.ShaderStage != MMDI.ShaderProfile) + reportError( + M, "Shader stage '" + + Twine(getShortShaderStage(EntryProp.ShaderStage)) + + "' for entry '" + Twine(EntryProp.Entry->getName()) + + "' different from specified target profile '" + + Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) + + "'")); } + EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD, EntryShaderFlags, MMDI.ShaderProfile)); @@ -426,19 +536,17 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, cleanModuleFlags(M); - // dx.rootsignatures will have been parsed from its metadata form as its - // binary form as part of the RootSignatureAnalysisWrapper, so safely - // remove it as it is not recognized in DXIL - if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) - RootSignature->eraseFromParent(); + // Finally, strip all module metadata that is not explicitly specified in the + // allow-list + SmallVector<NamedMDNode *> ToStrip; - // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and - // causes all tests using the DXIL Validator to fail. - // - // This is a temporary fix and should be replaced with a allowlist once - // we have determined all metadata that the DXIL Validator allows - if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa")) - ErrNo->eraseFromParent(); + for (NamedMDNode &NamedMD : M.named_metadata()) + if (!NamedMD.getName().starts_with("llvm.dbg.") && + !llvm::is_contained(CompatibleNamedModuleMDs, NamedMD.getName())) + ToStrip.push_back(&NamedMD); + + for (NamedMDNode *NamedMD : ToStrip) + NamedMD->eraseFromParent(); } PreservedAnalyses DXILTranslateMetadata::run(Module &M, @@ -454,45 +562,34 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M, return PreservedAnalyses::all(); } -namespace { -class DXILTranslateMetadataLegacy : public ModulePass { -public: - static char ID; // Pass identification, replacement for typeid - explicit DXILTranslateMetadataLegacy() : ModulePass(ID) {} - - StringRef getPassName() const override { return "DXIL Translate Metadata"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DXILResourceTypeWrapperPass>(); - AU.addRequired<DXILResourceWrapperPass>(); - AU.addRequired<ShaderFlagsAnalysisWrapper>(); - AU.addRequired<DXILMetadataAnalysisWrapperPass>(); - AU.addRequired<RootSignatureAnalysisWrapper>(); - - AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); - AU.addPreserved<DXILResourceBindingWrapperPass>(); - AU.addPreserved<DXILResourceWrapperPass>(); - AU.addPreserved<RootSignatureAnalysisWrapper>(); - AU.addPreserved<ShaderFlagsAnalysisWrapper>(); - } +void DXILTranslateMetadataLegacy::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DXILResourceTypeWrapperPass>(); + AU.addRequired<DXILResourceWrapperPass>(); + AU.addRequired<ShaderFlagsAnalysisWrapper>(); + AU.addRequired<DXILMetadataAnalysisWrapperPass>(); + AU.addRequired<RootSignatureAnalysisWrapper>(); + + 
AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); + AU.addPreserved<DXILResourceBindingWrapperPass>(); + AU.addPreserved<DXILResourceWrapperPass>(); + AU.addPreserved<RootSignatureAnalysisWrapper>(); + AU.addPreserved<ShaderFlagsAnalysisWrapper>(); +} - bool runOnModule(Module &M) override { - DXILResourceMap &DRM = - getAnalysis<DXILResourceWrapperPass>().getResourceMap(); - DXILResourceTypeMap &DRTM = - getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap(); - const ModuleShaderFlags &ShaderFlags = - getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags(); - dxil::ModuleMetadataInfo MMDI = - getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); - - translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI); - translateInstructionMetadata(M); - return true; - } -}; +bool DXILTranslateMetadataLegacy::runOnModule(Module &M) { + DXILResourceMap &DRM = + getAnalysis<DXILResourceWrapperPass>().getResourceMap(); + DXILResourceTypeMap &DRTM = + getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap(); + const ModuleShaderFlags &ShaderFlags = + getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags(); + dxil::ModuleMetadataInfo MMDI = + getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); -} // namespace + translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI); + translateInstructionMetadata(M); + return true; +} char DXILTranslateMetadataLegacy::ID = 0; diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h index 4c1ffac..cfb8aaa 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h @@ -10,6 +10,7 @@ #define LLVM_TARGET_DIRECTX_DXILTRANSLATEMETADATA_H #include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" namespace llvm { @@ -20,6 +21,22 @@ public: PreservedAnalyses run(Module &M, ModuleAnalysisManager &); }; +/// Wrapper pass for the legacy pass manager. +/// +/// This is required because the passes that will depend on this are codegen +/// passes which run through the legacy pass manager. 
+class DXILTranslateMetadataLegacy : public ModulePass { +public: + static char ID; // Pass identification, replacement for typeid + explicit DXILTranslateMetadataLegacy() : ModulePass(ID) {} + + StringRef getPassName() const override { return "DXIL Translate Metadata"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + bool runOnModule(Module &M) override; +}; + } // namespace llvm #endif // LLVM_TARGET_DIRECTX_DXILTRANSLATEMETADATA_H diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 68fd3e0..60dfd96 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -55,8 +55,10 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_splitdouble: case Intrinsic::dx_wave_readlane: case Intrinsic::dx_wave_reduce_max: + case Intrinsic::dx_wave_reduce_min: case Intrinsic::dx_wave_reduce_sum: case Intrinsic::dx_wave_reduce_umax: + case Intrinsic::dx_wave_reduce_umin: case Intrinsic::dx_wave_reduce_usum: case Intrinsic::dx_imad: case Intrinsic::dx_umad: diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 54c8972..0573f64 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -1061,8 +1061,11 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl, SDValue W0 = isUndef(PredV) ? DAG.getUNDEF(MVT::i64) : DAG.getNode(HexagonISD::P2D, dl, MVT::i64, PredV); - Words[IdxW].push_back(HiHalf(W0, DAG)); - Words[IdxW].push_back(LoHalf(W0, DAG)); + if (Bytes < BitBytes) { + Words[IdxW].push_back(HiHalf(W0, DAG)); + Words[IdxW].push_back(LoHalf(W0, DAG)); + } else + Words[IdxW].push_back(W0); while (Bytes < BitBytes) { IdxW ^= 1; @@ -1083,7 +1086,26 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl, Bytes *= 2; } + while (Bytes > BitBytes) { + IdxW ^= 1; + Words[IdxW].clear(); + + if (Bytes <= 4) { + for (const SDValue &W : Words[IdxW ^ 1]) { + SDValue T = contractPredicate(W, dl, DAG); + Words[IdxW].push_back(T); + } + } else { + for (const SDValue &W : Words[IdxW ^ 1]) { + Words[IdxW].push_back(W); + } + } + Bytes /= 2; + } + assert(Bytes == BitBytes); + if (BitBytes == 1 && PredTy == MVT::v2i1) + ByteTy = MVT::getVectorVT(MVT::i16, HwLen); SDValue Vec = ZeroFill ? getZero(dl, ByteTy, DAG) : DAG.getUNDEF(ByteTy); SDValue S4 = DAG.getConstant(HwLen-4, dl, MVT::i32); @@ -3157,6 +3179,9 @@ SDValue HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const { auto *MemN = cast<MemSDNode>(Op.getNode()); + if (!MemN->getMemoryVT().isSimple()) + return Op; + MVT MemTy = MemN->getMemoryVT().getSimpleVT(); if (!isHvxPairTy(MemTy)) return Op; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 4029e14..729c077 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -493,7 +493,7 @@ NVPTXTTIImpl::getInstructionCost(const User *U, // predicate ("@"). 
return !AsmInst.empty() && (AsmInst[0] == '@' || isAlpha(AsmInst[0]) || - AsmInst.find(".pragma") != StringRef::npos); + AsmInst.contains(".pragma")); }); return InstCount * TargetTransformInfo::TCC_Basic; } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 17f04d0..20fc849 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -138,6 +138,11 @@ static cl::opt<unsigned> PPCMinimumJumpTableEntries( "ppc-min-jump-table-entries", cl::init(64), cl::Hidden, cl::desc("Set minimum number of entries to use a jump table on PPC")); +static cl::opt<unsigned> PPCMinimumBitTestCmps( + "ppc-min-bit-test-cmps", cl::init(3), cl::Hidden, + cl::desc("Set the minimum number of comparisons needed to use a bit test " + "for a switch on PPC.")); + static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth( "ppc-gather-alias-max-depth", cl::init(18), cl::Hidden, cl::desc("max depth when checking alias info in GatherAllAliases()")); @@ -1436,6 +1441,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // Re-evaluate this value on future HWs that can do better with mtctr. setMinimumJumpTableEntries(PPCMinimumJumpTableEntries); + // The default minimum number of comparisons in a BitTest cluster is 3. + setMinimumBitTestCmps(PPCMinimumBitTestCmps); + setMinFunctionAlignment(Align(4)); setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32); diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index b0bed71c..da3efdc 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -194,6 +194,22 @@ class XX3Form_XTAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = XT{5}; } +class XForm_RBS5<bits<6> opCode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opCode, OOL, IOL, asmstr, itin> { + + bits<5> RB; + bits<5> RS; + + let Pattern = pattern; + + let Inst{6...10} = RS; + let Inst{11...15} = 0; + let Inst{16...20} = RB; + let Inst{21...30} = xo; + let Inst{31} = 0; +} + class XX3Form_XTAB6_S<bits<5> xo, dag OOL, dag IOL, string asmstr, list<dag> pattern> : I<59, OOL, IOL, asmstr, NoItinerary> { @@ -317,12 +333,16 @@ let Predicates = [IsISAFuture] in { def TLBIEIO : XForm_RSB5_UIMM2<31, 18, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC), "tlbieio $RB, $RS, $RIC", []>; + def MTLPL : XForm_RBS5<31, 275, (outs), (ins gprc:$RB, gprc:$RS), + "mtlpl $RB, $RS", IIC_SprMTSPR, []>; let Interpretation64Bit = 1, isCodeGenOnly = 1 in { def TLBIEP8 : XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC, u1imm:$PRS, u1imm:$R), "tlbiep $RB, $RS, $RIC, $PRS, $R", []>; + def MTLPL8 : XForm_RBS5<31, 275, (outs), (ins g8rc:$RB, g8rc:$RS), + "mtlpl $RB, $RS", IIC_SprMTSPR, []>, isPPC64; } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 41a9c92..96e8afc 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -823,6 +823,7 @@ static bool relaxableFixupNeedsRelocation(const MCFixupKind Kind) { break; case RISCV::fixup_riscv_rvc_jump: case RISCV::fixup_riscv_rvc_branch: + case RISCV::fixup_riscv_rvc_imm: case RISCV::fixup_riscv_jal: return false; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 6d587e6..5934c91 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -688,6 +688,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, // the `jal` again in the assembler. } else if (MIFrm == RISCVII::InstFormatCI) { FixupKind = RISCV::fixup_riscv_rvc_imm; + AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcili); } else if (MIFrm == RISCVII::InstFormatI) { FixupKind = RISCV::fixup_riscv_12_i; } else if (MIFrm == RISCVII::InstFormatQC_EB) { diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index 98b636e..9bd66a4 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -373,6 +373,26 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, .addReg(ScratchReg) .addImm(-1); break; + case AtomicRMWInst::Max: + BuildMI(LoopMBB, DL, TII->get(RISCV::MAX), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Min: + BuildMI(LoopMBB, DL, TII->get(RISCV::MIN), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::UMax: + BuildMI(LoopMBB, DL, TII->get(RISCV::MAXU), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::UMin: + BuildMI(LoopMBB, DL, TII->get(RISCV::MINU), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; } BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) .addReg(ScratchReg) @@ -682,6 +702,9 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width, MachineBasicBlock::iterator &NextMBBI) { + // Using MIN(U)/MAX(U) is preferable if permitted + if (STI->hasPermissiveZalrsc() && STI->hasStdExtZbb() && !IsMasked) + return expandAtomicBinOp(MBB, MBBI, BinOp, IsMasked, Width, NextMBBI); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 2754d78..b4556f6 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1906,6 +1906,25 @@ def FeatureForcedAtomics : SubtargetFeature< def HasAtomicLdSt : Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">; +// The RISC-V Unprivileged Architecture - ISA Volume 1 (Version: 20250508) +// [https://docs.riscv.org/reference/isa/_attachments/riscv-unprivileged.pdf] +// in section 13.3. Eventual Success of Store-Conditional Instructions, defines +// _constrained_ LR/SC loops: +// The dynamic code executed between the LR and SC instructions can only +// contain instructions from the base ''I'' instruction set, excluding loads, +// stores, backward jumps, taken backward branches, JALR, FENCE, and SYSTEM +// instructions. Compressed forms of the aforementioned ''I'' instructions in +// the Zca and Zcb extensions are also permitted. +// LR/SC loops that do not adhere to the above are _unconstrained_ LR/SC loops, +// and success is implementation specific. For implementations which know that +// non-base instructions (such as the ''B'' extension) will not violate any +// forward progress guarantees, using these instructions to reduce the LR/SC +// sequence length is desirable.
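(For illustration, a hand-written sketch that is not taken from the patch's tests: with +zbb and +permissive-zalrsc, an unmasked word-sized

    %old = atomicrmw max ptr %p, i32 %val seq_cst

now takes the expandAtomicBinOp path and becomes an LR/SC loop whose body is a single Zbb instruction, roughly

    .Lloop:
      lr.w.aqrl a2, (a0)
      max       a3, a2, a1
      sc.w.rl   a3, a3, (a0)
      bnez      a3, .Lloop

where the constrained form would instead synthesize the maximum from base-ISA compares and branches inside the loop.)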
+def FeaturePermissiveZalrsc + : SubtargetFeature< + "permissive-zalrsc", "HasPermissiveZalrsc", "true", + "Implementation permits non-base instructions between LR/SC pairs">; + def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "AllowTaggedGlobals", "true", "Use an instruction sequence for taking the address of a global " diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 6181abb..47022b3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -745,7 +745,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( .addDef(ResVReg) .addUse(getSPIRVTypeID(BaseType)) .addImm(static_cast<uint32_t>(Storage)); - if (Init != 0) + if (Init) MIB.addUse(Init->getOperand(0).getReg()); // ISel may introduce a new register on this step, so we need to add it to // DT and correct its type avoiding fails on the next stage. diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 021353a..3fea21e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -222,6 +222,9 @@ private: bool selectWaveReduceMax(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, bool IsUnsigned) const; + bool selectWaveReduceMin(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, bool IsUnsigned) const; + bool selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; @@ -2456,6 +2459,35 @@ bool SPIRVInstructionSelector::selectWaveReduceMax(Register ResVReg, .constrainAllUses(TII, TRI, RBI); } +bool SPIRVInstructionSelector::selectWaveReduceMin(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I, + bool IsUnsigned) const { + assert(I.getNumOperands() == 3); + assert(I.getOperand(2).isReg()); + MachineBasicBlock &BB = *I.getParent(); + Register InputRegister = I.getOperand(2).getReg(); + SPIRVType *InputType = GR.getSPIRVTypeForVReg(InputRegister); + + if (!InputType) + report_fatal_error("Input Type could not be determined."); + + SPIRVType *IntTy = GR.getOrCreateSPIRVIntegerType(32, I, TII); + // Retrieve the operation to use based on the input type + bool IsFloatTy = GR.isScalarOrVectorOfType(InputRegister, SPIRV::OpTypeFloat); + auto IntegerOpcodeType = + IsUnsigned ? SPIRV::OpGroupNonUniformUMin : SPIRV::OpGroupNonUniformSMin; + auto Opcode = IsFloatTy ?
SPIRV::OpGroupNonUniformFMin : IntegerOpcodeType; + return BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(GR.getOrCreateConstInt(SPIRV::Scope::Subgroup, I, IntTy, TII, + !STI.isShader())) + .addImm(SPIRV::GroupOperation::Reduce) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); +} + bool SPIRVInstructionSelector::selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { @@ -3431,6 +3463,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ true); case Intrinsic::spv_wave_reduce_max: return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ false); + case Intrinsic::spv_wave_reduce_umin: + return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ true); + case Intrinsic::spv_wave_reduce_min: + return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ false); case Intrinsic::spv_wave_reduce_sum: return selectWaveReduceSum(ResVReg, ResType, I); case Intrinsic::spv_wave_readlane: diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 3da720f..58109ac 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -8973,8 +8973,7 @@ SystemZTargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc, if (const auto *CB = dyn_cast<CallBase>(RHSVal)) { if (CB->isInlineAsm()) { const InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand()); - return IA && - IA->getConstraintString().find("{@cc}") != std::string::npos; + return IA && IA->getConstraintString().contains("{@cc}"); } } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 410f20e..624cff2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2572,11 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -12216,7 +12213,7 @@ static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8) : MVT::getVectorVT(ShiftSVT, Size / Scale); - return (int)ShiftAmt; + return ShiftAmt; }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just @@ -33067,26 +33064,30 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + bool isF64 = ArgVT == MVT::f64; + + RTLIB::Libcall LC = isF64 ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; + const char *LibcallName = TLI.getLibcallName(LC); + if (!LibcallName) + return SDValue(); + assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); // For MacOSX, we want to call an alternative entry point: __sincos_stret, // which returns the values as { float, float } (in XMM0) or // { double, double } (which is returned in XMM0, XMM1). SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; Args.emplace_back(Arg, ArgTy); - bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. For f32, // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); @@ -48786,6 +48787,11 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, } if (CC == X86::COND_E || CC == X86::COND_NE) { + // Canonicalize constant to RHS if we're just using ZF. + if (Op0 != Op1 && DAG.isConstantIntBuildVectorOrConstantInt(Op0) && + !DAG.isConstantIntBuildVectorOrConstantInt(Op1)) + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op0); + // TESTZ(X,~Y) == TESTC(Y,X) if (SDValue NotOp1 = IsNOT(Op1, DAG)) { CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE); @@ -48849,10 +48855,6 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, } } - // TESTZ(-1,X) == TESTZ(X,X) - if (ISD::isBuildVectorAllOnes(Op0.getNode())) - return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1); - // TESTZ(X,-1) == TESTZ(X,X) if (ISD::isBuildVectorAllOnes(Op1.getNode())) return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0); @@ -54634,6 +54636,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); SDLoc DL(N); // Attempt to pre-truncate inputs to arithmetic ops instead. @@ -54652,6 +54655,40 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget)) return V; + // Fold trunc(srl(load(p),amt)) -> load(p+amt/8) + // If we're shifting down byte aligned bit chunks from a larger load for + // truncation, see if we can convert the shift into a pointer offset instead. + // Limit this to normal (non-ext) scalar integer loads. + if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && + Src.hasOneUse() && Src.getOperand(0).hasOneUse() && + ISD::isNormalLoad(Src.getOperand(0).getNode())) { + auto *Ld = cast<LoadSDNode>(Src.getOperand(0)); + if (Ld->isSimple() && VT.isByteSized() && + isPowerOf2_64(VT.getSizeInBits())) { + SDValue ShAmt = Src.getOperand(1); + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + // Check the shift amount is byte aligned. + // Check the truncation doesn't use any shifted in (zero) top bits. 
+ if (KnownAmt.countMinTrailingZeros() >= 3 && + KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() - + VT.getSizeInBits())) { + EVT PtrVT = Ld->getBasePtr().getValueType(); + SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT); + SDValue PtrByteOfs = + DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs, + DAG.getShiftAmountConstant(3, PtrVT, DL)); + SDValue NewPtr = DAG.getMemBasePlusOffset( + Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); + SDValue NewLoad = + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), + Align(), Ld->getMemOperand()->getFlags()); + DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), + NewLoad.getValue(1)); + return NewLoad; + } + } + } + // The bitcast source is a direct mmx result. // Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { |

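(For illustration of the new trunc(srl(load)) combine above, a hand-written sketch that is not taken from the patch's tests:

    define i8 @hi_byte(ptr %p) {
      %w = load i64, ptr %p, align 8
      %s = lshr i64 %w, 56
      %t = trunc i64 %s to i8
      ret i8 %t
    }

The shift amount is byte-aligned (56 = 7 * 8) and the truncated value keeps no shifted-in zero bits (56 <= 64 - 8), so the srl folds into the pointer and the whole sequence lowers to a single byte load at offset 7, e.g. movzbl 7(%rdi), %eax on little-endian x86-64.)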