15 files changed, 5686 insertions, 91 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 8d0ff41..1396841 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -60,7 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 FunctionPass *createAArch64CollectLOHPass();
 FunctionPass *createSMEABIPass();
 FunctionPass *createSMEPeepholeOptPass();
-FunctionPass *createMachineSMEABIPass();
+FunctionPass *createMachineSMEABIPass(CodeGenOptLevel);
 ModulePass *createSVEIntrinsicOptsPass();
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index a4529a5..0f457c2 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -133,6 +133,8 @@ include "AArch64SchedNeoverseN2.td"
 include "AArch64SchedNeoverseN3.td"
 include "AArch64SchedNeoverseV1.td"
 include "AArch64SchedNeoverseV2.td"
+include "AArch64SchedNeoverseV3.td"
+include "AArch64SchedNeoverseV3AE.td"
 include "AArch64SchedOryon.td"
 
 include "AArch64Processors.td"
diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
index 137ff89..f13554f 100644
--- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
+++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
@@ -47,6 +47,8 @@ public:
   StringRef getPassName() const override { return AARCH64_BRANCH_TARGETS_NAME; }
 
 private:
+  const AArch64Subtarget *Subtarget;
+
   void addBTI(MachineBasicBlock &MBB, bool CouldCall, bool CouldJump,
               bool NeedsWinCFI);
 };
@@ -75,6 +77,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
                     << "********** Function: " << MF.getName() << '\n');
   const Function &F = MF.getFunction();
 
+  Subtarget = &MF.getSubtarget<AArch64Subtarget>();
+
   // LLVM does not consider basic blocks which are the targets of jump tables
   // to be address-taken (the address can't escape anywhere else), but they are
   // used for indirect branches, so need BTI instructions.
@@ -100,9 +104,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
     // a BTI, and pointing the indirect branch at that. For non-ELF targets we
     // can't rely on that, so we assume that `CouldCall` is _always_ true due
     // to the risk of long-branch thunks at link time.
-    if (&MBB == &*MF.begin() &&
-        (!MF.getSubtarget<AArch64Subtarget>().isTargetELF() ||
-         (F.hasAddressTaken() || !F.hasLocalLinkage())))
+    if (&MBB == &*MF.begin() && (!Subtarget->isTargetELF() ||
+                                 (F.hasAddressTaken() || !F.hasLocalLinkage())))
       CouldCall = true;
 
     // If the block itself is address-taken, it could be indirectly branched
@@ -132,9 +135,6 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
                     << (CouldCall ? "c" : "") << " to " << MBB.getName()
                     << "\n");
 
-  const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>(
-      MBB.getParent()->getSubtarget().getInstrInfo());
-
   unsigned HintNum = 32;
   if (CouldCall)
     HintNum |= 2;
@@ -162,6 +162,8 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
        MBBI->getOpcode() == AArch64::PACIBSP))
     return;
 
+  const AArch64InstrInfo *TII = Subtarget->getInstrInfo();
+
   // Insert BTI exactly at the first executable instruction.
   const DebugLoc DL = MBB.findDebugLoc(MBBI);
   MachineInstr *BTI = BuildMI(MBB, MBBI, DL, TII->get(AArch64::HINT))
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 1e607f4..f63981b 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1871,7 +1871,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
 }
 
 bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
 
   bool Modified = false;
   for (auto &MBB : MF)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index c76689f..0f7b34c 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -644,10 +644,10 @@ bool AArch64FrameLowering::hasReservedCallFrame(
 MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
     MachineFunction &MF, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator I) const {
-  const AArch64InstrInfo *TII =
-      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const AArch64TargetLowering *TLI =
-      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64InstrInfo *TII = Subtarget.getInstrInfo();
+  const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
   [[maybe_unused]] MachineFrameInfo &MFI = MF.getFrameInfo();
   DebugLoc DL = I->getDebugLoc();
   unsigned Opc = I->getOpcode();
@@ -1319,8 +1319,8 @@ StackOffset AArch64FrameLowering::getStackOffset(const MachineFunction &MF,
 // TODO: This function currently does not work for scalable vectors.
 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
                                                  int FI) const {
-  const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
+  const AArch64RegisterInfo *RegInfo =
+      MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
   int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
   return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
              ? getFPOffset(MF, ObjectOffset).getFixed()
@@ -1343,10 +1343,9 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
     TargetStackID::Value StackID, Register &FrameReg, bool PreferFP,
     bool ForSimm) const {
   const auto &MFI = MF.getFrameInfo();
-  const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
-  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
 
   int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
   int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
@@ -1466,7 +1465,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
       return FPOffset;
     }
     FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
-                                           : (unsigned)AArch64::SP;
+                                           : MCRegister(AArch64::SP);
 
     return SPOffset;
   }
@@ -1589,8 +1588,8 @@ static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
 namespace {
 
 struct RegPairInfo {
-  unsigned Reg1 = AArch64::NoRegister;
-  unsigned Reg2 = AArch64::NoRegister;
+  Register Reg1;
+  Register Reg2;
   int FrameIdx;
   int Offset;
   enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
@@ -1598,21 +1597,21 @@ struct RegPairInfo {
 
   RegPairInfo() = default;
 
-  bool isPaired() const { return Reg2 != AArch64::NoRegister; }
+  bool isPaired() const { return Reg2.isValid(); }
 
   bool isScalable() const { return Type == PPR || Type == ZPR; }
 };
 
 } // end anonymous namespace
 
-unsigned findFreePredicateReg(BitVector &SavedRegs) {
+MCRegister findFreePredicateReg(BitVector &SavedRegs) {
   for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) {
     if (SavedRegs.test(PReg)) {
       unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0;
-      return PNReg;
+      return MCRegister(PNReg); // PN counterpart of first saved P8-P15.
     }
   }
-  return AArch64::NoRegister;
+  return MCRegister(); // No register: none of P8-P15 is set in SavedRegs.
 }
 
 // The multivector LD/ST are available only for SME or SVE2p1 targets
@@ -1930,8 +1929,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
   }
   bool PTrueCreated = false;
   for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
-    unsigned Reg1 = RPI.Reg1;
-    unsigned Reg2 = RPI.Reg2;
+    Register Reg1 = RPI.Reg1;
+    Register Reg2 = RPI.Reg2;
     unsigned StrOpc;
 
     // Issue sequence of spills for cs regs.  The first spill may be converted
@@ -1967,7 +1966,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       break;
     }
 
-    unsigned X0Scratch = AArch64::NoRegister;
+    Register X0Scratch;
     auto RestoreX0 = make_scope_exit([&] {
       if (X0Scratch != AArch64::NoRegister)
         BuildMI(MBB, MI, DL, TII.get(TargetOpcode::COPY), AArch64::X0)
@@ -2009,11 +2008,15 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       }
     }
 
-    LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
-               if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
-               dbgs() << ") -> fi#(" << RPI.FrameIdx;
-               if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
-               dbgs() << ")\n");
+    LLVM_DEBUG({
+      dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
+      if (RPI.isPaired())
+        dbgs() << ", " << printReg(Reg2, TRI);
+      dbgs() << ") -> fi#(" << RPI.FrameIdx;
+      if (RPI.isPaired())
+        dbgs() << ", " << RPI.FrameIdx + 1;
+      dbgs() << ")\n";
+    });
 
     assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
            "Windows unwdinding requires a consecutive (FP,LR) pair");
@@ -2143,8 +2146,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
 
   bool PTrueCreated = false;
   for (const RegPairInfo &RPI : RegPairs) {
-    unsigned Reg1 = RPI.Reg1;
-    unsigned Reg2 = RPI.Reg2;
+    Register Reg1 = RPI.Reg1;
+    Register Reg2 = RPI.Reg2;
 
     // Issue sequence of restores for cs regs. The last restore may be converted
     // to a post-increment load later by emitEpilogue if the callee-save stack
@@ -2176,11 +2179,15 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     case RegPairInfo::VG:
       continue;
     }
-    LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
-               if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
-               dbgs() << ") -> fi#(" << RPI.FrameIdx;
-               if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
-               dbgs() << ")\n");
+    LLVM_DEBUG({
+      dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
+      if (RPI.isPaired())
+        dbgs() << ", " << printReg(Reg2, TRI);
+      dbgs() << ") -> fi#(" << RPI.FrameIdx;
+      if (RPI.isPaired())
+        dbgs() << ", " << RPI.FrameIdx + 1;
+      dbgs() << ")\n";
+    });
 
     // Windows unwind codes require consecutive registers if registers are
     // paired.  Make the switch here, so that the code below will save (x,x+1)
@@ -2435,8 +2442,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
 
   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
-  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   unsigned UnspilledCSGPR = AArch64::NoRegister;
   unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
@@ -2444,9 +2450,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   MachineFrameInfo &MFI = MF.getFrameInfo();
   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
 
-  unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
-                                ? RegInfo->getBaseRegister()
-                                : (unsigned)AArch64::NoRegister;
+  MCRegister BasePointerReg =
+      RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() : MCRegister();
 
   unsigned ExtraCSSpill = 0;
   bool HasUnpairedGPR64 = false;
@@ -2456,7 +2461,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
 
   // Figure out which callee-saved registers to save/restore.
   for (unsigned i = 0; CSRegs[i]; ++i) {
-    const unsigned Reg = CSRegs[i];
+    const MCRegister Reg = CSRegs[i];
 
     // Add the base pointer register to SavedRegs if it is callee-save.
     if (Reg == BasePointerReg)
@@ -2470,7 +2475,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
     }
 
     bool RegUsed = SavedRegs.test(Reg);
-    unsigned PairedReg = AArch64::NoRegister;
+    MCRegister PairedReg;
     const bool RegIsGPR64 = AArch64::GPR64RegClass.contains(Reg);
     if (RegIsGPR64 || AArch64::FPR64RegClass.contains(Reg) ||
         AArch64::FPR128RegClass.contains(Reg)) {
@@ -2522,8 +2527,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
     AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
     // Find a suitable predicate register for the multi-vector spill/fill
     // instructions.
-    unsigned PnReg = findFreePredicateReg(SavedRegs);
-    if (PnReg != AArch64::NoRegister)
+    MCRegister PnReg = findFreePredicateReg(SavedRegs);
+    if (PnReg.isValid())
       AFI->setPredicateRegForFillSpill(PnReg);
     // If no free callee-save has been found assign one.
     if (!AFI->getPredicateRegForFillSpill() &&
@@ -2558,7 +2563,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   unsigned PPRCSStackSize = 0;
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   for (unsigned Reg : SavedRegs.set_bits()) {
-    auto *RC = TRI->getMinimalPhysRegClass(Reg);
+    auto *RC = TRI->getMinimalPhysRegClass(MCRegister(Reg));
     assert(RC && "expected register class!");
     auto SpillSize = TRI->getSpillSize(*RC);
     bool IsZPR = AArch64::ZPRRegClass.contains(Reg);
@@ -2600,7 +2605,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   LLVM_DEBUG({
     dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
     for (unsigned Reg : SavedRegs.set_bits())
-      dbgs() << ' ' << printReg(Reg, RegInfo);
+      dbgs() << ' ' << printReg(MCRegister(Reg), RegInfo);
     dbgs() << "\n";
   });
 
diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
index d67182d..03dd1cd 100644
--- a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
@@ -649,7 +649,7 @@ bool AArch64LowerHomogeneousPE::runOnMBB(MachineBasicBlock &MBB) {
 }
 
 bool AArch64LowerHomogeneousPE::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
 
   bool Modified = false;
   for (auto &MBB : MF)
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 81f5d07..11387bb 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -1272,11 +1272,11 @@ def : ProcessorModel<"cortex-x2", NeoverseV2Model, ProcessorFeatures.X2,
                      [TuneX2]>;
 def : ProcessorModel<"cortex-x3", NeoverseV2Model, ProcessorFeatures.X3,
                      [TuneX3]>;
-def : ProcessorModel<"cortex-x4", NeoverseV2Model, ProcessorFeatures.X4,
+def : ProcessorModel<"cortex-x4", NeoverseV3Model, ProcessorFeatures.X4,
                      [TuneX4]>;
-def : ProcessorModel<"cortex-x925", NeoverseV2Model, ProcessorFeatures.X925,
+def : ProcessorModel<"cortex-x925", NeoverseV3Model, ProcessorFeatures.X925,
                      [TuneX925]>;
-def : ProcessorModel<"gb10", NeoverseV2Model, ProcessorFeatures.GB10,
+def : ProcessorModel<"gb10", NeoverseV3Model, ProcessorFeatures.GB10,
                      [TuneX925]>;
 def : ProcessorModel<"grace", NeoverseV2Model, ProcessorFeatures.Grace,
                      [TuneNeoverseV2]>;
@@ -1295,9 +1295,9 @@ def : ProcessorModel<"neoverse-v1", NeoverseV1Model,
                      ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>;
 def : ProcessorModel<"neoverse-v2", NeoverseV2Model,
                      ProcessorFeatures.NeoverseV2, [TuneNeoverseV2]>;
-def : ProcessorModel<"neoverse-v3", NeoverseV2Model,
+def : ProcessorModel<"neoverse-v3", NeoverseV3Model,
                      ProcessorFeatures.NeoverseV3, [TuneNeoverseV3]>;
-def : ProcessorModel<"neoverse-v3ae", NeoverseV2Model,
+def : ProcessorModel<"neoverse-v3ae", NeoverseV3AEModel,
                      ProcessorFeatures.NeoverseV3AE, [TuneNeoverseV3AE]>;
 def : ProcessorModel<"exynos-m3", ExynosM3Model, ProcessorFeatures.ExynosM3,
                      [TuneExynosM3]>;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 79975b0..5bfb19d9 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -620,7 +620,7 @@ AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
   return RC;
 }
 
-unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
+MCRegister AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
 
 bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 47d76f3..3b0f4f6 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -124,7 +124,7 @@ public:
 
   bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
   bool hasBasePointer(const MachineFunction &MF) const;
-  unsigned getBaseRegister() const;
+  MCRegister getBaseRegister() const;
 
   bool isArgumentRegister(const MachineFunction &MF,
                           MCRegister Reg) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index d695f26..b4a4f4c 100644
--- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -33,6 +33,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
@@ -49,8 +50,8 @@
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCSchedule.h"
 #include "llvm/Pass.h"
-#include <unordered_map>
 #include <map>
+#include <unordered_map>
 
 using namespace llvm;
 
@@ -67,7 +68,7 @@ namespace {
 struct AArch64SIMDInstrOpt : public MachineFunctionPass {
   static char ID;
 
-  const TargetInstrInfo *TII;
+  const AArch64InstrInfo *TII;
   MachineRegisterInfo *MRI;
   TargetSchedModel SchedModel;
 
@@ -694,13 +695,9 @@ bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
 
-  TII = MF.getSubtarget().getInstrInfo();
   MRI = &MF.getRegInfo();
-  const TargetSubtargetInfo &ST = MF.getSubtarget();
-  const AArch64InstrInfo *AAII =
-      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
-  if (!AAII)
-    return false;
+  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
+  TII = ST.getInstrInfo();
   SchedModel.init(&ST);
   if (!SchedModel.hasInstrSchedModel())
     return false;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td
new file mode 100644
index 0000000..e23576a
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td
@@ -0,0 +1,2777 @@
+//=- AArch64SchedNeoverseV3.td - NeoverseV3 Scheduling Defs --*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for the Arm Neoverse V3 processors.
+// All information is taken from the V3 Software Optimization guide:
+//
+// https://developer.arm.com/documentation/109678/300/?lang=en
+//
+//===----------------------------------------------------------------------===//
+
+def NeoverseV3Model : SchedMachineModel {
+  let IssueWidth            =  10; // NOTE: Estimated; best value expected to be slightly higher than V2's.
+  let MicroOpBufferSize     = 320; // Entries in micro-op re-order buffer. NOTE: Copied from Neoverse V2.
+  let LoadLatency           =   4; // Optimistic load latency.
+  let MispredictPenalty     =  10; // Extra cycles for mispredicted branch.  NOTE: Copied from N2.
+  let LoopMicroOpBufferSize =  16; // NOTE: Copied from Cortex-A57.
+  let CompleteModel         =   1;
+
+  list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F,
+                                                    [HasSVE2p1, HasSVEB16B16,
+                                                     HasCPA, HasCSSC]);
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Neoverse V3.
+// Instructions are first fetched and then decoded into internal macro-ops
+// (MOPs). From there, the MOPs proceed through register renaming and dispatch
+// stages. A MOP can be split into two micro-ops further down the pipeline
+// after the decode stage. Once dispatched, micro-ops wait for their operands
+// and issue out-of-order to one of twenty-one issue pipelines. Each issue
+// pipeline can accept one micro-op per cycle.
+
+let SchedModel = NeoverseV3Model in {
+
+// Define the (21) issue ports.
+def V3UnitB   : ProcResource<3>;  // Branch 0/1/2
+def V3UnitS0  : ProcResource<1>;  // Integer single-cycle 0
+def V3UnitS1  : ProcResource<1>;  // Integer single-cycle 1
+def V3UnitS2  : ProcResource<1>;  // Integer single-cycle 2
+def V3UnitS3  : ProcResource<1>;  // Integer single-cycle 3
+def V3UnitS4  : ProcResource<1>;  // Integer single-cycle 4
+def V3UnitS5  : ProcResource<1>;  // Integer single-cycle 5
+def V3UnitM0  : ProcResource<1>;  // Integer single/multicycle 0
+def V3UnitM1  : ProcResource<1>;  // Integer single/multicycle 1
+def V3UnitV0  : ProcResource<1>;  // FP/ASIMD 0
+def V3UnitV1  : ProcResource<1>;  // FP/ASIMD 1
+def V3UnitV2  : ProcResource<1>;  // FP/ASIMD 2
+def V3UnitV3  : ProcResource<1>;  // FP/ASIMD 3
+def V3UnitLS0 : ProcResource<1>;  // Load/Store 0
+def V3UnitL12 : ProcResource<2>;  // Load 1/2
+def V3UnitST1 : ProcResource<1>;  // Store 1
+def V3UnitD   : ProcResource<2>;  // Store data 0/1
+def V3UnitFlg : ProcResource<4>;  // Flags (not counted among the 21 issue ports above)
+
+def V3UnitS   : ProcResGroup<[V3UnitS0, V3UnitS1, V3UnitS2, V3UnitS3, V3UnitS4, V3UnitS5]>;  // Integer single-cycle 0/1/2/3/4/5
+def V3UnitI   : ProcResGroup<[V3UnitS0, V3UnitS1, V3UnitS2, V3UnitS3, V3UnitS4, V3UnitS5, V3UnitM0, V3UnitM1]>;  // Integer single-cycle 0/1/2/3/4/5 and single/multicycle 0/1
+def V3UnitM   : ProcResGroup<[V3UnitM0, V3UnitM1]>;  // Integer single/multicycle 0/1
+def V3UnitLSA : ProcResGroup<[V3UnitLS0, V3UnitL12, V3UnitST1]>; // Load/Store 0, Load 1/2 and Store 1 (union of V3UnitL and V3UnitSA)
+def V3UnitL   : ProcResGroup<[V3UnitLS0, V3UnitL12]>; // Load/Store 0 and Load 1/2
+def V3UnitSA  : ProcResGroup<[V3UnitLS0, V3UnitST1]>; // Load/Store 0 and Store 1
+def V3UnitV   : ProcResGroup<[V3UnitV0, V3UnitV1, V3UnitV2, V3UnitV3]>;  // FP/ASIMD 0/1/2/3
+def V3UnitV01 : ProcResGroup<[V3UnitV0, V3UnitV1]>;  // FP/ASIMD 0/1
+def V3UnitV02 : ProcResGroup<[V3UnitV0, V3UnitV2]>;  // FP/ASIMD 0/2
+def V3UnitV13 : ProcResGroup<[V3UnitV1, V3UnitV3]>;  // FP/ASIMD 1/3
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+def : ReadAdvance<ReadIMA,     0>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST,      0>;
+def : ReadAdvance<ReadVLD,     0>;
+
+// NOTE: Copied from N2.
+def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+def : WriteRes<WriteLDHi,    []> { let Latency = 4; }
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Neoverse V3.
+
+//===----------------------------------------------------------------------===//
+
+// Define generic 0 micro-op types
+def V3Write_0c : SchedWriteRes<[]> { let Latency = 0; }
+
+// Define generic 1 micro-op types
+
+def V3Write_1c_1B    : SchedWriteRes<[V3UnitB]>   { let Latency = 1; }
+def V3Write_1c_1F_1Flg : SchedWriteRes<[V3UnitI, V3UnitFlg]>   { let Latency = 1; }
+def V3Write_1c_1I    : SchedWriteRes<[V3UnitI]>   { let Latency = 1; }
+def V3Write_1c_1M    : SchedWriteRes<[V3UnitM]>   { let Latency = 1; }
+def V3Write_1c_1SA   : SchedWriteRes<[V3UnitSA]>  { let Latency = 1; }
+def V3Write_2c_1M    : SchedWriteRes<[V3UnitM]>   { let Latency = 2; }
+def V3Write_2c_1M_1Flg : SchedWriteRes<[V3UnitM, V3UnitFlg]>   { let Latency = 2; }
+def V3Write_3c_1M    : SchedWriteRes<[V3UnitM]>   { let Latency = 3; }
+def V3Write_2c_1M0   : SchedWriteRes<[V3UnitM0]>  { let Latency = 2; }
+def V3Write_3c_1M0   : SchedWriteRes<[V3UnitM0]>  { let Latency = 3; }
+def V3Write_4c_1M0   : SchedWriteRes<[V3UnitM0]>  { let Latency = 4; }
+def V3Write_12c_1M0  : SchedWriteRes<[V3UnitM0]>  { let Latency = 12;
+                                                    let ReleaseAtCycles = [12]; }
+def V3Write_20c_1M0  : SchedWriteRes<[V3UnitM0]>  { let Latency = 20;
+                                                    let ReleaseAtCycles = [20]; }
+def V3Write_4c_1L    : SchedWriteRes<[V3UnitL]>   { let Latency = 4; }
+def V3Write_6c_1L    : SchedWriteRes<[V3UnitL]>   { let Latency = 6; }
+def V3Write_2c_1V    : SchedWriteRes<[V3UnitV]>   { let Latency = 2; }
+def V3Write_2c_1V0   : SchedWriteRes<[V3UnitV0]>  { let Latency = 2; }
+def V3Write_3c_1V    : SchedWriteRes<[V3UnitV]>   { let Latency = 3; }
+def V3Write_3c_1V01  : SchedWriteRes<[V3UnitV01]> { let Latency = 3;
+                                                    let ReleaseAtCycles = [2]; }
+def V3Write_4c_1V    : SchedWriteRes<[V3UnitV]>   { let Latency = 4; }
+def V3Write_5c_1V    : SchedWriteRes<[V3UnitV]>   { let Latency = 5; }
+def V3Write_6c_1V    : SchedWriteRes<[V3UnitV]>   { let Latency = 6; }
+def V3Write_12c_1V   : SchedWriteRes<[V3UnitV]>   { let Latency = 12; }
+def V3Write_3c_1V0   : SchedWriteRes<[V3UnitV0]>  { let Latency = 3; }
+def V3Write_3c_1V02  : SchedWriteRes<[V3UnitV02]> { let Latency = 3; }
+def V3Write_4c_1V0   : SchedWriteRes<[V3UnitV0]>  { let Latency = 4; }
+def V3Write_4c_1V02  : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Write_9c_1V0   : SchedWriteRes<[V3UnitV0]>  { let Latency = 9; }
+def V3Write_10c_1V0  : SchedWriteRes<[V3UnitV0]>  { let Latency = 10; }
+def V3Write_8c_1V1   : SchedWriteRes<[V3UnitV1]> { let Latency = 8; }
+def V3Write_12c_1V0  : SchedWriteRes<[V3UnitV0]>  { let Latency = 12;
+                                                    let ReleaseAtCycles = [11]; }
+def V3Write_13c_1V0  : SchedWriteRes<[V3UnitV0]>  { let Latency = 13; }
+def V3Write_15c_1V0  : SchedWriteRes<[V3UnitV0]>  { let Latency = 15; }
+def V3Write_13c_1V1  : SchedWriteRes<[V3UnitV1]> { let Latency = 13; }
+def V3Write_16c_1V0  : SchedWriteRes<[V3UnitV0]>  { let Latency = 16; }
+def V3Write_16c_1V02 : SchedWriteRes<[V3UnitV02]> { let Latency = 16;
+                                                    let ReleaseAtCycles = [8]; }
+def V3Write_20c_1V0  : SchedWriteRes<[V3UnitV0]>  { let Latency = 20;
+                                                    let ReleaseAtCycles = [20]; }
+def V3Write_2c_1V1   : SchedWriteRes<[V3UnitV1]>  { let Latency = 2; }
+def V3Write_2c_1V13  : SchedWriteRes<[V3UnitV13]> { let Latency = 2; }
+def V3Write_3c_1V1   : SchedWriteRes<[V3UnitV1]>  { let Latency = 3; }
+def V3Write_3c_1V13  : SchedWriteRes<[V3UnitV13]> { let Latency = 3; }
+def V3Write_4c_1V1   : SchedWriteRes<[V3UnitV1]>  { let Latency = 4; }
+def V3Write_6c_1V1   : SchedWriteRes<[V3UnitV1]>  { let Latency = 6; }
+def V3Write_10c_1V1  : SchedWriteRes<[V3UnitV1]>  { let Latency = 10; }
+def V3Write_6c_1SA   : SchedWriteRes<[V3UnitSA]>  { let Latency = 6; }
+
+//===----------------------------------------------------------------------===//
+// Define generic 2 micro-op types
+
+def V3Write_1c_1B_1S : SchedWriteRes<[V3UnitB, V3UnitS]> {
+  let Latency     = 1;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_1M0_1B : SchedWriteRes<[V3UnitM0, V3UnitB]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_9c_1M0_1L : SchedWriteRes<[V3UnitM0, V3UnitL]> {
+  let Latency     = 9;
+  let NumMicroOps = 2;
+}
+
+def V3Write_3c_1I_1M : SchedWriteRes<[V3UnitI, V3UnitM]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+
+def V3Write_1c_2M : SchedWriteRes<[V3UnitM, V3UnitM]> {
+  let Latency     = 1;
+  let NumMicroOps = 2;
+}
+
+def V3Write_3c_2M : SchedWriteRes<[V3UnitM, V3UnitM]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_2M : SchedWriteRes<[V3UnitM, V3UnitM]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_5c_1L_1I : SchedWriteRes<[V3UnitL, V3UnitI]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_1I_1L : SchedWriteRes<[V3UnitI, V3UnitL]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_7c_1I_1L : SchedWriteRes<[V3UnitI, V3UnitL]> {
+  let Latency     = 7;
+  let NumMicroOps = 2;
+}
+
+def V3Write_1c_1SA_1D : SchedWriteRes<[V3UnitSA, V3UnitD]> {
+  let Latency     = 1;
+  let NumMicroOps = 2;
+}
+
+def V3Write_5c_1M0_1V : SchedWriteRes<[V3UnitM0, V3UnitV]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+
+def V3Write_2c_1SA_1V01 : SchedWriteRes<[V3UnitSA, V3UnitV01]> {
+  let Latency     = 2;
+  let NumMicroOps = 2;
+}
+
+def V3Write_2c_2V01  : SchedWriteRes<[V3UnitV01, V3UnitV01]> {
+  let Latency     = 2;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_1SA_1V01  : SchedWriteRes<[V3UnitSA, V3UnitV01]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_5c_1V13_1V : SchedWriteRes<[V3UnitV13, V3UnitV]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_2V0 : SchedWriteRes<[V3UnitV0, V3UnitV0]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_2V02 : SchedWriteRes<[V3UnitV02, V3UnitV02]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_2V : SchedWriteRes<[V3UnitV, V3UnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_2V : SchedWriteRes<[V3UnitV, V3UnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_6c_2L : SchedWriteRes<[V3UnitL, V3UnitL]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3Write_8c_1L_1V : SchedWriteRes<[V3UnitL, V3UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 2;
+}
+
+def V3Write_4c_1SA_1V : SchedWriteRes<[V3UnitSA, V3UnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+def V3Write_3c_1M0_1M : SchedWriteRes<[V3UnitM0, V3UnitM]> {
+  let NumMicroOps = 2;
+  let Latency = 3;
+}
+
+def V3Write_4c_1M0_1M : SchedWriteRes<[V3UnitM0, V3UnitM]> {
+  let NumMicroOps = 2;
+  let Latency = 4;
+}
+
+def V3Write_1c_1M0_1M : SchedWriteRes<[V3UnitM0, V3UnitM]> {
+  let NumMicroOps = 2;
+  let Latency = 1;
+}
+
+def V3Write_2c_1M0_1M : SchedWriteRes<[V3UnitM0, V3UnitM]> {
+  let NumMicroOps = 2;
+  let Latency = 2;
+}
+
+def V3Write_6c_2V1 : SchedWriteRes<[V3UnitV1, V3UnitV1]> {
+  let NumMicroOps = 2;
+  let Latency = 6;
+}
+
+def V3Write_5c_2V0 : SchedWriteRes<[V3UnitV0, V3UnitV0]> {
+  let NumMicroOps = 2;
+  let Latency = 5;
+}
+
+def V3Write_5c_2V02 : SchedWriteRes<[V3UnitV02, V3UnitV02]> {
+  let NumMicroOps = 2;
+  let Latency = 5;
+}
+
+def V3Write_5c_1V1_1M0 : SchedWriteRes<[V3UnitV1, V3UnitM0]> {
+  let NumMicroOps = 2;
+  let Latency = 5;
+}
+
+def V3Write_6c_1V1_1M0 : SchedWriteRes<[V3UnitV1, V3UnitM0]> {
+  let NumMicroOps = 2;
+  let Latency = 6;
+}
+
+def V3Write_7c_1M0_1V02 : SchedWriteRes<[V3UnitM0, V3UnitV02]> {
+  let NumMicroOps = 2;
+  let Latency = 7;
+}
+
+def V3Write_2c_1V0_1M : SchedWriteRes<[V3UnitV0, V3UnitM]> {
+  let NumMicroOps = 2;
+  let Latency = 2;
+}
+
+def V3Write_3c_1V0_1M : SchedWriteRes<[V3UnitV0, V3UnitM]> {
+  let NumMicroOps = 2;
+  let Latency = 3;
+}
+
+def V3Write_6c_1V_1V13 : SchedWriteRes<[V3UnitV, V3UnitV13]> {
+  let NumMicroOps = 2;
+  let Latency = 6;
+}
+
+def V3Write_6c_1L_1M : SchedWriteRes<[V3UnitL, V3UnitM]> {
+  let NumMicroOps = 2;
+  let Latency = 6;
+}
+
+def V3Write_6c_1L_1I : SchedWriteRes<[V3UnitL, V3UnitI]> {
+  let NumMicroOps = 2;
+  let Latency = 6;
+}
+
+def V3Write_6c_2V13 : SchedWriteRes<[V3UnitV13, V3UnitV13]> {
+  let NumMicroOps = 2;
+  let Latency = 6;
+}
+
+def V3Write_8c_1M0_1V01 : SchedWriteRes<[V3UnitM0, V3UnitV01]> {
+  let NumMicroOps = 2;
+  let Latency = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 3 micro-ops
+
+def V3Write_1c_1SA_1D_1I : SchedWriteRes<[V3UnitSA, V3UnitD, V3UnitI]> {
+  let NumMicroOps = 3;
+  let Latency = 1;
+}
+
+def V3Write_2c_1SA_1V01_1I : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitI]> {
+  let NumMicroOps = 3;
+  let Latency = 2;
+}
+
+def V3Write_2c_1SA_2V01 : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 3;
+  let Latency = 2;
+}
+
+def V3Write_4c_1SA_2V01 : SchedWriteRes<[V3UnitSA, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 3;
+  let Latency = 4;
+}
+
+def V3Write_9c_1L_2V : SchedWriteRes<[V3UnitL, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 3;
+  let Latency = 9;
+}
+
+def V3Write_4c_3V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 3;
+  let Latency = 4;
+}
+
+def V3Write_7c_1M_1M0_1V : SchedWriteRes<[V3UnitM, V3UnitM0, V3UnitV]> {
+  let NumMicroOps = 3;
+  let Latency = 7;
+}
+
+def V3Write_2c_1SA_1I_1V01 : SchedWriteRes<[V3UnitSA, V3UnitI, V3UnitV01]> {
+  let NumMicroOps = 3;
+  let Latency = 2;
+}
+
+def V3Write_6c_3L : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL]> {
+  let NumMicroOps = 3;
+  let Latency = 6;
+}
+
+def V3Write_6c_3V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 3;
+  let Latency = 6;
+}
+
+def V3Write_8c_1L_2V : SchedWriteRes<[V3UnitL, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 3;
+  let Latency = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 4 micro-ops
+
+def V3Write_2c_1SA_2V01_1I : SchedWriteRes<[
+    V3UnitSA, V3UnitV01, V3UnitV01, V3UnitI]> {
+  let NumMicroOps = 4;
+  let Latency = 2;
+}
+
+def V3Write_2c_2SA_2V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 4;
+  let Latency = 2;
+}
+
+def V3Write_4c_2SA_2V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 4;
+  let Latency = 4;
+}
+
+def V3Write_5c_1I_3L : SchedWriteRes<[V3UnitI, V3UnitL, V3UnitL, V3UnitL]> {
+  let NumMicroOps = 4;
+  let Latency = 5;
+}
+
+def V3Write_6c_4V0 : SchedWriteRes<[V3UnitV0, V3UnitV0, V3UnitV0, V3UnitV0]> {
+  let NumMicroOps = 4;
+  let Latency = 6;
+}
+
+def V3Write_8c_4V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 4;
+  let Latency = 8;
+}
+
+def V3Write_6c_2V_2V13 : SchedWriteRes<[
+    V3UnitV, V3UnitV, V3UnitV13, V3UnitV13]> {
+  let NumMicroOps = 4;
+  let Latency = 6;
+}
+
+def V3Write_8c_2V_2V13 : SchedWriteRes<[
+    V3UnitV, V3UnitV, V3UnitV13, V3UnitV13]> {
+  let NumMicroOps = 4;
+  let Latency = 8;
+}
+
+def V3Write_6c_4V02 : SchedWriteRes<[
+    V3UnitV02, V3UnitV02, V3UnitV02, V3UnitV02]> {
+  let NumMicroOps = 4;
+  let Latency = 6;
+}
+
+def V3Write_6c_4V : SchedWriteRes<[V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 4;
+  let Latency = 6;
+}
+
+def V3Write_8c_2L_2V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 4;
+  let Latency = 8;
+}
+
+def V3Write_9c_2L_2V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 4;
+  let Latency = 9;
+}
+
+def V3Write_2c_2SA_2V : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 4;
+  let Latency = 2;
+}
+
+def V3Write_4c_2SA_2V : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 4;
+  let Latency = 4;
+}
+
+def V3Write_8c_2M0_2V02 : SchedWriteRes<[
+    V3UnitM0, V3UnitM0, V3UnitV02, V3UnitV02]> {
+  let NumMicroOps = 4;
+  let Latency = 8;
+}
+
+def V3Write_8c_2V_2V1 : SchedWriteRes<[
+    V3UnitV, V3UnitV, V3UnitV1, V3UnitV1]> {
+  let NumMicroOps = 4;
+  let Latency = 8;
+}
+
+def V3Write_4c_2M0_2M : SchedWriteRes<[
+    V3UnitM0, V3UnitM0, V3UnitM, V3UnitM]> {
+  let NumMicroOps = 4;
+  let Latency = 4;
+}
+
+def V3Write_5c_2M0_2M : SchedWriteRes<[
+    V3UnitM0, V3UnitM0, V3UnitM, V3UnitM]> {
+  let NumMicroOps = 4;
+  let Latency = 5;
+}
+
+def V3Write_6c_2I_2L : SchedWriteRes<[V3UnitI, V3UnitI, V3UnitL, V3UnitL]> {
+  let NumMicroOps = 4;
+  let Latency = 6;
+}
+
+def V3Write_7c_4L : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL, V3UnitL]> {
+  let NumMicroOps = 4;
+  let Latency = 7;
+}
+
+def V3Write_6c_1SA_3V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 4;
+  let Latency = 6;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 5 micro-ops
+
+def V3Write_2c_1SA_2V01_2I : SchedWriteRes<[
+    V3UnitSA, V3UnitV01, V3UnitV01, V3UnitI, V3UnitI]> {
+  let NumMicroOps = 5;
+  let Latency = 2;
+}
+
+def V3Write_8c_2L_3V : SchedWriteRes<[
+    V3UnitL, V3UnitL, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 5;
+  let Latency = 8;
+}
+
+def V3Write_9c_1L_4V : SchedWriteRes<[
+    V3UnitL, V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 5;
+  let Latency = 9;
+}
+
+def V3Write_10c_1L_4V : SchedWriteRes<[
+    V3UnitL, V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 5;
+  let Latency = 10;
+}
+
+def V3Write_6c_5V : SchedWriteRes<[
+    V3UnitV, V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 5;
+  let Latency = 6;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 6 micro-ops
+
+def V3Write_8c_3L_3V : SchedWriteRes<[
+    V3UnitL, V3UnitL, V3UnitL, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 6;
+  let Latency = 8;
+}
+
+def V3Write_9c_3L_3V : SchedWriteRes<[
+    V3UnitL, V3UnitL, V3UnitL, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 6;
+  let Latency = 9;
+}
+
+def V3Write_9c_2L_4V : SchedWriteRes<[
+    V3UnitL, V3UnitL, V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 6;
+  let Latency = 9;
+}
+
+def V3Write_9c_2L_2V_2I : SchedWriteRes<[
+    V3UnitL, V3UnitL, V3UnitV, V3UnitV, V3UnitI, V3UnitI]> {
+  let NumMicroOps = 6;
+  let Latency = 9;
+}
+
+def V3Write_9c_2V_4V13 : SchedWriteRes<[
+    V3UnitV, V3UnitV, V3UnitV13, V3UnitV13, V3UnitV13, V3UnitV13]> {
+  let NumMicroOps = 6;
+  let Latency = 9;
+}
+
+def V3Write_2c_3SA_3V : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitSA, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 6;
+  let Latency = 2;
+}
+
+def V3Write_4c_2SA_4V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitV01, V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 6;
+  let Latency = 4;
+}
+
+def V3Write_5c_2SA_4V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitV01, V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 6;
+  let Latency = 5;
+}
+
+def V3Write_2c_3SA_3V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitSA, V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 6;
+  let Latency = 2;
+}
+
+def V3Write_4c_2SA_2I_2V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitI, V3UnitI, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 6;
+  let Latency = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 7 micro-ops
+
+def V3Write_8c_3L_4V : SchedWriteRes<[
+    V3UnitL, V3UnitL, V3UnitL, V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 7;
+  let Latency = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 8 micro-ops
+
+def V3Write_2c_4SA_4V : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 8;
+  let Latency = 2;
+}
+
+def V3Write_2c_4SA_4V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitV01, V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 8;
+  let Latency = 2;
+}
+
+def V3Write_6c_2SA_6V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA,
+    V3UnitV01, V3UnitV01, V3UnitV01, V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 8;
+  let Latency = 6;
+}
+
+def V3Write_8c_4L_4V : SchedWriteRes<[
+    V3UnitL, V3UnitL, V3UnitL, V3UnitL, V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 8;
+  let Latency = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 9 micro-ops
+
+def V3Write_6c_3SA_6V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitV01, V3UnitV01, V3UnitV01, V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 9;
+  let Latency = 6;
+}
+
+def V3Write_10c_1L_8V : SchedWriteRes<[
+    V3UnitL,
+    V3UnitV, V3UnitV, V3UnitV, V3UnitV, V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 9;
+  let Latency = 10;
+}
+
+def V3Write_10c_3V_3L_3I : SchedWriteRes<[
+    V3UnitV, V3UnitV, V3UnitV, V3UnitL, V3UnitL, V3UnitL,
+    V3UnitI, V3UnitI, V3UnitI]> {
+  let NumMicroOps = 9;
+  let Latency = 10;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 10 micro-ops
+
+def V3Write_9c_6L_4V : SchedWriteRes<[
+    V3UnitL, V3UnitL, V3UnitL, V3UnitL, V3UnitL, V3UnitL,
+    V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 10;
+  let Latency = 9;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 12 micro-ops
+
+def V3Write_5c_4SA_8V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitV01, V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 12;
+  let Latency = 5;
+}
+
+def V3Write_9c_4L_8V : SchedWriteRes<[
+    V3UnitL, V3UnitL, V3UnitL, V3UnitL,
+    V3UnitV, V3UnitV, V3UnitV, V3UnitV,
+    V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 12;
+  let Latency = 9;
+}
+
+def V3Write_10c_4L_8V : SchedWriteRes<[
+    V3UnitL, V3UnitL, V3UnitL, V3UnitL,
+    V3UnitV, V3UnitV, V3UnitV, V3UnitV,
+    V3UnitV, V3UnitV, V3UnitV, V3UnitV]> {
+  let NumMicroOps = 12;
+  let Latency = 10;
+}
+
+def V3Write_4c_6SA_6V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitSA, V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 12;
+  let Latency = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 16 micro-ops
+
+def V3Write_7c_4SA_12V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 16;
+  let Latency = 7;
+}
+
+def V3Write_10c_4L_8V_4I : SchedWriteRes<[
+    V3UnitL, V3UnitL, V3UnitL, V3UnitL,
+    V3UnitV, V3UnitV, V3UnitV, V3UnitV,
+    V3UnitV, V3UnitV, V3UnitV, V3UnitV,
+    V3UnitI, V3UnitI,
+    V3UnitI, V3UnitI]> {
+  let NumMicroOps = 16;
+  let Latency = 10;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 18 micro-ops
+
+def V3Write_7c_9SA_9V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitSA, V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 18;
+  let Latency = 7;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 27 micro-ops
+
+def V3Write_7c_9SA_9I_9V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitI, V3UnitI, V3UnitI,
+    V3UnitI, V3UnitI, V3UnitI,
+    V3UnitI, V3UnitI, V3UnitI,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 27;
+  let Latency = 7;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 36 micro-ops
+
+def V3Write_11c_18SA_18V01 : SchedWriteRes<[
+    V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitSA, V3UnitSA, V3UnitSA,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01,
+    V3UnitV01, V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 36;
+  let Latency = 11;
+}
+
+//===----------------------------------------------------------------------===//
+// Generic write types with 54 micro-ops
+
+def V3Write_11c_18SA_18I_18V01 : SchedWriteRes<[V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitSA, V3UnitSA,
+                                                V3UnitI, V3UnitI, V3UnitI,
+                                                V3UnitI, V3UnitI, V3UnitI,
+                                                V3UnitI, V3UnitI, V3UnitI,
+                                                V3UnitI, V3UnitI, V3UnitI,
+                                                V3UnitI, V3UnitI, V3UnitI,
+                                                V3UnitI, V3UnitI, V3UnitI,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01,
+                                                V3UnitV01, V3UnitV01]> {
+  let NumMicroOps = 54;
+  let Latency = 11;
+}
+
+//===----------------------------------------------------------------------===//
+// Write variants resolved by scheduling predicates
+
+def V3Write_ArithI : SchedWriteVariant<[
+  SchedVar<IsCheapLSL, [V3Write_1c_1I]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1M]>]>;
+
+def V3Write_ArithF : SchedWriteVariant<[
+  SchedVar<IsCheapLSL, [V3Write_1c_1F_1Flg]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1M_1Flg]>]>;
+
+def V3Write_Logical : SchedWriteVariant<[
+  SchedVar<NeoverseNoLSL, [V3Write_1c_1F_1Flg]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1M_1Flg]>]>;
+
+def V3Write_Extr : SchedWriteVariant<[
+  SchedVar<IsRORImmIdiomPred, [V3Write_1c_1I]>,
+  SchedVar<NoSchedPred, [V3Write_3c_1I_1M]>]>;
+
+def V3Write_LdrHQ : SchedWriteVariant<[
+  SchedVar<NeoverseHQForm, [V3Write_7c_1I_1L]>,
+  SchedVar<NoSchedPred, [V3Write_6c_1L]>]>;
+
+def V3Write_StrHQ : SchedWriteVariant<[
+  SchedVar<NeoverseHQForm, [V3Write_2c_1SA_1V01_1I]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1SA_1V01]>]>;
+
+def V3Write_0or1c_1I : SchedWriteVariant<[
+  SchedVar<NeoverseZeroMove, [V3Write_0c]>,
+  SchedVar<NoSchedPred, [V3Write_1c_1I]>]>;
+
+def V3Write_0or2c_1V : SchedWriteVariant<[
+  SchedVar<NeoverseZeroMove, [V3Write_0c]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1V]>]>;
+
+def V3Write_0or3c_1M0 : SchedWriteVariant<[
+  SchedVar<NeoverseZeroMove, [V3Write_0c]>,
+  SchedVar<NoSchedPred, [V3Write_3c_1M0]>]>;
+
+def V3Write_2or3c_1M : SchedWriteVariant<[
+  SchedVar<NeoversePdIsPg, [V3Write_3c_1M]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1M]>]>;
+
+def V3Write_1or2c_1M : SchedWriteVariant<[
+  SchedVar<NeoversePdIsPg, [V3Write_2c_1M]>,
+  SchedVar<NoSchedPred, [V3Write_1c_1M]>]>;
+
+def V3Write_3or4c_1M0_1M : SchedWriteVariant<[
+  SchedVar<NeoversePdIsPg, [V3Write_4c_1M0_1M]>,
+  SchedVar<NoSchedPred, [V3Write_3c_1M0_1M]>]>;
+
+def V3Write_2or3c_1V0 : SchedWriteVariant<[
+  SchedVar<NeoversePdIsPg, [V3Write_3c_1V0]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1V0]>]>;
+
+def V3Write_2or3c_1V0_1M : SchedWriteVariant<[
+  SchedVar<NeoversePdIsPg, [V3Write_3c_1V0_1M]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1V0_1M]>]>;
+
+def V3Write_IncDec : SchedWriteVariant<[
+  SchedVar<NeoverseCheapIncDec, [V3Write_1c_1I]>,
+  SchedVar<NoSchedPred, [V3Write_2c_1M]>]>;
+
+//===----------------------------------------------------------------------===//
+// Forwarded types: V3Wr_* writes and the V3Rd_* read advances over them
+
+// NOTE: SOG, p. 16, n. 2: Accumulator forwarding is not supported for
+// consumers of 64-bit multiply high operations?
+def V3Wr_IM : SchedWriteRes<[V3UnitM]> { let Latency = 2; }
+
+def V3Wr_FMA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_FMA : SchedReadAdvance<2, [WriteFMul, V3Wr_FMA]>;
+
+def V3Wr_VA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VA : SchedReadAdvance<3, [V3Wr_VA]>;
+
+def V3Wr_VDOT : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Rd_VDOT : SchedReadAdvance<2, [V3Wr_VDOT]>;
+
+def V3Wr_VMMA : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Rd_VMMA : SchedReadAdvance<2, [V3Wr_VMMA]>;
+
+def V3Wr_VMA : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_VMA : SchedReadAdvance<3, [V3Wr_VMA]>;
+
+def V3Wr_VMAH : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 4; }
+def V3Rd_VMAH : SchedReadAdvance<2, [V3Wr_VMAH]>;
+
+def V3Wr_VMAL : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_VMAL : SchedReadAdvance<3, [V3Wr_VMAL]>;
+
+def V3Wr_VPA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VPA : SchedReadAdvance<3, [V3Wr_VPA]>;
+
+def V3Wr_VSA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VSA : SchedReadAdvance<3, [V3Wr_VSA]>;
+
+def V3Wr_VFCMA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VFCMA : SchedReadAdvance<2, [V3Wr_VFCMA]>;
+
+def V3Wr_VFM : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Wr_VFMA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VFMA : SchedReadAdvance<2, [V3Wr_VFM, V3Wr_VFMA]>;
+
+def V3Wr_VFMAL : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_VFMAL : SchedReadAdvance<2, [V3Wr_VFMAL]>;
+
+def V3Wr_VBFDOT : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_VBFDOT : SchedReadAdvance<2, [V3Wr_VBFDOT]>;
+def V3Wr_VBFMMA : SchedWriteRes<[V3UnitV]> { let Latency = 6; }
+def V3Rd_VBFMMA : SchedReadAdvance<2, [V3Wr_VBFMMA]>;
+def V3Wr_VBFMAL : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_VBFMAL : SchedReadAdvance<3, [V3Wr_VBFMAL]>;
+
+def V3Wr_CRC : SchedWriteRes<[V3UnitM0]> { let Latency = 2; }
+def V3Rd_CRC : SchedReadAdvance<1, [V3Wr_CRC]>;
+
+def V3Wr_ZA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_ZA : SchedReadAdvance<3, [V3Wr_ZA]>;
+def V3Wr_ZPA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_ZPA : SchedReadAdvance<3, [V3Wr_ZPA]>;
+def V3Wr_ZSA : SchedWriteRes<[V3UnitV13]> { let Latency = 4; }
+def V3Rd_ZSA : SchedReadAdvance<3, [V3Wr_ZSA]>;
+
+def V3Wr_ZDOTB : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Rd_ZDOTB : SchedReadAdvance<2, [V3Wr_ZDOTB]>;
+def V3Wr_ZDOTH : SchedWriteRes<[V3UnitV02]> { let Latency = 3; }
+def V3Rd_ZDOTH : SchedReadAdvance<2, [V3Wr_ZDOTH]>;
+
+// NOTE: SOG p. 43: Complex multiply-add B, H, S element size: How to reduce
+// throughput to 1 in case of forwarding?
+def V3Wr_ZCMABHS : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_ZCMABHS : SchedReadAdvance<3, [V3Wr_ZCMABHS]>;
+def V3Wr_ZCMAD : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 5; }
+def V3Rd_ZCMAD : SchedReadAdvance<2, [V3Wr_ZCMAD]>;
+
+def V3Wr_ZMMA : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
+def V3Rd_ZMMA : SchedReadAdvance<2, [V3Wr_ZMMA]>;
+
+def V3Wr_ZMABHS : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_ZMABHS : SchedReadAdvance<3, [V3Wr_ZMABHS]>;
+def V3Wr_ZMAD : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 5; }
+def V3Rd_ZMAD : SchedReadAdvance<2, [V3Wr_ZMAD]>;
+
+def V3Wr_ZMAL : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Rd_ZMAL : SchedReadAdvance<3, [V3Wr_ZMAL]>;
+
+def V3Wr_ZMASQL : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Wr_ZMASQBHS : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
+def V3Wr_ZMASQD : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 5; }
+def V3Rd_ZMASQ : SchedReadAdvance<2, [
+    V3Wr_ZMASQL, V3Wr_ZMASQBHS, V3Wr_ZMASQD]>;
+
+def V3Wr_ZFCMA : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_ZFCMA : SchedReadAdvance<3, [V3Wr_ZFCMA]>;
+
+def V3Wr_ZFMA : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_ZFMA : SchedReadAdvance<2, [V3Wr_ZFMA]>;
+
+def V3Wr_ZFMAL : SchedWriteRes<[V3UnitV]> { let Latency = 4; }
+def V3Rd_ZFMAL : SchedReadAdvance<2, [V3Wr_ZFMAL]>;
+
+def V3Wr_ZBFDOT : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_ZBFDOT : SchedReadAdvance<2, [V3Wr_ZBFDOT]>;
+def V3Wr_ZBFMMA : SchedWriteRes<[V3UnitV]> { let Latency = 6; }
+def V3Rd_ZBFMMA : SchedReadAdvance<2, [V3Wr_ZBFMMA]>;
+def V3Wr_ZBFMAL : SchedWriteRes<[V3UnitV]> { let Latency = 5; }
+def V3Rd_ZBFMAL : SchedReadAdvance<3, [V3Wr_ZBFMAL]>;
+
+//===----------------------------------------------------------------------===//
+// Types with long resource cycles (rc), expressed via ReleaseAtCycles
+
+def V3Write_6c_1V1_5rc  : SchedWriteRes<[V3UnitV1]> { let Latency =  6; let ReleaseAtCycles = [5]; }
+def V3Write_9c_1V1_2rc  : SchedWriteRes<[V3UnitV1]> { let Latency =  9; let ReleaseAtCycles = [2]; }
+def V3Write_9c_1V1_4rc  : SchedWriteRes<[V3UnitV1]> { let Latency =  9; let ReleaseAtCycles = [4]; }
+def V3Write_10c_1V1_9rc : SchedWriteRes<[V3UnitV1]> { let Latency = 10; let ReleaseAtCycles = [9]; }
+def V3Write_11c_1V1_4rc : SchedWriteRes<[V3UnitV1]> { let Latency = 11; let ReleaseAtCycles = [4]; }
+def V3Write_13c_1V1_8rc : SchedWriteRes<[V3UnitV1]> { let Latency = 13; let ReleaseAtCycles = [8]; }
+def V3Write_14c_1V1_2rc : SchedWriteRes<[V3UnitV1]> { let Latency = 14; let ReleaseAtCycles = [2]; }
+
+// Miscellaneous: COPY is scheduled as WriteI
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// §3.3 Branch instructions
+// -----------------------------------------------------------------------------
+
+// Branch, immed
+// Compare and branch
+def : SchedAlias<WriteBr, V3Write_1c_1B>;
+
+// Branch, register
+def : SchedAlias<WriteBrReg, V3Write_1c_1B>;
+
+// Branch and link, immed
+// Branch and link, register
+def : InstRW<[V3Write_1c_1B_1S], (instrs BL, BLR)>;
+
+// §3.4 Arithmetic and Logical Instructions
+// -----------------------------------------------------------------------------
+
+// ALU, basic
+def : SchedAlias<WriteI, V3Write_1c_1I>;
+
+// ALU, basic, flagset
+def : InstRW<[V3Write_1c_1F_1Flg], (instregex
+    "^(ADD|SUB)S[WX]r[ir]$",
+    "^(ADC|SBC)S[WX]r$",
+    "^ANDS[WX]ri$",
+    "^(AND|BIC)S[WX]rr$")>;
+def : InstRW<[V3Write_0or1c_1I], (instregex "^MOVZ[WX]i$")>;
+
+// ALU, extend and shift
+def : SchedAlias<WriteIEReg, V3Write_2c_1M>;
+
+// Arithmetic, LSL shift, shift <= 4
+// Arithmetic, flagset, LSL shift, shift <= 4
+// Arithmetic, LSR/ASR/ROR shift or LSL shift > 4
+def : SchedAlias<WriteISReg, V3Write_ArithI>;
+def : InstRW<[V3Write_ArithF], (instregex
+    "^(ADD|SUB)S[WX]rs$")>;
+
+// Arithmetic, immediate to logical address tag
+def : InstRW<[V3Write_2c_1M], (instrs ADDG, SUBG)>;
+
+// Conditional compare
+def : InstRW<[V3Write_1c_1F_1Flg], (instregex "^CCM[NP][WX][ir]")>;
+
+// Convert floating-point condition flags
+// Flag manipulation instructions
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+
+// Insert Random Tags
+def : InstRW<[V3Write_2c_1M], (instrs IRG, IRGstack)>;
+
+// Insert Tag Mask
+// Subtract Pointer
+def : InstRW<[V3Write_1c_1I], (instrs GMI, SUBP)>;
+
+// Subtract Pointer, flagset
+def : InstRW<[V3Write_1c_1F_1Flg], (instrs SUBPS)>;
+
+// Logical, shift, no flagset
+def : InstRW<[V3Write_1c_1I], (instregex "^(AND|BIC|EON|EOR|ORN)[WX]rs$")>;
+def : InstRW<[V3Write_0or1c_1I], (instregex "^ORR[WX]rs$")>;
+
+// Logical, shift, flagset
+def : InstRW<[V3Write_Logical], (instregex "^(AND|BIC)S[WX]rs$")>;
+
+// Move and shift instructions
+// -----------------------------------------------------------------------------
+
+def : SchedAlias<WriteImm, V3Write_1c_1I>;
+
+// §3.5 Divide and multiply instructions
+// -----------------------------------------------------------------------------
+
+// SDIV, UDIV
+def : SchedAlias<WriteID32,  V3Write_12c_1M0>;
+def : SchedAlias<WriteID64,  V3Write_20c_1M0>;
+
+def : SchedAlias<WriteIM32, V3Write_2c_1M>;
+def : SchedAlias<WriteIM64, V3Write_2c_1M>;
+
+// Multiply
+// Multiply accumulate, W-form
+// Multiply accumulate, X-form
+def : InstRW<[V3Wr_IM], (instregex "^M(ADD|SUB)[WX]rrr$")>;
+
+// Multiply accumulate long
+// Multiply long
+def : InstRW<[V3Wr_IM], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+
+// Multiply high
+def : InstRW<[V3Write_3c_1M], (instrs SMULHrr, UMULHrr)>;
+
+// §3.6 Pointer Authentication Instructions (v8.3 PAC)
+// -----------------------------------------------------------------------------
+
+// Authenticate data address
+// Authenticate instruction address
+// Compute pointer authentication code for data address
+// Compute pointer authentication code, using generic key
+// Compute pointer authentication code for instruction address
+def : InstRW<[V3Write_4c_1M0], (instregex "^AUT", "^PAC")>;
+
+// Branch and link, register, with pointer authentication
+// Branch, register, with pointer authentication
+// Branch, return, with pointer authentication (list also has ERETAA/ERETAB)
+def : InstRW<[V3Write_6c_1M0_1B], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
+                                            BRAAZ, BRAB, BRABZ, RETAA, RETAB,
+                                            ERETAA, ERETAB)>;
+
+
+// Load register, with pointer authentication
+def : InstRW<[V3Write_9c_1M0_1L], (instregex "^LDRA[AB](indexed|writeback)")>;
+
+// Strip pointer authentication code
+def : InstRW<[V3Write_2c_1M0], (instrs XPACD, XPACI, XPACLRI)>;
+
+// §3.7 Miscellaneous data-processing instructions
+// -----------------------------------------------------------------------------
+
+// Address generation
+def : InstRW<[V3Write_1c_1I], (instrs ADR, ADRP)>;
+
+// Bitfield extract, one reg
+// Bitfield extract, two regs (both rows share V3Write_Extr)
+def : SchedAlias<WriteExtr, V3Write_Extr>;
+def : InstRW<[V3Write_Extr], (instrs EXTRWrri, EXTRXrri)>;
+
+// Bitfield move, basic
+def : SchedAlias<WriteIS, V3Write_1c_1I>;
+
+// Bitfield move, insert
+def : InstRW<[V3Write_2c_1M], (instregex "^BFM[WX]ri$")>;
+
+// §3.8 Load instructions
+// -----------------------------------------------------------------------------
+
+// NOTE: SOG p. 19: Throughput of LDN?P X-form should be 2, but reported as 3.
+
+def : SchedAlias<WriteLD,    V3Write_4c_1L>;
+def : SchedAlias<WriteLDIdx, V3Write_4c_1L>;
+
+// Load register, literal (PRFMl is grouped with these)
+def : InstRW<[V3Write_5c_1L_1I], (instrs LDRWl, LDRXl, LDRSWl, PRFMl)>;
+
+// Load pair, signed immed offset, signed words
+def : InstRW<[V3Write_5c_1I_3L, WriteLDHi], (instrs LDPSWi)>;
+
+// Load pair, immed post-index or immed pre-index, signed words
+def : InstRW<[WriteAdr, V3Write_5c_1I_3L, WriteLDHi],
+             (instregex "^LDPSW(post|pre)$")>;
+
+// §3.9 Store instructions
+// -----------------------------------------------------------------------------
+
+// NOTE: SOG, p. 20: Unsure if STRH uses pipeline I.
+
+def : SchedAlias<WriteST,    V3Write_1c_1SA_1D>;
+def : SchedAlias<WriteSTIdx, V3Write_1c_1SA_1D>;
+def : SchedAlias<WriteSTP,   V3Write_1c_1SA_1D>;
+def : SchedAlias<WriteAdr,   V3Write_1c_1I>;
+
+// §3.10 Tag load instructions
+// -----------------------------------------------------------------------------
+
+// Load allocation tag
+// Load multiple allocation tags
+def : InstRW<[V3Write_4c_1L], (instrs LDG, LDGM)>;
+
+// §3.11 Tag store instructions
+// -----------------------------------------------------------------------------
+
+// Store allocation tags to one or two granules, post-index
+// Store allocation tags to one or two granules, pre-index
+// Store allocation tag to one or two granules, zeroing, post-index
+// Store allocation tag to one or two granules, zeroing, pre-index
+// Store allocation tag and reg pair to memory, post-index
+// Store allocation tag and reg pair to memory, pre-index
+def : InstRW<[V3Write_1c_1SA_1D_1I], (instrs STGPreIndex, STGPostIndex,
+                                                ST2GPreIndex, ST2GPostIndex,
+                                                STZGPreIndex, STZGPostIndex,
+                                                STZ2GPreIndex, STZ2GPostIndex,
+                                                STGPpre, STGPpost)>;
+
+// Store allocation tags to one or two granules, signed offset
+// Store allocation tag to two granules, zeroing, signed offset
+// Store allocation tag and reg pair to memory, signed offset
+// Store multiple allocation tags
+def : InstRW<[V3Write_1c_1SA_1D], (instrs STGi, ST2Gi, STZGi,
+                                             STZ2Gi, STGPi, STGM, STZGM)>;
+
+// §3.12 FP data processing instructions
+// -----------------------------------------------------------------------------
+
+// FP absolute value
+// FP arithmetic
+// FP min/max
+// FP negate
+// FP select
+def : SchedAlias<WriteF,     V3Write_2c_1V>;
+
+// FP compare
+def : SchedAlias<WriteFCmp,  V3Write_2c_1V0>;
+
+// FP divide, square root (generic WriteFDiv; per-form InstRW entries follow)
+def : SchedAlias<WriteFDiv,  V3Write_6c_1V1>;
+
+// FP divide, H-form
+def : InstRW<[V3Write_6c_1V1],  (instrs FDIVHrr)>;
+// FP divide, S-form
+def : InstRW<[V3Write_8c_1V1], (instrs FDIVSrr)>;
+// FP divide, D-form
+def : InstRW<[V3Write_13c_1V1], (instrs FDIVDrr)>;
+
+// FP square root, H-form
+def : InstRW<[V3Write_6c_1V1],  (instrs FSQRTHr)>;
+// FP square root, S-form
+def : InstRW<[V3Write_8c_1V1],  (instrs FSQRTSr)>;
+// FP square root, D-form
+def : InstRW<[V3Write_13c_1V1], (instrs FSQRTDr)>;
+
+// FP multiply
+def : WriteRes<WriteFMul, [V3UnitV]> { let Latency = 3; }
+
+// FP multiply accumulate
+def : InstRW<[V3Wr_FMA, ReadDefault, ReadDefault, V3Rd_FMA],
+             (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+
+// FP round to integral
+def : InstRW<[V3Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$",
+                                             "^FRINT(32|64)[XZ][SD]r$")>;
+
+// §3.13 FP miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// FP convert, from gen to vec reg
+def : InstRW<[V3Write_3c_1M0], (instregex "^[SU]CVTF[SU][WX][HSD]ri$")>;
+
+// FP convert, from vec to gen reg
+def : InstRW<[V3Write_3c_1V01],
+             (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]ri?$")>;
+
+// FP convert, JavaScript from vec to gen reg
+def : SchedAlias<WriteFCvt, V3Write_3c_1V0>;
+
+// FP convert, from vec to vec reg
+def : InstRW<[V3Write_3c_1V02], (instrs FCVTSHr, FCVTDHr, FCVTHSr, FCVTDSr,
+                                          FCVTHDr, FCVTSDr, FCVTXNv1i64)>;
+
+// FP move, immed
+// FP move, register
+def : SchedAlias<WriteFImm, V3Write_2c_1V>;
+
+// FP transfer, from gen to low half of vec reg
+def : InstRW<[V3Write_0or3c_1M0],
+             (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr)>;
+
+// FP transfer, from gen to high half of vec reg
+def : InstRW<[V3Write_5c_1M0_1V], (instrs FMOVXDHighr)>;
+
+// FP transfer, from vec to gen reg
+def : SchedAlias<WriteFCopy, V3Write_2c_2V01>;
+
+// §3.14 FP load instructions
+// -----------------------------------------------------------------------------
+
+// Load vector reg, literal, S/D/Q forms
+def : InstRW<[V3Write_7c_1I_1L], (instregex "^LDR[SDQ]l$")>;
+
+// Load vector reg, unscaled immed
+def : InstRW<[V3Write_6c_1L], (instregex "^LDUR[BHSDQ]i$")>;
+
+// Load vector reg, immed post-index
+// Load vector reg, immed pre-index
+def : InstRW<[WriteAdr, V3Write_6c_1I_1L],
+             (instregex "^LDR[BHSDQ](pre|post)$")>;
+
+// Load vector reg, unsigned immed
+def : InstRW<[V3Write_6c_1L], (instregex "^LDR[BHSDQ]ui$")>;
+
+// Load vector reg, register offset, basic
+// Load vector reg, register offset, scale, S/D-form
+// Load vector reg, register offset, scale, H/Q-form
+// Load vector reg, register offset, extend
+// Load vector reg, register offset, extend, scale, S/D-form
+// Load vector reg, register offset, extend, scale, H/Q-form
+def : InstRW<[V3Write_LdrHQ, ReadAdrBase], (instregex "^LDR[BHSDQ]ro[WX]$")>;
+
+// Load vector pair, immed offset, S/D-form (pattern matches LDP and LDNP)
+def : InstRW<[V3Write_6c_1L, WriteLDHi], (instregex "^LDN?P[SD]i$")>;
+
+// Load vector pair, immed offset, Q-form (LDPQi and LDNPQi)
+def : InstRW<[V3Write_6c_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>;
+
+// Load vector pair, immed post-index, S/D-form
+// Load vector pair, immed pre-index, S/D-form
+def : InstRW<[WriteAdr, V3Write_6c_1I_1L, WriteLDHi],
+             (instregex "^LDP[SD](pre|post)$")>;
+
+// Load vector pair, immed post-index, Q-form
+// Load vector pair, immed pre-index, Q-form
+def : InstRW<[WriteAdr, V3Write_6c_2I_2L, WriteLDHi], (instrs LDPQpost,
+                                                                LDPQpre)>;
+
+// §3.15 FP store instructions
+// -----------------------------------------------------------------------------
+
+// Store vector reg, unscaled immed, B/H/S/D-form
+// Store vector reg, unscaled immed, Q-form
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STUR[BHSDQ]i$")>;
+
+// Store vector reg, immed post-index, B/H/S/D-form
+// Store vector reg, immed post-index, Q-form
+// Store vector reg, immed pre-index, B/H/S/D-form
+// Store vector reg, immed pre-index, Q-form
+def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01_1I],
+             (instregex "^STR[BHSDQ](pre|post)$")>;
+
+// Store vector reg, unsigned immed, B/H/S/D-form
+// Store vector reg, unsigned immed, Q-form
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STR[BHSDQ]ui$")>;
+
+// Store vector reg, register offset, basic, B/H/S/D-form
+// Store vector reg, register offset, basic, Q-form
+// Store vector reg, register offset, scale, H-form
+// Store vector reg, register offset, scale, S/D-form
+// Store vector reg, register offset, scale, Q-form
+// Store vector reg, register offset, extend, B/H/S/D-form
+// Store vector reg, register offset, extend, Q-form
+// Store vector reg, register offset, extend, scale, H-form
+// Store vector reg, register offset, extend, scale, S/D-form
+// Store vector reg, register offset, extend, scale, Q-form
+def : InstRW<[V3Write_StrHQ, ReadAdrBase],
+             (instregex "^STR[BHSDQ]ro[WX]$")>;
+
+// Store vector pair, immed offset, S-form
+// Store vector pair, immed offset, D-form (pattern matches STP and STNP)
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STN?P[SD]i$")>;
+
+// Store vector pair, immed offset, Q-form
+def : InstRW<[V3Write_2c_1SA_2V01], (instrs STPQi, STNPQi)>;
+
+// Store vector pair, immed post-index, S-form
+// Store vector pair, immed post-index, D-form
+// Store vector pair, immed pre-index, S-form
+// Store vector pair, immed pre-index, D-form
+def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01_1I],
+             (instregex "^STP[SD](pre|post)$")>;
+
+// Store vector pair, immed post-index, Q-form (NOTE(review): no WriteAdr entry, unlike the S/D-forms; confirm)
+def : InstRW<[V3Write_2c_1SA_2V01_1I], (instrs STPQpost)>;
+
+// Store vector pair, immed pre-index, Q-form (NOTE(review): no WriteAdr entry, unlike the S/D-forms; confirm)
+def : InstRW<[V3Write_2c_1SA_2V01_2I], (instrs STPQpre)>;
+
+// §3.16 ASIMD integer instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD absolute diff
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD arith, pair-wise
+// ASIMD compare
+// ASIMD logical
+// ASIMD max/min, basic and pair-wise
+def : SchedAlias<WriteVd, V3Write_2c_1V>;
+def : SchedAlias<WriteVq, V3Write_2c_1V>;
+
+// ASIMD absolute diff accum
+// ASIMD absolute diff accum long
+def : InstRW<[V3Wr_VA, V3Rd_VA], (instregex "^[SU]ABAL?v")>;
+
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[V3Write_3c_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
+
+// ASIMD arith, reduce, 8B/8H
+def : InstRW<[V3Write_5c_1V13_1V],
+             (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>;
+
+// ASIMD arith, reduce, 16B
+def : InstRW<[V3Write_6c_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>;
+
+// ASIMD dot product
+// ASIMD dot product using signed and unsigned integers
+def : InstRW<[V3Wr_VDOT, V3Rd_VDOT],
+             (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
+
+// ASIMD matrix multiply-accumulate (integer: SMMLA, UMMLA, USMMLA)
+def : InstRW<[V3Wr_VMMA, V3Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
+
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[V3Write_3c_1V13], (instregex "^[SU](MAX|MIN)Vv4i16v$",
+                                           "^[SU](MAX|MIN)Vv4i32v$")>;
+
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[V3Write_5c_1V13_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
+                                              "^[SU](MAX|MIN)Vv8i16v$")>;
+
+// ASIMD max/min, reduce, 16B (explicit ^ anchor, matching the 4H/4S and 8B/8H rows)
+def : InstRW<[V3Write_6c_2V13], (instregex "^[SU](MAX|MIN)Vv16i8v$")>;
+
+// ASIMD multiply
+def : InstRW<[V3Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>;
+
+// ASIMD multiply accumulate
+def : InstRW<[V3Wr_VMA, V3Rd_VMA], (instregex "^MLAv", "^MLSv")>;
+
+// ASIMD multiply accumulate high
+def : InstRW<[V3Wr_VMAH, V3Rd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
+
+// ASIMD multiply accumulate long
+def : InstRW<[V3Wr_VMAL, V3Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
+
+// ASIMD multiply accumulate saturating long (NOTE(review): no V3Rd_* entry, unlike the rows above; confirm)
+def : InstRW<[V3Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>;
+
+// ASIMD multiply/multiply long (8x8) polynomial, D-form
+// ASIMD multiply/multiply long (8x8) polynomial, Q-form
+def : InstRW<[V3Write_3c_1V], (instregex "^PMULL?(v8i8|v16i8)$")>;
+
+// ASIMD multiply long
+def : InstRW<[V3Write_3c_1V02], (instregex "^[SU]MULLv", "^SQDMULL[iv]")>;
+
+// ASIMD pairwise add and accumulate long
+def : InstRW<[V3Wr_VPA, V3Rd_VPA], (instregex "^[SU]ADALPv")>;
+
+// ASIMD shift accumulate
+def : InstRW<[V3Wr_VSA, V3Rd_VSA], (instregex "^[SU]SRA[dv]", "^[SU]RSRA[dv]")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[V3Write_2c_1V], (instregex "^SHL[dv]", "^SHLLv", "^SHRNv",
+                                         "^SSHLLv", "^SSHR[dv]", "^USHLLv",
+                                         "^USHR[dv]")>;
+
+// ASIMD shift by immed and insert, basic
+def : InstRW<[V3Write_2c_1V], (instregex "^SLI[dv]", "^SRI[dv]")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[V3Write_4c_1V],
+             (instregex "^RSHRNv", "^SQRSHRU?N[bhsv]", "^(SQSHLU?|UQSHL)[bhsd]$",
+                        "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
+                        "^SQSHRU?N[bhsv]", "^SRSHR[dv]", "^UQRSHRN[bhsv]",
+                        "^UQSHRN[bhsv]", "^URSHR[dv]")>;
+
+// ASIMD shift by register, basic
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU]SHLv")>;
+
+// ASIMD shift by register, complex
+def : InstRW<[V3Write_4c_1V],
+             (instregex "^[SU]RSHLv", "^[SU]QRSHLv",
+                        "^[SU]QSHL(v1i8|v1i16|v1i32|v1i64|v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)$")>;
+
+// §3.17 ASIMD floating-point instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD FP absolute value/difference
+// ASIMD FP arith, normal
+// ASIMD FP compare
+// ASIMD FP complex add
+// ASIMD FP max/min, normal
+// ASIMD FP max/min, pairwise
+// ASIMD FP negate
+// Handled by SchedAlias<WriteV[dq], ...>
+
+// ASIMD FP complex multiply add
+def : InstRW<[V3Wr_VFCMA, V3Rd_VFCMA], (instregex "^FCMLAv")>;
+
+// ASIMD FP convert, long (F16 to F32)
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVTL(v4|v8)i16")>;
+
+// ASIMD FP convert, long (F32 to F64)
+def : InstRW<[V3Write_3c_1V02], (instregex "^FCVTL(v2|v4)i32")>;
+
+// ASIMD FP convert, narrow (F32 to F16)
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVTN(v4|v8)i16")>;
+
+// ASIMD FP convert, narrow (F64 to F32)
+def : InstRW<[V3Write_3c_1V02], (instregex "^FCVTN(v2|v4)i32",
+                                             "^FCVTXN(v2|v4)f32")>;
+
+// ASIMD FP convert, other, D-form F32 and Q-form F64 (list also has the v1i64 and d patterns)
+def : InstRW<[V3Write_3c_1V02], (instregex "^FCVT[AMNPZ][SU]v2f(32|64)$",
+                                           "^FCVT[AMNPZ][SU]v2i(32|64)_shift$",
+                                           "^FCVT[AMNPZ][SU]v1i64$",
+                                           "^FCVTZ[SU]d$",
+                                           "^[SU]CVTFv2f(32|64)$",
+                                           "^[SU]CVTFv2i(32|64)_shift$",
+                                           "^[SU]CVTFv1i64$",
+                                           "^[SU]CVTFd$")>;
+
+// ASIMD FP convert, other, D-form F16 and Q-form F32 (list also has the v1i32 and s patterns)
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVT[AMNPZ][SU]v4f(16|32)$",
+                                           "^FCVT[AMNPZ][SU]v4i(16|32)_shift$",
+                                           "^FCVT[AMNPZ][SU]v1i32$",
+                                           "^FCVTZ[SU]s$",
+                                           "^[SU]CVTFv4f(16|32)$",
+                                           "^[SU]CVTFv4i(16|32)_shift$",
+                                           "^[SU]CVTFv1i32$",
+                                           "^[SU]CVTFs$")>;
+
+// ASIMD FP convert, other, Q-form F16 (list also has the v1f16/v1i16 and h patterns)
+def : InstRW<[V3Write_6c_4V02], (instregex "^FCVT[AMNPZ][SU]v8f16$",
+                                           "^FCVT[AMNPZ][SU]v8i16_shift$",
+                                           "^FCVT[AMNPZ][SU]v1f16$",
+                                           "^FCVTZ[SU]h$",
+                                           "^[SU]CVTFv8f16$",
+                                           "^[SU]CVTFv8i16_shift$",
+                                           "^[SU]CVTFv1i16$",
+                                           "^[SU]CVTFh$")>;
+
+// ASIMD FP divide, D-form, F16
+def : InstRW<[V3Write_9c_1V1_4rc], (instrs FDIVv4f16)>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[V3Write_9c_1V1_2rc], (instrs FDIVv2f32)>;
+
+// ASIMD FP divide, Q-form, F16
+def : InstRW<[V3Write_13c_1V1_8rc], (instrs FDIVv8f16)>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[V3Write_11c_1V1_4rc], (instrs FDIVv4f32)>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[V3Write_14c_1V1_2rc], (instrs FDIVv2f64)>;
+
+// ASIMD FP max/min, reduce, F32 and D-form F16
+def : InstRW<[V3Write_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
+
+// ASIMD FP max/min, reduce, Q-form F16
+def : InstRW<[V3Write_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
+
+// ASIMD FP multiply
+def : InstRW<[V3Wr_VFM], (instregex "^FMULv", "^FMULXv")>;
+
+// ASIMD FP multiply accumulate
+def : InstRW<[V3Wr_VFMA, V3Rd_VFMA], (instregex "^FMLAv", "^FMLSv")>;
+
+// ASIMD FP multiply accumulate long
+def : InstRW<[V3Wr_VFMAL, V3Rd_VFMAL], (instregex "^FML[AS]L2?(lane)?v")>;
+
+// ASIMD FP round, D-form F32 and Q-form F64
+def : InstRW<[V3Write_3c_1V02],
+             (instregex "^FRINT[AIMNPXZ]v2f(32|64)$",
+                        "^FRINT(32|64)[XZ]v2f(32|64)$")>;
+
+// ASIMD FP round, D-form F16 and Q-form F32
+def : InstRW<[V3Write_4c_2V02],
+             (instregex "^FRINT[AIMNPXZ]v4f(16|32)$",
+                        "^FRINT(32|64)[XZ]v4f32$")>;
+
+// ASIMD FP round, Q-form F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
+
+// ASIMD FP square root, D-form, F16
+def : InstRW<[V3Write_9c_1V1_4rc], (instrs FSQRTv4f16)>;
+
+// ASIMD FP square root, D-form, F32
+def : InstRW<[V3Write_9c_1V1_2rc], (instrs FSQRTv2f32)>;
+
+// ASIMD FP square root, Q-form, F16
+def : InstRW<[V3Write_13c_1V1_8rc], (instrs FSQRTv8f16)>;
+
+// ASIMD FP square root, Q-form, F32
+def : InstRW<[V3Write_11c_1V1_4rc], (instrs FSQRTv4f32)>;
+
+// ASIMD FP square root, Q-form, F64
+def : InstRW<[V3Write_14c_1V1_2rc], (instrs FSQRTv2f64)>;
+
+// §3.18 ASIMD BFloat16 (BF16) instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD convert, F32 to BF16
+def : InstRW<[V3Write_4c_2V02], (instrs BFCVTN, BFCVTN2)>;
+
+// ASIMD dot product
+def : InstRW<[V3Wr_VBFDOT, V3Rd_VBFDOT], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
+
+// ASIMD matrix multiply accumulate
+def : InstRW<[V3Wr_VBFMMA, V3Rd_VBFMMA], (instrs BFMMLA)>;
+
+// ASIMD multiply accumulate long (BFMLALB/BFMLALT and their Idx records)
+def : InstRW<[V3Wr_VBFMAL, V3Rd_VBFMAL], (instrs BFMLALB, BFMLALBIdx, BFMLALT,
+                                                 BFMLALTIdx)>;
+
+// Scalar convert, F32 to BF16
+def : InstRW<[V3Write_3c_1V02], (instrs BFCVT)>;
+
+// §3.19 ASIMD miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD bit reverse
+// ASIMD bitwise insert
+// ASIMD count
+// ASIMD duplicate, element
+// ASIMD extract
+// ASIMD extract narrow
+// ASIMD insert, element to element
+// ASIMD move, FP immed
+// ASIMD move, integer immed
+// ASIMD reverse
+// ASIMD table lookup extension, 1 table reg
+// ASIMD transpose
+// ASIMD unzip/zip
+// Handled by SchedAlias<WriteV[dq], ...>, except MOVID/MOVIv2d_ns mapped below
+def : InstRW<[V3Write_0or2c_1V], (instrs MOVID, MOVIv2d_ns)>;
+
+// ASIMD duplicate, gen reg
+def : InstRW<[V3Write_3c_1M0], (instregex "^DUPv.+gpr")>;
+
+// ASIMD extract narrow, saturating
+def : InstRW<[V3Write_4c_1V], (instregex "^[SU]QXTNv", "^SQXTUNv")>;
+
+// ASIMD reciprocal and square root estimate, D-form U32
+def : InstRW<[V3Write_3c_1V02], (instrs URECPEv2i32, URSQRTEv2i32)>;
+
+// ASIMD reciprocal and square root estimate, Q-form U32
+def : InstRW<[V3Write_4c_2V02], (instrs URECPEv4i32, URSQRTEv4i32)>;
+
+// ASIMD reciprocal and square root estimate, D-form F32 and scalar forms
+def : InstRW<[V3Write_3c_1V02], (instrs FRECPEv1f16, FRECPEv1i32,
+                                        FRECPEv1i64, FRECPEv2f32,
+                                        FRSQRTEv1f16, FRSQRTEv1i32,
+                                        FRSQRTEv1i64, FRSQRTEv2f32)>;
+
+// ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32
+def : InstRW<[V3Write_4c_2V02], (instrs FRECPEv4f16, FRECPEv4f32,
+                                        FRSQRTEv4f16, FRSQRTEv4f32)>;
+
+// ASIMD reciprocal and square root estimate, Q-form F16
+def : InstRW<[V3Write_6c_4V02], (instrs FRECPEv8f16, FRSQRTEv8f16)>;
+
+// ASIMD reciprocal exponent
+def : InstRW<[V3Write_3c_1V02], (instregex "^FRECPXv")>;
+
+// ASIMD reciprocal step (scalar 16/32/64 records and vector records)
+def : InstRW<[V3Write_4c_1V], (instregex "^FRECPS(16|32|64|v)",
+                                         "^FRSQRTS(16|32|64|v)")>;
+
+// ASIMD table lookup, 1 or 2 table regs
+def : InstRW<[V3Write_2c_1V], (instrs TBLv8i8One, TBLv16i8One,
+                                      TBLv8i8Two, TBLv16i8Two)>;
+
+// ASIMD table lookup, 3 table regs
+def : InstRW<[V3Write_4c_2V], (instrs TBLv8i8Three, TBLv16i8Three)>;
+
+// ASIMD table lookup, 4 table regs
+def : InstRW<[V3Write_4c_3V], (instrs TBLv8i8Four, TBLv16i8Four)>;
+
+// ASIMD table lookup extension, 2 table regs
+def : InstRW<[V3Write_4c_2V], (instrs TBXv8i8Two, TBXv16i8Two)>;
+
+// ASIMD table lookup extension, 3 table regs
+def : InstRW<[V3Write_6c_3V], (instrs TBXv8i8Three, TBXv16i8Three)>;
+
+// ASIMD table lookup extension, 4 table regs
+def : InstRW<[V3Write_6c_5V], (instrs TBXv8i8Four, TBXv16i8Four)>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[V3Write_2c_2V01], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, gen reg to element
+def : InstRW<[V3Write_5c_1M0_1V], (instregex "^INSvi(8|16|32|64)gpr$")>;
+
+// §3.20 ASIMD load instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form (each _POST record in this section also lists WriteAdr first)
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_1L],
+             (instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_1L],
+             (instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+def : InstRW<[V3Write_6c_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_2L],
+             (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[V3Write_6c_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_2L],
+             (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+def : InstRW<[V3Write_6c_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_3L],
+             (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[V3Write_6c_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_3L],
+             (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+def : InstRW<[V3Write_7c_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_7c_4L],
+             (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[V3Write_7c_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_7c_4L],
+             (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[V3Write_8c_1L_1V],           (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_1V], (instregex "LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+def : InstRW<[V3Write_8c_1L_1V],           (instregex "LD1Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[V3Write_8c_1L_1V],           (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_8c_1L_2V],           (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_2V], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 2 element, multiple, Q-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[V3Write_8c_2L_2V],           (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[V3Write_8c_1L_2V],           (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_2V], (instregex "LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+def : InstRW<[V3Write_8c_1L_2V],            (instregex "LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_2V],  (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[V3Write_8c_1L_2V],           (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_8c_2L_3V],           (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_2L_3V], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[V3Write_8c_3L_3V],           (instregex "LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[V3Write_8c_2L_3V],           (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_2L_3V], (instregex "LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+def : InstRW<[V3Write_8c_2L_3V],           (instregex "LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[V3Write_8c_3L_3V],           (instregex "LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_8c_3L_4V],           (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_3L_4V], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[V3Write_9c_6L_4V],           (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_9c_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[V3Write_8c_3L_4V],           (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_3L_4V], (instregex "LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+def : InstRW<[V3Write_8c_3L_4V],           (instregex "LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[V3Write_8c_4L_4V],           (instregex "LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_8c_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>;
+
+// §3.21 ASIMD store instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form (each _POST record in this section also lists WriteAdr first)
+def : InstRW<[V3Write_2c_1SA_1V01],           (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[V3Write_2c_1SA_1V01],           (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+def : InstRW<[V3Write_2c_1SA_1V01],           (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_1SA_1V01], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[V3Write_2c_2SA_2V01],           (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_2SA_2V01], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+def : InstRW<[V3Write_2c_2SA_2V01],           (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_2SA_2V01], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[V3Write_2c_3SA_3V01],           (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_3SA_3V01], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+def : InstRW<[V3Write_2c_2SA_2V01],           (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_2SA_2V01], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[V3Write_2c_4SA_4V01],           (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_2c_4SA_4V01], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[V3Write_4c_1SA_2V01],           (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_4c_1SA_2V01], (instregex "ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_4c_1SA_2V01],           (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3Write_4c_1SA_2V01], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[V3Write_4c_2SA_4V01],           (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_4c_2SA_4V01], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[V3Write_4c_1SA_2V01],           (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_4c_1SA_2V01], (instregex "ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_5c_2SA_4V01],           (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3Write_5c_2SA_4V01], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[V3Write_6c_3SA_6V01],           (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_3SA_6V01], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[V3Write_5c_2SA_4V01],           (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3Write_5c_2SA_4V01], (instregex "ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+def : InstRW<[V3Write_6c_2SA_6V01],           (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_2SA_6V01], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+def : InstRW<[V3Write_7c_4SA_12V01],           (instregex "ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[WriteAdr, V3Write_7c_4SA_12V01], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[V3Write_5c_4SA_8V01],           (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[WriteAdr, V3Write_5c_4SA_8V01], (instregex "ST4Fourv(2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H/S
+def : InstRW<[V3Write_6c_1SA_3V01],           (instregex "ST4i(8|16|32)$")>;
+def : InstRW<[WriteAdr, V3Write_6c_1SA_3V01], (instregex "ST4i(8|16|32)_POST$")>;
+
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[V3Write_4c_2SA_4V01],            (instregex "ST4i(64)$")>;
+def : InstRW<[WriteAdr, V3Write_4c_2SA_4V01],  (instregex "ST4i(64)_POST$")>;
+
+// §3.22 Cryptography extensions
+// -----------------------------------------------------------------------------
+
+// Crypto AES ops
+def : InstRW<[V3Write_2c_1V], (instregex "^AES[DE]rr$", "^AESI?MCrr")>;
+
+// Crypto polynomial (64x64) multiply long
+def : InstRW<[V3Write_2c_1V], (instrs PMULLv1i64, PMULLv2i64)>;
+
+// Crypto SHA1 hash acceleration op
+// Crypto SHA1 schedule acceleration ops
+def : InstRW<[V3Write_2c_1V0], (instregex "^SHA1(H|SU0|SU1)")>;
+
+// Crypto SHA1 hash acceleration ops
+// Crypto SHA256 hash acceleration ops
+def : InstRW<[V3Write_4c_1V0], (instregex "^SHA1[CMP]", "^SHA256H2?")>;
+
+// Crypto SHA256 schedule acceleration ops
+def : InstRW<[V3Write_2c_1V0], (instregex "^SHA256SU[01]")>;
+
+// Crypto SHA512 hash acceleration ops
+def : InstRW<[V3Write_2c_1V0], (instregex "^SHA512(H|H2|SU0|SU1)")>;
+
+// Crypto SHA3 ops
+def : InstRW<[V3Write_2c_1V], (instrs BCAX, EOR3, RAX1, XAR)>;
+
+// Crypto SM3 ops
+def : InstRW<[V3Write_2c_1V0], (instregex "^SM3PARTW[12]$", "^SM3SS1$",
+                                          "^SM3TT[12][AB]$")>;
+
+// Crypto SM4 ops
+def : InstRW<[V3Write_4c_1V0], (instrs SM4E, SM4ENCKEY)>;
+
+// §3.23 CRC
+// -----------------------------------------------------------------------------
+
+def : InstRW<[V3Wr_CRC, V3Rd_CRC], (instregex "^CRC32")>;
+
+// §3.24 SVE Predicate instructions
+// -----------------------------------------------------------------------------
+
+// Loop control, based on predicate
+def : InstRW<[V3Write_2or3c_1M], (instrs BRKA_PPmP, BRKA_PPzP,
+                                         BRKB_PPmP, BRKB_PPzP)>;
+
+// Loop control, based on predicate and flag setting
+def : InstRW<[V3Write_2or3c_1M], (instrs BRKAS_PPzP, BRKBS_PPzP)>;
+
+// Loop control, propagating
+def : InstRW<[V3Write_2or3c_1M], (instrs BRKN_PPzP, BRKPA_PPzPP,
+                                         BRKPB_PPzPP)>;
+
+// Loop control, propagating and flag setting
+def : InstRW<[V3Write_2or3c_1M], (instrs BRKNS_PPzP, BRKPAS_PPzPP,
+                                         BRKPBS_PPzPP)>;
+
+// Loop control, based on GPR
+def : InstRW<[V3Write_3c_2M],
+             (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>;
+def : InstRW<[V3Write_3c_2M], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>;
+
+// Loop terminate
+def : InstRW<[V3Write_1c_2M], (instregex "^CTERM(EQ|NE)_(WW|XX)")>;
+
+// Predicate counting scalar
+def : InstRW<[V3Write_2c_1M], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>;
+def : InstRW<[V3Write_2c_1M],
+             (instregex "^(CNT|SQDEC|SQINC|UQDEC|UQINC)[BHWD]_XPiI",
+                        "^SQ(DEC|INC)[BHWD]_XPiWdI",
+                        "^UQ(DEC|INC)[BHWD]_WPiI")>;
+
+// Predicate counting scalar, ALL, {1,2,4}
+def : InstRW<[V3Write_IncDec], (instregex "^(DEC|INC)[BHWD]_XPiI")>;
+
+// Predicate counting scalar, active predicate
+def : InstRW<[V3Write_2c_1M],
+             (instregex "^CNTP_XPP_[BHSD]",
+                        "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]",
+                        "^(UQDEC|UQINC)P_WP_[BHSD]",
+                        "^(SQDEC|SQINC)P_XPWd_[BHSD]")>;
+
+// Predicate counting vector, active predicate
+def : InstRW<[V3Write_7c_1M_1M0_1V],
+             (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>;
+
+// Predicate logical
+def : InstRW<[V3Write_1or2c_1M],
+             (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>;
+
+// Predicate logical, flag setting
+def : InstRW<[V3Write_1or2c_1M],
+             (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>;
+
+// Predicate reverse
+def : InstRW<[V3Write_2c_1M], (instregex "^REV_PP_[BHSD]")>;
+
+// Predicate select
+def : InstRW<[V3Write_1c_1M], (instrs SEL_PPPP)>;
+
+// Predicate set
+def : InstRW<[V3Write_2c_1M], (instregex "^PFALSE", "^PTRUE_[BHSD]")>;
+
+// Predicate set/initialize, set flags
+def : InstRW<[V3Write_2c_1M], (instregex "^PTRUES_[BHSD]")>;
+
+// Predicate find first/next
+def : InstRW<[V3Write_2c_1M], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>;
+
+// Predicate test
+def : InstRW<[V3Write_1c_1M], (instrs PTEST_PP)>;
+
+// Predicate transpose
+def : InstRW<[V3Write_2c_1M], (instregex "^TRN[12]_PPP_[BHSD]")>;
+
+// Predicate unpack and widen
+def : InstRW<[V3Write_2c_1M], (instrs PUNPKHI_PP, PUNPKLO_PP)>;
+
+// Predicate zip/unzip
+def : InstRW<[V3Write_2c_1M], (instregex "^(ZIP|UZP)[12]_PPP_[BHSD]")>;
+
+// §3.25 SVE integer instructions
+// -----------------------------------------------------------------------------
+
+// Arithmetic, absolute diff
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]",
+                                         "^[SU]ABD_ZPZZ_[BHSD]")>;
+
+// Arithmetic, absolute diff accum
+def : InstRW<[V3Wr_ZA, V3Rd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]")>;
+
+// Arithmetic, absolute diff accum long
+def : InstRW<[V3Wr_ZA, V3Rd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>;
+
+// Arithmetic, absolute diff long
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>;
+
+// Arithmetic, basic
+def : InstRW<[V3Write_2c_1V],
+             (instregex "^(ABS|ADD|CNOT|NEG|SUB|SUBR)_ZPmZ_[BHSD]",
+                        "^(ADD|SUB)_ZZZ_[BHSD]",
+                        "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]",
+                        "^(ADD|SUB|SUBR)_ZI_[BHSD]",
+                        "^ADR_[SU]XTW_ZZZ_D_[0123]",
+                        "^ADR_LSL_ZZZ_[SD]_[0123]",
+                        "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
+                        "^SADDLBT_ZZZ_[HSD]",
+                        "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
+                        "^SSUBL(BT|TB)_ZZZ_[HSD]")>;
+
+// Arithmetic, complex
+def : InstRW<[V3Write_2c_1V],
+             (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]",
+                        "^SQ(ABS|ADD|NEG|SUB|SUBR)_ZPmZ_[BHSD]",
+                        "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]",
+                        "^[SU]Q(ADD|SUB)_ZI_[BHSD]",
+                        "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]",
+                        "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>;
+
+// Arithmetic, large integer
+def : InstRW<[V3Write_2c_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>;
+
+// Arithmetic, pairwise add
+def : InstRW<[V3Write_2c_1V], (instregex "^ADDP_ZPmZ_[BHSD]")>;
+
+// Arithmetic, pairwise add and accum long
+def : InstRW<[V3Wr_ZPA, ReadDefault, V3Rd_ZPA],
+             (instregex "^[SU]ADALP_ZPmZ_[HSD]")>;
+
+// Arithmetic, shift
+def : InstRW<[V3Write_2c_1V13],
+             (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]",
+                        "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]",
+                        "^(ASR|LSL|LSR)_ZPmI_[BHSD]",
+                        "^(ASR|LSL|LSR)_ZPmZ_[BHSD]",
+                        "^(ASR|LSL|LSR)_ZZI_[BHSD]",
+                        "^(ASR|LSL|LSR)_ZPZ[IZ]_[BHSD]",
+                        "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
+
+// Arithmetic, shift and accumulate
+def : InstRW<[V3Wr_ZSA, V3Rd_ZSA], (instregex "^[SU]R?SRA_ZZI_[BHSD]")>;
+
+// Arithmetic, shift by immediate
+def : InstRW<[V3Write_2c_1V], (instregex "^SHRN[BT]_ZZI_[BHS]",
+                                         "^[SU]SHLL[BT]_ZZI_[HSD]")>;
+
+// Arithmetic, shift by immediate and insert
+def : InstRW<[V3Write_2c_1V], (instregex "^(SLI|SRI)_ZZI_[BHSD]")>;
+
+// Arithmetic, shift complex
+def : InstRW<[V3Write_4c_1V],
+             (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]",
+                        "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_ZPmZ_[BHSD]",
+                        "^[SU]QR?SHL_ZPZZ_[BHSD]",
+                        "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]",
+                        "^SQSHRU?N[BT]_ZZI_[BHS]",
+                        "^UQR?SHRN[BT]_ZZI_[BHS]")>;
+
+// Arithmetic, shift right for divide
+def : InstRW<[V3Write_4c_1V], (instregex "^ASRD_(ZPmI|ZPZI)_[BHSD]")>;
+
+// Arithmetic, shift rounding
+def : InstRW<[V3Write_4c_1V], (instregex "^[SU]RSHLR?_ZPmZ_[BHSD]",
+                                         "^[SU]RSHL_ZPZZ_[BHSD]",
+                                         "^[SU]RSHR_(ZPmI|ZPZI)_[BHSD]")>;
+
+// Bit manipulation
+def : InstRW<[V3Write_6c_2V1], (instregex "^(BDEP|BEXT|BGRP)_ZZZ_[BHSD]")>;
+
+// Bitwise select
+def : InstRW<[V3Write_2c_1V], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>;
+
+// Count/reverse bits
+def : InstRW<[V3Write_2c_1V], (instregex "^(CLS|CLZ|CNT|RBIT)_ZPmZ_[BHSD]")>;
+
+// Broadcast logical bitmask immediate to vector
+def : InstRW<[V3Write_2c_1V], (instrs DUPM_ZI)>;
+
+// Compare and set flags
+def : InstRW<[V3Write_2or3c_1V0],
+             (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]",
+                        "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>;
+
+// Complex add
+def : InstRW<[V3Write_2c_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]")>;
+
+// Complex dot product 8-bit element
+def : InstRW<[V3Wr_ZDOTB, V3Rd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
+
+// Complex dot product 16-bit element
+def : InstRW<[V3Wr_ZDOTH, V3Rd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
+
+// Complex multiply-add B, H, S element size
+def : InstRW<[V3Wr_ZCMABHS, V3Rd_ZCMABHS], (instregex "^CMLA_ZZZ_[BHS]",
+                                                      "^CMLA_ZZZI_[HS]")>;
+
+// Complex multiply-add D element size
+def : InstRW<[V3Wr_ZCMAD, V3Rd_ZCMAD], (instrs CMLA_ZZZ_D)>;
+
+// Conditional extract operations, scalar form
+def : InstRW<[V3Write_8c_1M0_1V01], (instregex "^CLAST[AB]_RPZ_[BHSD]")>;
+
+// Conditional extract operations, SIMD&FP scalar and vector forms
+def : InstRW<[V3Write_3c_1V1], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]",
+                                          "^COMPACT_ZPZ_[SD]",
+                                          "^SPLICE_ZPZZ?_[BHSD]")>;
+
+// Convert to floating point, 64b to float or convert to double
+def : InstRW<[V3Write_3c_1V02], (instregex "^[SU]CVTF_ZPmZ_Dto[HSD]",
+                                           "^[SU]CVTF_ZPmZ_StoD")>;
+
+// Convert to floating point, 32b to single or half
+def : InstRW<[V3Write_4c_2V02], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>;
+
+// Convert to floating point, 16b to half
+def : InstRW<[V3Write_6c_4V02], (instregex "^[SU]CVTF_ZPmZ_HtoH")>;
+
+// Copy, scalar
+def : InstRW<[V3Write_5c_1M0_1V], (instregex "^CPY_ZPmR_[BHSD]")>;
+
+// Copy, scalar SIMD&FP or imm
+def : InstRW<[V3Write_2c_1V], (instregex "^CPY_ZPm[IV]_[BHSD]",
+                                         "^CPY_ZPzI_[BHSD]")>;
+
+// Divides, 32 bit
+def : InstRW<[V3Write_12c_1V0], (instregex "^[SU]DIVR?_ZPmZ_S",
+                                           "^[SU]DIV_ZPZZ_S")>;
+
+// Divides, 64 bit
+def : InstRW<[V3Write_20c_1V0], (instregex "^[SU]DIVR?_ZPmZ_D",
+                                           "^[SU]DIV_ZPZZ_D")>;
+
+// Dot product, 8 bit
+def : InstRW<[V3Wr_ZDOTB, V3Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS")>;
+
+// Dot product, 8 bit, using signed and unsigned integers
+def : InstRW<[V3Wr_ZDOTB, V3Rd_ZDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
+
+// Dot product, 16 bit
+def : InstRW<[V3Wr_ZDOTH, V3Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD")>;
+
+// Duplicate, immediate and indexed form
+def : InstRW<[V3Write_2c_1V], (instregex "^DUP_ZI_[BHSD]",
+                                         "^DUP_ZZI_[BHSDQ]")>;
+
+// Duplicate, scalar form
+def : InstRW<[V3Write_3c_1M0], (instregex "^DUP_ZR_[BHSD]")>;
+
+// Extend, sign or zero
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU]XTB_ZPmZ_[HSD]",
+                                         "^[SU]XTH_ZPmZ_[SD]",
+                                         "^[SU]XTW_ZPmZ_D")>;
+
+// Extract
+def : InstRW<[V3Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>;
+
+// Extract narrow saturating
+def : InstRW<[V3Write_4c_1V], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
+                                         "^SQXTUN[BT]_ZZ_[BHS]")>;
+
+// Extract operation, SIMD and FP scalar form
+def : InstRW<[V3Write_3c_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]")>;
+
+// Extract operation, scalar
+def : InstRW<[V3Write_6c_1V1_1M0], (instregex "^LAST[AB]_RPZ_[BHSD]")>;
+
+// Histogram operations
+def : InstRW<[V3Write_2c_1V], (instregex "^HISTCNT_ZPzZZ_[SD]",
+                                         "^HISTSEG_ZZZ")>;
+
+// Horizontal operations, B, H, S form, immediate operands only
+def : InstRW<[V3Write_4c_1V02], (instregex "^INDEX_II_[BHS]")>;
+
+// Horizontal operations, B, H, S form, scalar, immediate operands/ scalar
+// operands only / immediate, scalar operands
+def : InstRW<[V3Write_7c_1M0_1V02], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>;
+
+// Horizontal operations, D form, immediate operands only
+def : InstRW<[V3Write_5c_2V02], (instrs INDEX_II_D)>;
+
+// Horizontal operations, D form, scalar, immediate operands/ scalar operands
+// only / immediate, scalar operands
+def : InstRW<[V3Write_8c_2M0_2V02], (instregex "^INDEX_(IR|RI|RR)_D")>;
+
+// Insert operation, SIMD and FP scalar form
+def : InstRW<[V3Write_2c_1V], (instregex "^INSR_ZV_[BHSD]")>;
+
+// Insert operation, scalar
+def : InstRW<[V3Write_5c_1V1_1M0], (instregex "^INSR_ZR_[BHSD]")>;
+
+// Logical
+def : InstRW<[V3Write_2c_1V],
+             (instregex "^(AND|EOR|ORR)_ZI",
+                        "^(AND|BIC|EOR|ORR)_ZZZ",
+                        "^EOR(BT|TB)_ZZZ_[BHSD]",
+                        "^(AND|BIC|EOR|NOT|ORR)_(ZPmZ|ZPZZ)_[BHSD]",
+                        "^NOT_ZPmZ_[BHSD]")>;
+
+// Max/min, basic and pairwise
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
+                                         "^[SU](MAX|MIN)P?_ZPmZ_[BHSD]",
+                                         "^[SU](MAX|MIN)_ZPZZ_[BHSD]")>;
+
+// Matching operations
+// FIXME: SOG p. 44, n. 5: If the consuming instruction has a flag source, the
+// latency for this instruction is 4 cycles.
+def : InstRW<[V3Write_2or3c_1V0_1M], (instregex "^N?MATCH_PPzZZ_[BH]")>;
+
+// Matrix multiply-accumulate
+def : InstRW<[V3Wr_ZMMA, V3Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+
+// Move prefix
+def : InstRW<[V3Write_2c_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]",
+                                         "^MOVPRFX_ZZ")>;
+
+// Multiply, B, H, S element size
+def : InstRW<[V3Write_4c_1V02], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
+                                           "^MUL_ZPZZ_[BHS]",
+                                           "^[SU]MULH_(ZPmZ|ZZZ)_[BHS]",
+                                           "^[SU]MULH_ZPZZ_[BHS]")>;
+
+// Multiply, D element size
+def : InstRW<[V3Write_5c_2V02], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D",
+                                           "^MUL_ZPZZ_D",
+                                           "^[SU]MULH_(ZPmZ|ZZZ)_D",
+                                           "^[SU]MULH_ZPZZ_D")>;
+
+// Multiply long
+def : InstRW<[V3Write_4c_1V02], (instregex "^[SU]MULL[BT]_ZZZI_[SD]",
+                                           "^[SU]MULL[BT]_ZZZ_[HSD]")>;
+
+// Multiply accumulate, B, H, S element size
+def : InstRW<[V3Wr_ZMABHS, V3Rd_ZMABHS],
+             (instregex "^ML[AS]_ZZZI_[HS]", "^ML[AS]_ZPZZZ_[BHS]")>;
+def : InstRW<[V3Wr_ZMABHS, ReadDefault, V3Rd_ZMABHS],
+             (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>;
+
+// Multiply accumulate, D element size
+def : InstRW<[V3Wr_ZMAD, V3Rd_ZMAD],
+             (instregex "^ML[AS]_ZZZI_D", "^ML[AS]_ZPZZZ_D")>;
+def : InstRW<[V3Wr_ZMAD, ReadDefault, V3Rd_ZMAD],
+             (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
+
+// Multiply accumulate long
+def : InstRW<[V3Wr_ZMAL, V3Rd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]",
+                                                "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>;
+
+// Multiply accumulate saturating doubling long regular
+def : InstRW<[V3Wr_ZMASQL, V3Rd_ZMASQ],
+             (instregex "^SQDML[AS]L(B|T|BT)_ZZZ_[HSD]",
+                        "^SQDML[AS]L[BT]_ZZZI_[SD]")>;
+
+// Multiply saturating doubling high, B, H, S element size
+def : InstRW<[V3Write_4c_1V02], (instregex "^SQDMULH_ZZZ_[BHS]",
+                                           "^SQDMULH_ZZZI_[HS]")>;
+
+// Multiply saturating doubling high, D element size
+def : InstRW<[V3Write_5c_2V02], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>;
+
+// Multiply saturating doubling long
+def : InstRW<[V3Write_4c_1V02], (instregex "^SQDMULL[BT]_ZZZ_[HSD]",
+                                           "^SQDMULL[BT]_ZZZI_[SD]")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, B, H, S
+// element size
+def : InstRW<[V3Wr_ZMASQBHS, V3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]",
+                                                     "^SQRDCMLAH_ZZZ_[BHS]",
+                                                     "^SQRDML[AS]H_ZZZI_[HS]",
+                                                     "^SQRDCMLAH_ZZZI_[HS]")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, D element
+// size
+def : InstRW<[V3Wr_ZMASQD, V3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D",
+                                                   "^SQRDCMLAH_ZZZ_D")>;
+
+// Multiply saturating rounding doubling regular/complex, B, H, S element size
+def : InstRW<[V3Write_4c_1V02], (instregex "^SQRDMULH_ZZZ_[BHS]",
+                                           "^SQRDMULH_ZZZI_[HS]")>;
+
+// Multiply saturating rounding doubling regular/complex, D element size
+def : InstRW<[V3Write_5c_2V02], (instregex "^SQRDMULH_ZZZI?_D")>;
+
+// Multiply/multiply long, (8x8) polynomial
+def : InstRW<[V3Write_2c_1V], (instregex "^PMUL_ZZZ_B",
+                                         "^PMULL[BT]_ZZZ_[HDQ]")>;
+
+// Predicate counting vector
+def : InstRW<[V3Write_2c_1V], (instregex "^([SU]Q)?(DEC|INC)[HWD]_ZPiI")>;
+
+// Reciprocal estimate
+def : InstRW<[V3Write_4c_2V02], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>;
+
+// Reduction, arithmetic, B form
+def : InstRW<[V3Write_9c_2V_4V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>;
+
+// Reduction, arithmetic, H form
+def : InstRW<[V3Write_8c_2V_2V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>;
+
+// Reduction, arithmetic, S form
+def : InstRW<[V3Write_6c_2V_2V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>;
+
+// Reduction, arithmetic, D form
+def : InstRW<[V3Write_4c_2V], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>;
+
+// Reduction, logical
+def : InstRW<[V3Write_6c_1V_1V13], (instregex "^(AND|EOR|OR)V_VPZ_[BHSD]")>;
+
+// Reverse, vector
+def : InstRW<[V3Write_2c_1V], (instregex "^REV_ZZ_[BHSD]",
+                                         "^REVB_ZPmZ_[HSD]",
+                                         "^REVH_ZPmZ_[SD]",
+                                         "^REVW_ZPmZ_D")>;
+
+// Select, vector form
+def : InstRW<[V3Write_2c_1V], (instregex "^SEL_ZPZZ_[BHSD]")>;
+
+// Table lookup
+def : InstRW<[V3Write_2c_1V], (instregex "^TBL_ZZZZ?_[BHSD]")>;
+
+// Table lookup extension
+def : InstRW<[V3Write_2c_1V], (instregex "^TBX_ZZZ_[BHSD]")>;
+
+// Transpose, vector form
+def : InstRW<[V3Write_2c_1V], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>;
+
+// Unpack and extend
+def : InstRW<[V3Write_2c_1V], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>;
+
+// Zip/unzip
+def : InstRW<[V3Write_2c_1V], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>;
+
+// §3.26 SVE floating-point instructions
+// -----------------------------------------------------------------------------
+
+// Floating point absolute value/difference
+def : InstRW<[V3Write_2c_1V], (instregex "^FABD_ZPmZ_[HSD]",
+                                         "^FABD_ZPZZ_[HSD]",
+                                         "^FABS_ZPmZ_[HSD]")>;
+
+// Floating point arithmetic
+def : InstRW<[V3Write_2c_1V], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]",
+                                         "^F(ADD|SUB)_ZPZ[IZ]_[HSD]",
+                                         "^FADDP_ZPmZZ_[HSD]",
+                                         "^FNEG_ZPmZ_[HSD]",
+                                         "^FSUBR_ZPm[IZ]_[HSD]",
+                                         "^FSUBR_(ZPZI|ZPZZ)_[HSD]")>;
+
+// Floating point associative add, F16
+def : InstRW<[V3Write_10c_1V1_9rc], (instrs FADDA_VPZ_H)>;
+
+// Floating point associative add, F32
+def : InstRW<[V3Write_6c_1V1_5rc], (instrs FADDA_VPZ_S)>;
+
+// Floating point associative add, F64
+def : InstRW<[V3Write_4c_1V], (instrs FADDA_VPZ_D)>;
+
+// Floating point compare
+def : InstRW<[V3Write_2c_1V0], (instregex "^FACG[ET]_PPzZZ_[HSD]",
+                                          "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
+                                          "^FCM(LE|LT)_PPzZ0_[HSD]",
+                                          "^FCMUO_PPzZZ_[HSD]")>;
+
+// Floating point complex add
+def : InstRW<[V3Write_3c_1V], (instregex "^FCADD_ZPmZ_[HSD]")>;
+
+// Floating point complex multiply add
+def : InstRW<[V3Wr_ZFCMA, ReadDefault, V3Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>;
+def : InstRW<[V3Wr_ZFCMA, V3Rd_ZFCMA],              (instregex "^FCMLA_ZZZI_[HS]")>;
+
+// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
+                                           "^FCVTLT_ZPmZ_HtoS",
+                                           "^FCVTNT_ZPmZ_StoH")>;
+
+// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32
+// or F64 to F16)
+def : InstRW<[V3Write_3c_1V02], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)",
+                                           "^FCVTLT_ZPmZ_StoD",
+                                           "^FCVTNT_ZPmZ_DtoS")>;
+
+// Floating point convert, round to odd
+def : InstRW<[V3Write_3c_1V02], (instrs FCVTX_ZPmZ_DtoS, FCVTXNT_ZPmZ_DtoS)>;
+
+// Floating point base2 log, F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>;
+
+// Floating point base2 log, F32
+def : InstRW<[V3Write_4c_2V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>;
+
+// Floating point base2 log, F64
+def : InstRW<[V3Write_3c_1V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>;
+
+// Floating point convert to integer, F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>;
+
+// Floating point convert to integer, F32
+def : InstRW<[V3Write_4c_2V02], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>;
+
+// Floating point convert to integer, F64
+def : InstRW<[V3Write_3c_1V02],
+             (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>;
+
+// Floating point copy
+def : InstRW<[V3Write_2c_1V], (instregex "^FCPY_ZPmI_[HSD]",
+                                         "^FDUP_ZI_[HSD]")>;
+
+// Floating point divide, F16
+def : InstRW<[V3Write_13c_1V1_8rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>;
+
+// Floating point divide, F32
+def : InstRW<[V3Write_11c_1V1_4rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>;
+
+// Floating point divide, F64
+def : InstRW<[V3Write_14c_1V1_2rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>;
+
+// Floating point min/max pairwise
+def : InstRW<[V3Write_2c_1V], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>;
+
+// Floating point min/max
+def : InstRW<[V3Write_2c_1V], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]",
+                                         "^F(MAX|MIN)(NM)?_ZPZ[IZ]_[HSD]")>;
+
+// Floating point multiply
+def : InstRW<[V3Write_3c_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
+                                         "^FMULX_ZPZZ_[HSD]",
+                                         "^FMUL_(ZPm[IZ]|ZZZI?)_[HSD]",
+                                         "^FMUL_ZPZ[IZ]_[HSD]")>;
+
+// Floating point multiply accumulate
+def : InstRW<[V3Wr_ZFMA, ReadDefault, V3Rd_ZFMA],
+             (instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
+                        "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
+def : InstRW<[V3Wr_ZFMA, V3Rd_ZFMA],
+             (instregex "^FML[AS]_ZZZI_[HSD]",
+                        "^FN?ML[AS]_ZPZZZ_[HSD]")>;
+
+// Floating point multiply add/sub accumulate long
+def : InstRW<[V3Wr_ZFMAL, V3Rd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>;
+
+// Floating point reciprocal estimate, F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>;
+
+// Floating point reciprocal estimate, F32
+def : InstRW<[V3Write_4c_2V02], (instregex "^FR(ECP|SQRT)E_ZZ_S", "^FRECPX_ZPmZ_S")>;
+
+// Floating point reciprocal estimate, F64
+def : InstRW<[V3Write_3c_1V02], (instregex "^FR(ECP|SQRT)E_ZZ_D", "^FRECPX_ZPmZ_D")>;
+
+// Floating point reciprocal step
+def : InstRW<[V3Write_4c_1V], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
+
+// Floating point reduction, F16
+def : InstRW<[V3Write_8c_4V],
+             (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_H")>;
+
+// Floating point reduction, F32
+def : InstRW<[V3Write_6c_3V],
+             (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_S")>;
+
+// Floating point reduction, F64
+def : InstRW<[V3Write_4c_2V],
+             (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_D")>;
+
+// Floating point round to integral, F16
+def : InstRW<[V3Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>;
+
+// Floating point round to integral, F32
+def : InstRW<[V3Write_4c_2V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>;
+
+// Floating point round to integral, F64
+def : InstRW<[V3Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>;
+
+// Floating point square root, F16
+def : InstRW<[V3Write_13c_1V1_8rc], (instregex "^FSQRT_ZPmZ_H")>;
+
+// Floating point square root, F32
+def : InstRW<[V3Write_11c_1V1_4rc], (instregex "^FSQRT_ZPmZ_S")>;
+
+// Floating point square root, F64
+def : InstRW<[V3Write_14c_1V1_2rc], (instregex "^FSQRT_ZPmZ_D")>;
+
+// Floating point trigonometric exponentiation
+def : InstRW<[V3Write_3c_1V1], (instregex "^FEXPA_ZZ_[HSD]")>;
+
+// Floating point trigonometric multiply add
+def : InstRW<[V3Write_4c_1V], (instregex "^FTMAD_ZZI_[HSD]")>;
+
+// Floating point trigonometric, miscellaneous
+def : InstRW<[V3Write_3c_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]")>;
+
+// §3.27 SVE BFloat16 (BF16) instructions
+// -----------------------------------------------------------------------------
+
+// Convert, F32 to BF16
+def : InstRW<[V3Write_4c_1V02], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
+
+// Dot product
+def : InstRW<[V3Wr_ZBFDOT, V3Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
+
+// Matrix multiply accumulate
+def : InstRW<[V3Wr_ZBFMMA, V3Rd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>;
+
+// Multiply accumulate long
+def : InstRW<[V3Wr_ZBFMAL, V3Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZI?")>;
+
+// §3.28 SVE Load instructions
+// -----------------------------------------------------------------------------
+
+// Load vector
+def : InstRW<[V3Write_6c_1L], (instrs LDR_ZXI)>;
+
+// Load predicate
+def : InstRW<[V3Write_6c_1L_1M], (instrs LDR_PXI)>;
+
+// Contiguous load, scalar + imm
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1[BHWD]_IMM$",
+                                         "^LD1S?B_[HSD]_IMM$",
+                                         "^LD1S?H_[SD]_IMM$",
+                                         "^LD1S?W_D_IMM$")>;
+// Contiguous load, scalar + scalar
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1[BHWD]$",
+                                         "^LD1S?B_[HSD]$",
+                                         "^LD1S?H_[SD]$",
+                                         "^LD1S?W_D$")>;
+
+// Contiguous load broadcast, scalar + imm
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1R[BHWD]_IMM$",
+                                         "^LD1RS?B_[HSD]_IMM$",
+                                         "^LD1RS?H_[SD]_IMM$",
+                                         "^LD1RW_D_IMM$",
+                                         "^LD1RSW_IMM$",
+                                         "^LD1RQ_[BHWD]_IMM$")>;
+
+// Contiguous load broadcast, scalar + scalar
+def : InstRW<[V3Write_6c_1L], (instregex "^LD1RQ_[BHWD]$")>;
+
+// Non temporal load, scalar + imm
+// Non temporal load, scalar + scalar
+def : InstRW<[V3Write_6c_1L], (instregex "^LDNT1[BHWD]_ZR[IR]$")>;
+
+// Non temporal gather load, vector + scalar 32-bit element size
+def : InstRW<[V3Write_9c_2L_4V], (instregex "^LDNT1[BHW]_ZZR_S$",
+                                            "^LDNT1S[BH]_ZZR_S$")>;
+
+// Non temporal gather load, vector + scalar 64-bit element size
+def : InstRW<[V3Write_9c_2L_2V], (instregex "^LDNT1S?[BHW]_ZZR_D$")>;
+def : InstRW<[V3Write_9c_2L_2V], (instrs LDNT1D_ZZR_D)>;
+
+// Contiguous first faulting load, scalar + scalar
+def : InstRW<[V3Write_6c_1L_1I], (instregex "^LDFF1[BHWD]$",
+                                            "^LDFF1S?B_[HSD]$",
+                                            "^LDFF1S?H_[SD]$",
+                                            "^LDFF1S?W_D$")>;
+
+// Contiguous non faulting load, scalar + imm
+def : InstRW<[V3Write_6c_1L], (instregex "^LDNF1[BHWD]_IMM$",
+                                         "^LDNF1S?B_[HSD]_IMM$",
+                                         "^LDNF1S?H_[SD]_IMM$",
+                                         "^LDNF1S?W_D_IMM$")>;
+
+// Contiguous Load two structures to two vectors, scalar + imm
+def : InstRW<[V3Write_8c_2L_2V], (instregex "^LD2[BHWD]_IMM$")>;
+
+// Contiguous Load two structures to two vectors, scalar + scalar
+def : InstRW<[V3Write_9c_2L_2V_2I], (instregex "^LD2[BHWD]$")>;
+
+// Contiguous Load three structures to three vectors, scalar + imm
+def : InstRW<[V3Write_9c_3L_3V], (instregex "^LD3[BHWD]_IMM$")>;
+
+// Contiguous Load three structures to three vectors, scalar + scalar
+def : InstRW<[V3Write_10c_3V_3L_3I], (instregex "^LD3[BHWD]$")>;
+
+// Contiguous Load four structures to four vectors, scalar + imm
+def : InstRW<[V3Write_9c_4L_8V], (instregex "^LD4[BHWD]_IMM$")>;
+
+// Contiguous Load four structures to four vectors, scalar + scalar
+def : InstRW<[V3Write_10c_4L_8V_4I], (instregex "^LD4[BHWD]$")>;
+
+// Gather load, vector + imm, 32-bit element size
+def : InstRW<[V3Write_9c_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_IMM$",
+                                            "^GLD(FF)?1W_IMM$")>;
+
+// Gather load, vector + imm, 64-bit element size
+def : InstRW<[V3Write_9c_1L_4V], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$",
+                                            "^GLD(FF)?1D_IMM$")>;
+
+// Gather load, 32-bit scaled offset
+def : InstRW<[V3Write_10c_1L_8V],
+             (instregex "^GLD(FF)?1S?H_S_[SU]XTW_SCALED$",
+                        "^GLD(FF)?1W_[SU]XTW_SCALED")>;
+
+// Gather load, 64-bit scaled offset
+// NOTE: These instructions are not specified in the SOG.
+def : InstRW<[V3Write_10c_1L_4V],
+             (instregex "^GLD(FF)?1S?[HW]_D_([SU]XTW_)?SCALED$",
+                        "^GLD(FF)?1D_([SU]XTW_)?SCALED$")>;
+
+// Gather load, 32-bit unpacked unscaled offset
+def : InstRW<[V3Write_9c_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$",
+                                            "^GLD(FF)?1W_[SU]XTW$")>;
+
+// Gather load, 64-bit unpacked unscaled offset
+// NOTE: These instructions are not specified in the SOG.
+def : InstRW<[V3Write_9c_1L_2V],
+             (instregex "^GLD(FF)?1S?[BHW]_D(_[SU]XTW)?$",
+                        "^GLD(FF)?1D(_[SU]XTW)?$")>;
+
+// §3.29 SVE Store instructions
+// -----------------------------------------------------------------------------
+
+// Store from predicate reg
+def : InstRW<[V3Write_1c_1SA], (instrs STR_PXI)>;
+
+// Store from vector reg
+def : InstRW<[V3Write_2c_1SA_1V01], (instrs STR_ZXI)>;
+
+// Contiguous store, scalar + imm
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^ST1[BHWD]_IMM$",
+                                                "^ST1B_[HSD]_IMM$",
+                                                "^ST1H_[SD]_IMM$",
+                                                "^ST1W_D_IMM$")>;
+
+// Contiguous store, scalar + scalar
+def : InstRW<[V3Write_2c_1SA_1I_1V01], (instregex "^ST1H(_[SD])?$")>;
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^ST1[BWD]$",
+                                                "^ST1B_[HSD]$",
+                                                "^ST1W_D$")>;
+
+// Contiguous store two structures from two vectors, scalar + imm
+def : InstRW<[V3Write_4c_1SA_1V01], (instregex "^ST2[BHWD]_IMM$")>;
+
+// Contiguous store two structures from two vectors, scalar + scalar
+def : InstRW<[V3Write_4c_2SA_2I_2V01], (instrs ST2H)>;
+def : InstRW<[V3Write_4c_2SA_2V01], (instregex "^ST2[BWD]$")>;
+
+// Contiguous store three structures from three vectors, scalar + imm
+def : InstRW<[V3Write_7c_9SA_9V01], (instregex "^ST3[BHWD]_IMM$")>;
+
+// Contiguous store three structures from three vectors, scalar + scalar
+def : InstRW<[V3Write_7c_9SA_9I_9V01], (instregex "^ST3[BHWD]$")>;
+
+// Contiguous store four structures from four vectors, scalar + imm
+def : InstRW<[V3Write_11c_18SA_18V01], (instregex "^ST4[BHWD]_IMM$")>;
+
+// Contiguous store four structures from four vectors, scalar + scalar
+def : InstRW<[V3Write_11c_18SA_18I_18V01], (instregex "^ST4[BHWD]$")>;
+
+// Non temporal store, scalar + imm
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STNT1[BHWD]_ZRI$")>;
+
+// Non temporal store, scalar + scalar
+def : InstRW<[V3Write_2c_1SA_1I_1V01], (instrs STNT1H_ZRR)>;
+def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STNT1[BWD]_ZRR$")>;
+
+// Scatter non temporal store, vector + scalar 32-bit element size
+def : InstRW<[V3Write_4c_6SA_6V01], (instregex "^STNT1[BHW]_ZZR_S")>;
+
+// Scatter non temporal store, vector + scalar 64-bit element size
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^STNT1[BHWD]_ZZR_D")>;
+
+// Scatter store vector + imm 32-bit element size
+def : InstRW<[V3Write_4c_6SA_6V01], (instregex "^SST1[BH]_S_IMM$",
+                                                "^SST1W_IMM$")>;
+
+// Scatter store vector + imm 64-bit element size
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[BHW]_D_IMM$",
+                                                "^SST1D_IMM$")>;
+
+// Scatter store, 32-bit scaled offset
+def : InstRW<[V3Write_4c_6SA_6V01],
+             (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>;
+
+// Scatter store, 32-bit unpacked unscaled offset
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[BHW]_D_[SU]XTW$",
+                                                "^SST1D_[SU]XTW$")>;
+
+// Scatter store, 32-bit unpacked scaled offset
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
+                                                "^SST1D_[SU]XTW_SCALED$")>;
+
+// Scatter store, 32-bit unscaled offset
+def : InstRW<[V3Write_4c_6SA_6V01], (instregex "^SST1[BH]_S_[SU]XTW$",
+                                                "^SST1W_[SU]XTW$")>;
+
+// Scatter store, 64-bit scaled offset
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[HW]_D_SCALED$",
+                                                "^SST1D_SCALED$")>;
+
+// Scatter store, 64-bit unscaled offset
+def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[BHW]_D$",
+                                                "^SST1D$")>;
+
+// §3.30 SVE Miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// Read first fault register, unpredicated
+def : InstRW<[V3Write_2c_1M0], (instrs RDFFR_P)>;
+
+// Read first fault register, predicated
+def : InstRW<[V3Write_3or4c_1M0_1M], (instrs RDFFR_PPz)>;
+
+// Read first fault register and set flags
+def : InstRW<[V3Write_3or4c_1M0_1M], (instrs RDFFRS_PPz)>;
+
+// Set first fault register
+// Write to first fault register
+def : InstRW<[V3Write_2c_1M0], (instrs SETFFR, WRFFR)>;
+
+// Prefetch
+// NOTE: This is not specified in the SOG.
+def : InstRW<[V3Write_4c_1L], (instregex "^PRF[BHWD]")>;
+
+// §3.31 SVE Cryptographic instructions
+// -----------------------------------------------------------------------------
+
+// Crypto AES ops
+def : InstRW<[V3Write_2c_1V], (instregex "^AES[DE]_ZZZ_B$",
+                                         "^AESI?MC_ZZ_B$")>;
+
+// Crypto SHA3 ops
+def : InstRW<[V3Write_2c_1V], (instregex "^(BCAX|EOR3)_ZZZZ$",
+                                         "^RAX1_ZZZ_D$",
+                                         "^XAR_ZZZI_[BHSD]$")>;
+
+// Crypto SM4 ops
+def : InstRW<[V3Write_4c_1V0], (instregex "^SM4E(KEY)?_ZZZ_S$")>;
+
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td
new file mode 100644
index 0000000..0f1ec66
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td
@@ -0,0 +1,2705 @@
+//=- AArch64SchedNeoverseV3AE.td - NeoverseV3AE Scheduling Defs --*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for the Arm Neoverse V3AE processors.
+// All information is taken from the V3AE Software Optimization Guide:
+//
+// https://developer.arm.com/documentation/109703/300/?lang=en
+//
+//===----------------------------------------------------------------------===//
+
+def NeoverseV3AEModel : SchedMachineModel {
+  let IssueWidth            =  10; // Expect best value to be slightly higher than V2
+  let MicroOpBufferSize     = 320; // Entries in micro-op re-order buffer. NOTE: Copied from Neoverse-V2
+  let LoadLatency           =   4; // Optimistic load latency.
+  let MispredictPenalty     =  10; // Extra cycles for mispredicted branch.  NOTE: Copied from N2.
+  let LoopMicroOpBufferSize =  16; // NOTE: Copied from Cortex-A57.
+  let CompleteModel         =   1;
+
+  list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F,
+                                                    [HasSVE2p1, HasSVEB16B16,
+                                                     HasCPA, HasCSSC]);
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Neoverse V3AE.
+// Instructions are first fetched and then decoded into internal macro-ops
+// (MOPs). From there, the MOPs proceed through register renaming and dispatch
+// stages. A MOP can be split into two micro-ops further down the pipeline
+// after the decode stage. Once dispatched, micro-ops wait for their operands
+// and issue out-of-order to one of nineteen issue pipelines. Each issue
+// pipeline can accept one micro-op per cycle.
+
+let SchedModel = NeoverseV3AEModel in {
+
+// Define the (19) issue ports. V3AEUnitFlg is not counted among the 19.
+def V3AEUnitB   : ProcResource<3>;  // Branch 0/1/2
+def V3AEUnitS0  : ProcResource<1>;  // Integer single-cycle 0
+def V3AEUnitS1  : ProcResource<1>;  // Integer single-cycle 1
+def V3AEUnitS2  : ProcResource<1>;  // Integer single-cycle 2
+def V3AEUnitS3  : ProcResource<1>;  // Integer single-cycle 3
+def V3AEUnitS4  : ProcResource<1>;  // Integer single-cycle 4
+def V3AEUnitS5  : ProcResource<1>;  // Integer single-cycle 5
+def V3AEUnitM0  : ProcResource<1>;  // Integer single/multicycle 0
+def V3AEUnitM1  : ProcResource<1>;  // Integer single/multicycle 1
+def V3AEUnitV0  : ProcResource<1>;  // FP/ASIMD 0
+def V3AEUnitV1  : ProcResource<1>;  // FP/ASIMD 1
+def V3AEUnitLS0 : ProcResource<1>;  // Load/Store 0
+def V3AEUnitL12 : ProcResource<2>;  // Load 1/2
+def V3AEUnitST1 : ProcResource<1>;  // Store 1
+def V3AEUnitD   : ProcResource<2>;  // Store data 0/1
+def V3AEUnitFlg : ProcResource<4>;  // Flags
+
+def V3AEUnitS   : ProcResGroup<[V3AEUnitS0, V3AEUnitS1, V3AEUnitS2, V3AEUnitS3, V3AEUnitS4, V3AEUnitS5]>;  // Integer single-cycle 0/1/2/3/4/5
+def V3AEUnitI   : ProcResGroup<[V3AEUnitS0, V3AEUnitS1, V3AEUnitS2, V3AEUnitS3, V3AEUnitS4, V3AEUnitS5, V3AEUnitM0, V3AEUnitM1]>;  // Integer single-cycle 0/1/2/3/4/5 and single/multicycle 0/1
+def V3AEUnitM   : ProcResGroup<[V3AEUnitM0, V3AEUnitM1]>;  // Integer single/multicycle 0/1
+def V3AEUnitLSA : ProcResGroup<[V3AEUnitLS0, V3AEUnitL12, V3AEUnitST1]>; // Supergroup of L+SA
+def V3AEUnitL   : ProcResGroup<[V3AEUnitLS0, V3AEUnitL12]>; // Load/Store 0 and Load 1/2
+def V3AEUnitSA  : ProcResGroup<[V3AEUnitLS0, V3AEUnitST1]>; // Load/Store 0 and Store 1
+def V3AEUnitV   : ProcResGroup<[V3AEUnitV0, V3AEUnitV1]>;  // FP/ASIMD 0/1
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+def : ReadAdvance<ReadIMA,     0>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST,      0>;
+def : ReadAdvance<ReadVLD,     0>;
+
+// NOTE: Copied from N2.
+def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+def : WriteRes<WriteLDHi,    []> { let Latency = 4; }
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Neoverse V3AE.
+// Names follow V3AEWrite_<latency>c_<count><unit>[_<count><unit>...].
+//===----------------------------------------------------------------------===//
+
+// Define generic 0 micro-op types
+def V3AEWrite_0c : SchedWriteRes<[]> { let Latency = 0; }
+
+// Define generic 1 micro-op types
+
+def V3AEWrite_1c_1B    : SchedWriteRes<[V3AEUnitB]>   { let Latency = 1; }
+def V3AEWrite_1c_1F_1Flg : SchedWriteRes<[V3AEUnitI, V3AEUnitFlg]>   { let Latency = 1; } // NOTE(review): named "1F" but consumes V3AEUnitI (no V3AEUnitF among the resources defined above) - confirm.
+def V3AEWrite_1c_1I    : SchedWriteRes<[V3AEUnitI]>   { let Latency = 1; }
+def V3AEWrite_1c_1M    : SchedWriteRes<[V3AEUnitM]>   { let Latency = 1; }
+def V3AEWrite_1c_1SA   : SchedWriteRes<[V3AEUnitSA]>  { let Latency = 1; }
+def V3AEWrite_2c_1M    : SchedWriteRes<[V3AEUnitM]>   { let Latency = 2; }
+def V3AEWrite_2c_1M_1Flg : SchedWriteRes<[V3AEUnitM, V3AEUnitFlg]>   { let Latency = 2; }
+def V3AEWrite_3c_1M    : SchedWriteRes<[V3AEUnitM]>   { let Latency = 3; }
+def V3AEWrite_2c_1M0   : SchedWriteRes<[V3AEUnitM0]>  { let Latency = 2; }
+def V3AEWrite_3c_1M0   : SchedWriteRes<[V3AEUnitM0]>  { let Latency = 3; }
+def V3AEWrite_4c_1M0   : SchedWriteRes<[V3AEUnitM0]>  { let Latency = 4; }
+def V3AEWrite_12c_1M0  : SchedWriteRes<[V3AEUnitM0]>  { let Latency = 12;
+                                                    let ReleaseAtCycles = [12]; }
+def V3AEWrite_20c_1M0  : SchedWriteRes<[V3AEUnitM0]>  { let Latency = 20;
+                                                    let ReleaseAtCycles = [20]; }
+def V3AEWrite_4c_1L    : SchedWriteRes<[V3AEUnitL]>   { let Latency = 4; }
+def V3AEWrite_6c_1L    : SchedWriteRes<[V3AEUnitL]>   { let Latency = 6; }
+def V3AEWrite_2c_1V    : SchedWriteRes<[V3AEUnitV]>   { let Latency = 2; }
+def V3AEWrite_2c_1V0   : SchedWriteRes<[V3AEUnitV0]>  { let Latency = 2; }
+def V3AEWrite_3c_1V    : SchedWriteRes<[V3AEUnitV]>   { let Latency = 3; }
+def V3AEWrite_4c_1V    : SchedWriteRes<[V3AEUnitV]>   { let Latency = 4; }
+def V3AEWrite_5c_1V    : SchedWriteRes<[V3AEUnitV]>   { let Latency = 5; }
+def V3AEWrite_6c_1V    : SchedWriteRes<[V3AEUnitV]>   { let Latency = 6; }
+def V3AEWrite_12c_1V   : SchedWriteRes<[V3AEUnitV]>   { let Latency = 12; }
+def V3AEWrite_3c_1V0   : SchedWriteRes<[V3AEUnitV0]>  { let Latency = 3; }
+def V3AEWrite_4c_1V0   : SchedWriteRes<[V3AEUnitV0]>  { let Latency = 4; }
+def V3AEWrite_9c_1V0   : SchedWriteRes<[V3AEUnitV0]>  { let Latency = 9; }
+def V3AEWrite_10c_1V0  : SchedWriteRes<[V3AEUnitV0]>  { let Latency = 10; }
+def V3AEWrite_8c_1V1   : SchedWriteRes<[V3AEUnitV1]> { let Latency = 8; }
+def V3AEWrite_12c_1V0  : SchedWriteRes<[V3AEUnitV0]>  { let Latency = 12;
+                                                    let ReleaseAtCycles = [11]; }
+def V3AEWrite_13c_1V0  : SchedWriteRes<[V3AEUnitV0]>  { let Latency = 13; }
+def V3AEWrite_15c_1V0  : SchedWriteRes<[V3AEUnitV0]>  { let Latency = 15; }
+def V3AEWrite_13c_1V1  : SchedWriteRes<[V3AEUnitV1]> { let Latency = 13;
+                                                   let ReleaseAtCycles = [8]; }
+def V3AEWrite_16c_1V0  : SchedWriteRes<[V3AEUnitV0]>  { let Latency = 16; }
+def V3AEWrite_20c_1V0  : SchedWriteRes<[V3AEUnitV0]>  { let Latency = 20;
+                                                    let ReleaseAtCycles = [20]; }
+def V3AEWrite_2c_1V1   : SchedWriteRes<[V3AEUnitV1]>  { let Latency = 2; }
+def V3AEWrite_3c_1V1   : SchedWriteRes<[V3AEUnitV1]>  { let Latency = 3; }
+def V3AEWrite_4c_1V1   : SchedWriteRes<[V3AEUnitV1]>  { let Latency = 4; }
+def V3AEWrite_6c_1V1   : SchedWriteRes<[V3AEUnitV1]>  { let Latency = 6; }
+def V3AEWrite_10c_1V1  : SchedWriteRes<[V3AEUnitV1]>  { let Latency = 10; }
+def V3AEWrite_6c_1SA   : SchedWriteRes<[V3AEUnitSA]>  { let Latency = 6; }
+
+//===----------------------------------------------------------------------===//
+// Define generic 2 micro-op types
+
+def V3AEWrite_1c_1B_1S : SchedWriteRes<[V3AEUnitB, V3AEUnitS]> {
+  let Latency     = 1;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_1M0_1B : SchedWriteRes<[V3AEUnitM0, V3AEUnitB]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_9c_1M0_1L : SchedWriteRes<[V3AEUnitM0, V3AEUnitL]> {
+  let Latency     = 9;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_3c_1I_1M : SchedWriteRes<[V3AEUnitI, V3AEUnitM]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_1c_2M : SchedWriteRes<[V3AEUnitM, V3AEUnitM]> {
+  let Latency     = 1;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_3c_2M : SchedWriteRes<[V3AEUnitM, V3AEUnitM]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_4c_2M : SchedWriteRes<[V3AEUnitM, V3AEUnitM]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_5c_1L_1I : SchedWriteRes<[V3AEUnitL, V3AEUnitI]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_1I_1L : SchedWriteRes<[V3AEUnitI, V3AEUnitL]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_7c_1I_1L : SchedWriteRes<[V3AEUnitI, V3AEUnitL]> {
+  let Latency     = 7;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_1c_1SA_1D : SchedWriteRes<[V3AEUnitSA, V3AEUnitD]> {
+  let Latency     = 1;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_5c_1M0_1V : SchedWriteRes<[V3AEUnitM0, V3AEUnitV]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_2c_1SA_1V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV]> {
+  let Latency     = 2;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_2c_2V  : SchedWriteRes<[V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 2;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_5c_1V1_1V : SchedWriteRes<[V3AEUnitV1, V3AEUnitV]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_4c_2V0 : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_4c_2V : SchedWriteRes<[V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_2V : SchedWriteRes<[V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_2L : SchedWriteRes<[V3AEUnitL, V3AEUnitL]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_8c_1L_1V : SchedWriteRes<[V3AEUnitL, V3AEUnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_4c_1SA_1V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_3c_1M0_1M  : SchedWriteRes<[V3AEUnitM0, V3AEUnitM]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_4c_1M0_1M  : SchedWriteRes<[V3AEUnitM0, V3AEUnitM]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_1c_1M0_1M  : SchedWriteRes<[V3AEUnitM0, V3AEUnitM]> {
+  let Latency     = 1;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_2c_1M0_1M  : SchedWriteRes<[V3AEUnitM0, V3AEUnitM]> {
+  let Latency     = 2;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_2V1 : SchedWriteRes<[V3AEUnitV1, V3AEUnitV1]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_5c_2V0 : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_5c_1V1_1M0 : SchedWriteRes<[V3AEUnitV1, V3AEUnitM0]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_1V1_1M0 : SchedWriteRes<[V3AEUnitV1, V3AEUnitM0]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_7c_1M0_1V0 : SchedWriteRes<[V3AEUnitM0, V3AEUnitV0]> {
+  let Latency     = 7;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_2c_1V0_1M : SchedWriteRes<[V3AEUnitV0, V3AEUnitM]> {
+  let Latency     = 2;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_3c_1V0_1M : SchedWriteRes<[V3AEUnitV0, V3AEUnitM]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_1V_1V1 : SchedWriteRes<[V3AEUnitV, V3AEUnitV1]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_1L_1M : SchedWriteRes<[V3AEUnitL, V3AEUnitM]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_6c_1L_1I : SchedWriteRes<[V3AEUnitL, V3AEUnitI]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
+def V3AEWrite_8c_1M0_1V : SchedWriteRes<[V3AEUnitM0, V3AEUnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 3 micro-op types
+
+def V3AEWrite_1c_1SA_1D_1I : SchedWriteRes<[V3AEUnitSA, V3AEUnitD, V3AEUnitI]> {
+  let Latency     = 1;
+  let NumMicroOps = 3;
+}
+
+def V3AEWrite_2c_1SA_1V_1I : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitI]> {
+  let Latency     = 2;
+  let NumMicroOps = 3;
+}
+
+def V3AEWrite_2c_1SA_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 2;
+  let NumMicroOps = 3;
+}
+
+def V3AEWrite_4c_1SA_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 3;
+}
+
+def V3AEWrite_9c_1L_2V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 3;
+}
+
+def V3AEWrite_4c_3V  : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+}
+
+def V3AEWrite_7c_1M_1M0_1V : SchedWriteRes<[V3AEUnitM, V3AEUnitM0, V3AEUnitV]> {
+  let Latency     = 7;
+  let NumMicroOps = 3;
+}
+
+def V3AEWrite_2c_1SA_1I_1V : SchedWriteRes<[V3AEUnitSA, V3AEUnitI, V3AEUnitV]> {
+  let Latency     = 2;
+  let NumMicroOps = 3;
+}
+
+def V3AEWrite_6c_3L : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL]> {
+  let Latency     = 6;
+  let NumMicroOps = 3;
+}
+
+def V3AEWrite_6c_3V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 3;
+}
+
+def V3AEWrite_8c_1L_2V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 3;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 4 micro-op types
+
+def V3AEWrite_2c_1SA_2V_1I : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitI]> {
+  let Latency     = 2;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_5c_1I_3L : SchedWriteRes<[V3AEUnitI, V3AEUnitL, V3AEUnitL, V3AEUnitL]> {
+  let Latency     = 5;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_6c_4V0 : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0, V3AEUnitV0, V3AEUnitV0]> {
+  let Latency     = 6;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_8c_4V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_6c_2V_2V1 : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV1,
+                                         V3AEUnitV1]> {
+  let Latency     = 6;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_6c_4V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_8c_2L_2V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_9c_2L_2V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_2c_2SA_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV,
+                                         V3AEUnitV]> {
+  let Latency     = 2;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_4c_2SA_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV,
+                                         V3AEUnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_8c_2M0_2V0 : SchedWriteRes<[V3AEUnitM0, V3AEUnitM0, V3AEUnitV0,
+                                          V3AEUnitV0]> {
+  let Latency     = 8;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_8c_2V_2V1 : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV1,
+                                         V3AEUnitV1]> {
+  let Latency     = 8;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_4c_2M0_2M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM0, V3AEUnitM,
+                                         V3AEUnitM]> {
+  let Latency     = 4;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_5c_2M0_2M : SchedWriteRes<[V3AEUnitM0, V3AEUnitM0, V3AEUnitM,
+                                         V3AEUnitM]> {
+  let Latency     = 5;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_6c_2I_2L : SchedWriteRes<[V3AEUnitI, V3AEUnitI, V3AEUnitL, V3AEUnitL]> {
+  let Latency     = 6;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_7c_4L : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, V3AEUnitL]> {
+  let Latency     = 7;
+  let NumMicroOps = 4;
+}
+
+def V3AEWrite_6c_1SA_3V : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 5 micro-op types
+
+def V3AEWrite_2c_1SA_2V_2I : SchedWriteRes<[V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitI, V3AEUnitI]> {
+  let Latency     = 2;
+  let NumMicroOps = 5;
+}
+
+def V3AEWrite_8c_2L_3V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV, V3AEUnitV,
+                                        V3AEUnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 5;
+}
+
+def V3AEWrite_9c_1L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                        V3AEUnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 5;
+}
+
+def V3AEWrite_10c_1L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 5;
+}
+
+def V3AEWrite_6c_5V : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                     V3AEUnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 5;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 6 micro-op types
+
+def V3AEWrite_8c_3L_3V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL,
+                                        V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 6;
+}
+
+def V3AEWrite_9c_3L_3V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL,
+                                        V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 6;
+}
+
+def V3AEWrite_9c_2L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV,
+                                        V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 6;
+}
+
+def V3AEWrite_9c_2L_2V_2I : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitV,
+                                           V3AEUnitV, V3AEUnitI, V3AEUnitI]> {
+  let Latency     = 9;
+  let NumMicroOps = 6;
+}
+
+def V3AEWrite_9c_2V_4V1 : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV1,
+                                         V3AEUnitV1, V3AEUnitV1, V3AEUnitV1]> {
+  let Latency     = 9;
+  let NumMicroOps = 6;
+}
+
+def V3AEWrite_2c_3SA_3V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 2;
+  let NumMicroOps = 6;
+}
+
+def V3AEWrite_4c_2SA_4V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 6;
+}
+
+def V3AEWrite_5c_2SA_4V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 5;
+  let NumMicroOps = 6;
+}
+
+def V3AEWrite_4c_2SA_2I_2V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitI,
+                                            V3AEUnitI, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 6;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 7 micro-op types
+
+def V3AEWrite_8c_3L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL,
+                                        V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                        V3AEUnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 7;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 8 micro-op types
+
+def V3AEWrite_2c_4SA_4V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                         V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 2;
+  let NumMicroOps = 8;
+}
+
+def V3AEWrite_4c_4SA_4V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                         V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 8;
+}
+
+def V3AEWrite_6c_2SA_6V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 8;
+}
+
+def V3AEWrite_8c_4L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, V3AEUnitL,
+                                        V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                        V3AEUnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 9 micro-op types
+
+def V3AEWrite_6c_3SA_6V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 9;
+}
+
+def V3AEWrite_10c_1L_8V : SchedWriteRes<[V3AEUnitL, V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 9;
+}
+
+def V3AEWrite_10c_3V_3L_3I : SchedWriteRes<[V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitL, V3AEUnitL, V3AEUnitL,
+                                            V3AEUnitI, V3AEUnitI, V3AEUnitI]> {
+  let Latency     = 10;
+  let NumMicroOps = 9;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 10 micro-op types
+
+def V3AEWrite_9c_6L_4V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL, V3AEUnitL,
+                                        V3AEUnitL, V3AEUnitL, V3AEUnitV, V3AEUnitV,
+                                        V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 10;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 12 micro-op types
+
+def V3AEWrite_5c_4SA_8V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                         V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 5;
+  let NumMicroOps = 12;
+}
+
+def V3AEWrite_9c_4L_8V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL,
+                                        V3AEUnitL, V3AEUnitV, V3AEUnitV,
+                                        V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                        V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 12;
+}
+
+def V3AEWrite_10c_4L_8V : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL,
+                                         V3AEUnitL, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 12;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 16 micro-op types
+
+def V3AEWrite_7c_4SA_12V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                          V3AEUnitSA, V3AEUnitV, V3AEUnitV,
+                                          V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                          V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                          V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                          V3AEUnitV]> {
+  let Latency     = 7;
+  let NumMicroOps = 16;
+}
+
+def V3AEWrite_10c_4L_8V_4I : SchedWriteRes<[V3AEUnitL, V3AEUnitL, V3AEUnitL,
+                                            V3AEUnitL, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitI, V3AEUnitI, V3AEUnitI,
+                                            V3AEUnitI]> {
+  let Latency     = 10;
+  let NumMicroOps = 16;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 18 micro-op types
+
+def V3AEWrite_7c_9SA_9V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                         V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                         V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                         V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 7;
+  let NumMicroOps = 18;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 27 micro-op types (one micro-op per listed unit)
+
+def V3AEWrite_7c_9SA_9I_9V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                            V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                            V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                            V3AEUnitI, V3AEUnitI, V3AEUnitI,
+                                            V3AEUnitI, V3AEUnitI, V3AEUnitI,
+                                            V3AEUnitI, V3AEUnitI, V3AEUnitI,
+                                            V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 7;  // the "7c" in the name
+  let NumMicroOps = 27; // 9x SA + 9x I + 9x V
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 36 micro-op types (one micro-op per listed unit)
+
+def V3AEWrite_11c_18SA_18V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                            V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                            V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                            V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                            V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                            V3AEUnitSA, V3AEUnitSA, V3AEUnitSA,
+                                            V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                            V3AEUnitV, V3AEUnitV, V3AEUnitV]> {
+  let Latency     = 11; // the "11c" in the name
+  let NumMicroOps = 36; // 18x SA + 18x V
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 54 micro-op types (one micro-op per listed unit)
+
+def V3AEWrite_11c_18SA_18I_18V : SchedWriteRes<[V3AEUnitSA, V3AEUnitSA,
+                                                V3AEUnitSA, V3AEUnitSA,
+                                                V3AEUnitSA, V3AEUnitSA,
+                                                V3AEUnitSA, V3AEUnitSA,
+                                                V3AEUnitSA, V3AEUnitSA,
+                                                V3AEUnitSA, V3AEUnitSA,
+                                                V3AEUnitSA, V3AEUnitSA,
+                                                V3AEUnitSA, V3AEUnitSA,
+                                                V3AEUnitSA, V3AEUnitSA,
+                                                V3AEUnitI, V3AEUnitI, V3AEUnitI,
+                                                V3AEUnitI, V3AEUnitI, V3AEUnitI,
+                                                V3AEUnitI, V3AEUnitI, V3AEUnitI,
+                                                V3AEUnitI, V3AEUnitI, V3AEUnitI,
+                                                V3AEUnitI, V3AEUnitI, V3AEUnitI,
+                                                V3AEUnitI, V3AEUnitI, V3AEUnitI,
+                                                V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                                V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                                V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                                V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                                V3AEUnitV, V3AEUnitV, V3AEUnitV,
+                                                V3AEUnitV, V3AEUnitV,
+                                                V3AEUnitV]> {
+  let Latency     = 11; // the "11c" in the name
+  let NumMicroOps = 54; // 18x SA + 18x I + 18x V
+}
+
+//===----------------------------------------------------------------------===//
+// Define predicate-controlled types (the NoSchedPred entry is the fallback)
+
+def V3AEWrite_ArithI : SchedWriteVariant<[ // cheap LSL: 1c on I, else 2c on M
+                       SchedVar<IsCheapLSL,  [V3AEWrite_1c_1I]>,
+                       SchedVar<NoSchedPred, [V3AEWrite_2c_1M]>]>;
+
+def V3AEWrite_ArithF : SchedWriteVariant<[ // flag-setting twin of ArithI
+                       SchedVar<IsCheapLSL,  [V3AEWrite_1c_1F_1Flg]>,
+                       SchedVar<NoSchedPred, [V3AEWrite_2c_1M_1Flg]>]>;
+
+def V3AEWrite_Logical : SchedWriteVariant<[ // 1c when NeoverseNoLSL, else 2c
+                        SchedVar<NeoverseNoLSL, [V3AEWrite_1c_1F_1Flg]>,
+                        SchedVar<NoSchedPred,   [V3AEWrite_2c_1M_1Flg]>]>;
+
+def V3AEWrite_Extr : SchedWriteVariant<[ // ROR-immediate idiom: 1c, else 3c
+                     SchedVar<IsRORImmIdiomPred, [V3AEWrite_1c_1I]>,
+                     SchedVar<NoSchedPred,       [V3AEWrite_3c_1I_1M]>]>;
+
+def V3AEWrite_LdrHQ : SchedWriteVariant<[ // HQ form: extra I unit and +1c
+                      SchedVar<NeoverseHQForm,  [V3AEWrite_7c_1I_1L]>,
+                      SchedVar<NoSchedPred,     [V3AEWrite_6c_1L]>]>;
+
+def V3AEWrite_StrHQ : SchedWriteVariant<[ // HQ form: extra I unit
+                      SchedVar<NeoverseHQForm,  [V3AEWrite_2c_1SA_1V_1I]>,
+                      SchedVar<NoSchedPred,     [V3AEWrite_2c_1SA_1V]>]>;
+
+def V3AEWrite_0or1c_1I : SchedWriteVariant<[ // 0c if zero-move, else 1c on I
+                      SchedVar<NeoverseZeroMove, [V3AEWrite_0c]>,
+                      SchedVar<NoSchedPred,      [V3AEWrite_1c_1I]>]>;
+
+def V3AEWrite_0or2c_1V : SchedWriteVariant<[ // 0c if zero-move, else 2c on V
+                      SchedVar<NeoverseZeroMove, [V3AEWrite_0c]>,
+                      SchedVar<NoSchedPred,      [V3AEWrite_2c_1V]>]>;
+
+def V3AEWrite_0or3c_1M0 : SchedWriteVariant<[ // 0c if zero-move, else 3c on M0
+                      SchedVar<NeoverseZeroMove, [V3AEWrite_0c]>,
+                      SchedVar<NoSchedPred,      [V3AEWrite_3c_1M0]>]>;
+
+def V3AEWrite_2or3c_1M : SchedWriteVariant<[ // +1c when NeoversePdIsPg
+                      SchedVar<NeoversePdIsPg,  [V3AEWrite_3c_1M]>,
+                      SchedVar<NoSchedPred,     [V3AEWrite_2c_1M]>]>;
+
+def V3AEWrite_1or2c_1M : SchedWriteVariant<[ // +1c when NeoversePdIsPg
+                      SchedVar<NeoversePdIsPg,  [V3AEWrite_2c_1M]>,
+                      SchedVar<NoSchedPred,     [V3AEWrite_1c_1M]>]>;
+
+def V3AEWrite_3or4c_1M0_1M : SchedWriteVariant<[ // +1c when NeoversePdIsPg
+                      SchedVar<NeoversePdIsPg,  [V3AEWrite_4c_1M0_1M]>,
+                      SchedVar<NoSchedPred,     [V3AEWrite_3c_1M0_1M]>]>;
+
+def V3AEWrite_2or3c_1V0 : SchedWriteVariant<[ // +1c when NeoversePdIsPg
+                      SchedVar<NeoversePdIsPg,  [V3AEWrite_3c_1V0]>,
+                      SchedVar<NoSchedPred,     [V3AEWrite_2c_1V0]>]>;
+
+def V3AEWrite_2or3c_1V0_1M : SchedWriteVariant<[ // +1c when NeoversePdIsPg
+                      SchedVar<NeoversePdIsPg,  [V3AEWrite_3c_1V0_1M]>,
+                      SchedVar<NoSchedPred,     [V3AEWrite_2c_1V0_1M]>]>;
+
+def V3AEWrite_IncDec : SchedWriteVariant<[ // cheap form: 1c on I, else 2c on M
+                      SchedVar<NeoverseCheapIncDec, [V3AEWrite_1c_1I]>,
+                      SchedVar<NoSchedPred,         [V3AEWrite_2c_1M]>]>;
+
+//===----------------------------------------------------------------------===//
+// Define forwarded types (Wr_* producers and their Rd_* read advances)
+
+// NOTE: SOG, p. 16, n. 2: Accumulator forwarding is not supported for
+// consumers of 64-bit multiply high operations?
+def V3AEWr_IM   : SchedWriteRes<[V3AEUnitM]>  { let Latency = 2; } // no Rd_IM in this section
+
+def V3AEWr_FMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_FMA : SchedReadAdvance<2, [WriteFMul, V3AEWr_FMA]>; // after FP mul or FMA
+
+def V3AEWr_VA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_VA : SchedReadAdvance<3, [V3AEWr_VA]>;
+
+def V3AEWr_VDOT : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; }
+def V3AERd_VDOT : SchedReadAdvance<2, [V3AEWr_VDOT]>;
+
+def V3AEWr_VMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; }
+def V3AERd_VMMA : SchedReadAdvance<2, [V3AEWr_VMMA]>;
+
+def V3AEWr_VMA : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AERd_VMA : SchedReadAdvance<3, [V3AEWr_VMA]>;
+
+def V3AEWr_VMAH : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 4; }
+def V3AERd_VMAH : SchedReadAdvance<2, [V3AEWr_VMAH]>;
+
+def V3AEWr_VMAL : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AERd_VMAL : SchedReadAdvance<3, [V3AEWr_VMAL]>;
+
+def V3AEWr_VPA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_VPA : SchedReadAdvance<3, [V3AEWr_VPA]>;
+
+def V3AEWr_VSA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_VSA : SchedReadAdvance<3, [V3AEWr_VSA]>;
+
+def V3AEWr_VFCMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_VFCMA : SchedReadAdvance<2, [V3AEWr_VFCMA]>;
+
+def V3AEWr_VFM  : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; }
+def V3AEWr_VFMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_VFMA : SchedReadAdvance<2, [V3AEWr_VFM, V3AEWr_VFMA]>; // either producer
+
+def V3AEWr_VFMAL : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_VFMAL : SchedReadAdvance<2, [V3AEWr_VFMAL]>;
+
+def V3AEWr_VBFDOT : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; }
+def V3AERd_VBFDOT : SchedReadAdvance<2, [V3AEWr_VBFDOT]>;
+def V3AEWr_VBFMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 6; }
+def V3AERd_VBFMMA : SchedReadAdvance<2, [V3AEWr_VBFMMA]>;
+def V3AEWr_VBFMAL : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; }
+def V3AERd_VBFMAL : SchedReadAdvance<3, [V3AEWr_VBFMAL]>;
+
+def V3AEWr_CRC : SchedWriteRes<[V3AEUnitM0]> { let Latency = 2; } // M0 only
+def V3AERd_CRC : SchedReadAdvance<1, [V3AEWr_CRC]>;
+
+def V3AEWr_ZA  : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_ZA  : SchedReadAdvance<3, [V3AEWr_ZA]>;
+def V3AEWr_ZPA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_ZPA : SchedReadAdvance<3, [V3AEWr_ZPA]>;
+def V3AEWr_ZSA : SchedWriteRes<[V3AEUnitV1]> { let Latency = 4; } // V1, not V
+def V3AERd_ZSA : SchedReadAdvance<3, [V3AEWr_ZSA]>;
+
+def V3AEWr_ZDOTB : SchedWriteRes<[V3AEUnitV]>   { let Latency = 3; }
+def V3AERd_ZDOTB : SchedReadAdvance<2, [V3AEWr_ZDOTB]>;
+def V3AEWr_ZDOTH : SchedWriteRes<[V3AEUnitV0]> { let Latency = 3; }
+def V3AERd_ZDOTH : SchedReadAdvance<2, [V3AEWr_ZDOTH]>;
+
+// NOTE: SOG p. 43: Complex multiply-add B, H, S element size: How to reduce
+// throughput to 1 in case of forwarding?
+def V3AEWr_ZCMABHS : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AERd_ZCMABHS : SchedReadAdvance<3, [V3AEWr_ZCMABHS]>;
+def V3AEWr_ZCMAD   : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 5; }
+def V3AERd_ZCMAD   : SchedReadAdvance<2, [V3AEWr_ZCMAD]>;
+
+def V3AEWr_ZMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; }
+def V3AERd_ZMMA : SchedReadAdvance<2, [V3AEWr_ZMMA]>;
+
+def V3AEWr_ZMABHS : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AERd_ZMABHS : SchedReadAdvance<3, [V3AEWr_ZMABHS]>;
+def V3AEWr_ZMAD  : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 5; }
+def V3AERd_ZMAD  : SchedReadAdvance<2, [V3AEWr_ZMAD]>;
+
+def V3AEWr_ZMAL : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
+def V3AERd_ZMAL : SchedReadAdvance<3, [V3AEWr_ZMAL]>;
+
+def V3AEWr_ZMASQL   : SchedWriteRes<[V3AEUnitV0]>            { let Latency = 4; }
+def V3AEWr_ZMASQBHS : SchedWriteRes<[V3AEUnitV0]>            { let Latency = 4; }
+def V3AEWr_ZMASQD   : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 5; }
+def V3AERd_ZMASQ    : SchedReadAdvance<2, [V3AEWr_ZMASQL, V3AEWr_ZMASQBHS,
+                                         V3AEWr_ZMASQD]>; // shared by all three
+
+def V3AEWr_ZFCMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; }
+def V3AERd_ZFCMA : SchedReadAdvance<3, [V3AEWr_ZFCMA]>;
+
+def V3AEWr_ZFMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_ZFMA : SchedReadAdvance<2, [V3AEWr_ZFMA]>;
+
+def V3AEWr_ZFMAL : SchedWriteRes<[V3AEUnitV]> { let Latency = 4; }
+def V3AERd_ZFMAL : SchedReadAdvance<2, [V3AEWr_ZFMAL]>;
+
+def V3AEWr_ZBFDOT : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; }
+def V3AERd_ZBFDOT : SchedReadAdvance<2, [V3AEWr_ZBFDOT]>;
+def V3AEWr_ZBFMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 6; }
+def V3AERd_ZBFMMA : SchedReadAdvance<2, [V3AEWr_ZBFMMA]>;
+def V3AEWr_ZBFMAL : SchedWriteRes<[V3AEUnitV]> { let Latency = 5; }
+def V3AERd_ZBFMAL : SchedReadAdvance<3, [V3AEWr_ZBFMAL]>;
+
+//===----------------------------------------------------------------------===//
+// Define types with long resource cycles (rc = ReleaseAtCycles, all on V1)
+
+def V3AEWrite_6c_1V1_5rc    : SchedWriteRes<[V3AEUnitV1]>  { let Latency =  6; let ReleaseAtCycles = [ 5]; }
+def V3AEWrite_9c_1V1_2rc    : SchedWriteRes<[V3AEUnitV1]>  { let Latency =  9; let ReleaseAtCycles = [ 2]; }
+def V3AEWrite_9c_1V1_4rc    : SchedWriteRes<[V3AEUnitV1]>  { let Latency =  9; let ReleaseAtCycles = [ 4]; }
+def V3AEWrite_10c_1V1_9rc   : SchedWriteRes<[V3AEUnitV1]>  { let Latency = 10; let ReleaseAtCycles = [ 9]; }
+def V3AEWrite_11c_1V1_4rc  : SchedWriteRes<[V3AEUnitV1]> { let Latency = 11; let ReleaseAtCycles = [ 4]; }
+def V3AEWrite_13c_1V1_8rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 13; let ReleaseAtCycles = [8]; }
+def V3AEWrite_14c_1V1_2rc : SchedWriteRes<[V3AEUnitV1]> { let Latency = 14; let ReleaseAtCycles = [2]; }
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>; // COPY uses the generic WriteI class
+
+// §3.3 Branch instructions
+// -----------------------------------------------------------------------------
+
+// Branch, immed
+// Compare and branch
+def : SchedAlias<WriteBr,    V3AEWrite_1c_1B>;
+
+// Branch, register
+def : SchedAlias<WriteBrReg, V3AEWrite_1c_1B>;
+
+// Branch and link, immed
+// Branch and link, register
+def : InstRW<[V3AEWrite_1c_1B_1S], (instrs BL, BLR)>; // adds an S unit per the name
+
+// §3.4 Arithmetic and Logical Instructions
+// -----------------------------------------------------------------------------
+
+// ALU, basic
+def : SchedAlias<WriteI, V3AEWrite_1c_1I>;
+
+// ALU, basic, flagset
+def : InstRW<[V3AEWrite_1c_1F_1Flg],
+             (instregex "^(ADD|SUB)S[WX]r[ir]$",
+                        "^(ADC|SBC)S[WX]r$",
+                        "^ANDS[WX]ri$",
+                        "^(AND|BIC)S[WX]rr$")>;
+def : InstRW<[V3AEWrite_0or1c_1I], (instregex "^MOVZ[WX]i$")>; // MOVZ: 0c if NeoverseZeroMove
+
+// ALU, extend and shift
+def : SchedAlias<WriteIEReg, V3AEWrite_2c_1M>;
+
+// Arithmetic, LSL shift, shift <= 4
+// Arithmetic, flagset, LSL shift, shift <= 4
+// Arithmetic, LSR/ASR/ROR shift or LSL shift > 4
+def : SchedAlias<WriteISReg, V3AEWrite_ArithI>;
+def : InstRW<[V3AEWrite_ArithF],
+             (instregex "^(ADD|SUB)S[WX]rs$")>;
+
+// Arithmetic, immediate to logical address tag
+def : InstRW<[V3AEWrite_2c_1M], (instrs ADDG, SUBG)>;
+
+// Conditional compare
+def : InstRW<[V3AEWrite_1c_1F_1Flg], (instregex "^CCM[NP][WX][ir]")>;
+
+// Convert floating-point condition flags
+// Flag manipulation instructions
+def : WriteRes<WriteSys, []> { let Latency = 1; } // no resource units listed
+
+// Insert Random Tags
+def : InstRW<[V3AEWrite_2c_1M], (instrs IRG, IRGstack)>;
+
+// Insert Tag Mask
+// Subtract Pointer
+def : InstRW<[V3AEWrite_1c_1I], (instrs GMI, SUBP)>;
+
+// Subtract Pointer, flagset
+def : InstRW<[V3AEWrite_1c_1F_1Flg], (instrs SUBPS)>;
+
+// Logical, shift, no flagset
+def : InstRW<[V3AEWrite_1c_1I],    (instregex "^(AND|BIC|EON|EOR|ORN)[WX]rs$")>;
+def : InstRW<[V3AEWrite_0or1c_1I], (instregex "^ORR[WX]rs$")>; // ORR: 0c if NeoverseZeroMove
+
+// Logical, shift, flagset
+def : InstRW<[V3AEWrite_Logical], (instregex "^(AND|BIC)S[WX]rs$")>;
+
+// Move and shift instructions
+// -----------------------------------------------------------------------------
+
+def : SchedAlias<WriteImm, V3AEWrite_1c_1I>; // same write as WriteI (ALU, basic)
+
+// §3.5 Divide and multiply instructions
+// -----------------------------------------------------------------------------
+
+// SDIV, UDIV
+def : SchedAlias<WriteID32,  V3AEWrite_12c_1M0>;
+def : SchedAlias<WriteID64,  V3AEWrite_20c_1M0>;
+
+def : SchedAlias<WriteIM32, V3AEWrite_2c_1M>; // same 2c / M unit as V3AEWr_IM
+def : SchedAlias<WriteIM64, V3AEWrite_2c_1M>; // same 2c / M unit as V3AEWr_IM
+
+// Multiply
+// Multiply accumulate, W-form
+// Multiply accumulate, X-form
+def : InstRW<[V3AEWr_IM], (instregex "^M(ADD|SUB)[WX]rrr$")>;
+
+// Multiply accumulate long
+// Multiply long
+def : InstRW<[V3AEWr_IM], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+
+// Multiply high
+def : InstRW<[V3AEWrite_3c_1M], (instrs SMULHrr, UMULHrr)>;
+
+// §3.6 Pointer Authentication Instructions (v8.3 PAC)
+// -----------------------------------------------------------------------------
+
+// Authenticate data address
+// Authenticate instruction address
+// Compute pointer authentication code for data address
+// Compute pointer authentication code, using generic key
+// Compute pointer authentication code for instruction address
+def : InstRW<[V3AEWrite_4c_1M0], (instregex "^AUT", "^PAC")>; // prefix match: all AUT*/PAC* opcodes
+
+// Branch and link, register, with pointer authentication
+// Branch, register, with pointer authentication
+// Branch, return, with pointer authentication
+def : InstRW<[V3AEWrite_6c_1M0_1B], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
+                                            BRAAZ, BRAB, BRABZ, RETAA, RETAB,
+                                            ERETAA, ERETAB)>;
+
+
+// Load register, with pointer authentication
+def : InstRW<[V3AEWrite_9c_1M0_1L], (instregex "^LDRA[AB](indexed|writeback)")>;
+
+// Strip pointer authentication code
+def : InstRW<[V3AEWrite_2c_1M0], (instrs XPACD, XPACI, XPACLRI)>;
+
+// §3.7 Miscellaneous data-processing instructions
+// -----------------------------------------------------------------------------
+
+// Address generation
+def : InstRW<[V3AEWrite_1c_1I], (instrs ADR, ADRP)>;
+
+// Bitfield extract, one reg
+// Bitfield extract, two regs
+def : SchedAlias<WriteExtr, V3AEWrite_Extr>;
+def : InstRW<[V3AEWrite_Extr], (instrs EXTRWrri, EXTRXrri)>; // same variant, set per opcode
+
+// Bitfield move, basic
+def : SchedAlias<WriteIS, V3AEWrite_1c_1I>;
+
+// Bitfield move, insert
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^BFM[WX]ri$")>;
+
+// §3.8 Load instructions
+// -----------------------------------------------------------------------------
+
+// NOTE: SOG p. 19: Throughput of LDN?P X-form should be 2, but reported as 3.
+
+def : SchedAlias<WriteLD,    V3AEWrite_4c_1L>; // default load write
+def : SchedAlias<WriteLDIdx, V3AEWrite_4c_1L>; // same write as WriteLD
+
+// Load register, literal
+def : InstRW<[V3AEWrite_5c_1L_1I], (instrs LDRWl, LDRXl, LDRSWl, PRFMl)>;
+
+// Load pair, signed immed offset, signed words
+def : InstRW<[V3AEWrite_5c_1I_3L, WriteLDHi], (instrs LDPSWi)>;
+
+// Load pair, immed post-index or immed pre-index, signed words
+def : InstRW<[WriteAdr, V3AEWrite_5c_1I_3L, WriteLDHi],
+             (instregex "^LDPSW(post|pre)$")>;
+
+// §3.9 Store instructions
+// -----------------------------------------------------------------------------
+
+// NOTE: SOG, p. 20: Unsure if STRH uses pipeline I.
+
+def : SchedAlias<WriteST,    V3AEWrite_1c_1SA_1D>;
+def : SchedAlias<WriteSTIdx, V3AEWrite_1c_1SA_1D>;
+def : SchedAlias<WriteSTP,   V3AEWrite_1c_1SA_1D>;
+def : SchedAlias<WriteAdr,   V3AEWrite_1c_1I>; // listed first on pre/post-index forms
+
+// §3.10 Tag load instructions
+// -----------------------------------------------------------------------------
+
+// Load allocation tag
+// Load multiple allocation tags
+def : InstRW<[V3AEWrite_4c_1L], (instrs LDG, LDGM)>; // same write as WriteLD
+
+// §3.11 Tag store instructions
+// -----------------------------------------------------------------------------
+
+// Store allocation tags to one or two granules, post-index
+// Store allocation tags to one or two granules, pre-index
+// Store allocation tag to one or two granules, zeroing, post-index
+// Store allocation tag to one or two granules, zeroing, pre-index
+// Store allocation tag and reg pair to memory, post-index
+// Store allocation tag and reg pair to memory, pre-index
+def : InstRW<[V3AEWrite_1c_1SA_1D_1I], (instrs STGPreIndex, STGPostIndex,
+                                                ST2GPreIndex, ST2GPostIndex,
+                                                STZGPreIndex, STZGPostIndex,
+                                                STZ2GPreIndex, STZ2GPostIndex,
+                                                STGPpre, STGPpost)>;
+
+// Store allocation tags to one or two granules, signed offset
+// Store allocation tag to two granules, zeroing, signed offset
+// Store allocation tag and reg pair to memory, signed offset
+// Store multiple allocation tags
+def : InstRW<[V3AEWrite_1c_1SA_1D], (instrs STGi, ST2Gi, STZGi,
+                                             STZ2Gi, STGPi, STGM, STZGM)>;
+
+// §3.12 FP data processing instructions
+// -----------------------------------------------------------------------------
+
+// FP absolute value
+// FP arithmetic
+// FP min/max
+// FP negate
+// FP select
+def : SchedAlias<WriteF,     V3AEWrite_2c_1V>;
+
+// FP compare
+def : SchedAlias<WriteFCmp,  V3AEWrite_2c_1V0>;
+
+// FP divide, square root (class default; per-size InstRW entries follow)
+def : SchedAlias<WriteFDiv,  V3AEWrite_6c_1V1>;
+
+// FP divide, H-form
+def : InstRW<[V3AEWrite_6c_1V1],  (instrs FDIVHrr)>;
+// FP divide, S-form
+def : InstRW<[V3AEWrite_8c_1V1], (instrs FDIVSrr)>;
+// FP divide, D-form
+def : InstRW<[V3AEWrite_13c_1V1], (instrs FDIVDrr)>;
+
+// FP square root, H-form
+def : InstRW<[V3AEWrite_6c_1V1],  (instrs FSQRTHr)>;
+// FP square root, S-form
+def : InstRW<[V3AEWrite_8c_1V1],  (instrs FSQRTSr)>;
+// FP square root, D-form
+def : InstRW<[V3AEWrite_13c_1V1], (instrs FSQRTDr)>;
+
+// FP multiply (WriteFMul is also a producer for V3AERd_FMA)
+def : WriteRes<WriteFMul, [V3AEUnitV]> { let Latency = 3; }
+
+// FP multiply accumulate (V3AERd_FMA is the fourth entry in the list)
+def : InstRW<[V3AEWr_FMA, ReadDefault, ReadDefault, V3AERd_FMA],
+             (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+
+// FP round to integral
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FRINT[AIMNPXZ][HSD]r$",
+                                            "^FRINT(32|64)[XZ][SD]r$")>;
+
+// §3.13 FP miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// FP convert, from gen to vec reg
+def : InstRW<[V3AEWrite_3c_1M0], (instregex "^[SU]CVTF[SU][WX][HSD]ri$")>;
+
+// FP convert, from vec to gen reg
+def : InstRW<[V3AEWrite_3c_1V0],
+             (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]ri?$")>;
+
+// FP convert, JavaScript from vec to gen reg
+def : SchedAlias<WriteFCvt, V3AEWrite_3c_1V0>;
+
+// FP convert, from vec to vec reg
+def : InstRW<[V3AEWrite_3c_1V], (instrs FCVTSHr, FCVTDHr, FCVTHSr, FCVTDSr,
+                                        FCVTHDr, FCVTSDr, FCVTXNv1i64)>;
+
+// FP move, immed
+// FP move, register
+def : SchedAlias<WriteFImm, V3AEWrite_2c_1V>;
+
+// FP transfer, from gen to low half of vec reg (0c if NeoverseZeroMove)
+def : InstRW<[V3AEWrite_0or3c_1M0],
+             (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr)>;
+
+// FP transfer, from gen to high half of vec reg
+def : InstRW<[V3AEWrite_5c_1M0_1V], (instrs FMOVXDHighr)>;
+
+// FP transfer, from vec to gen reg
+def : SchedAlias<WriteFCopy, V3AEWrite_2c_2V>;
+
+// §3.14 FP load instructions
+// -----------------------------------------------------------------------------
+
+// Load vector reg, literal, S/D/Q forms
+def : InstRW<[V3AEWrite_7c_1I_1L], (instregex "^LDR[SDQ]l$")>;
+
+// Load vector reg, unscaled immed
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LDUR[BHSDQ]i$")>;
+
+// Load vector reg, immed post-index
+// Load vector reg, immed pre-index
+def : InstRW<[WriteAdr, V3AEWrite_6c_1I_1L],
+             (instregex "^LDR[BHSDQ](pre|post)$")>;
+
+// Load vector reg, unsigned immed
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LDR[BHSDQ]ui$")>;
+
+// Load vector reg, register offset, basic
+// Load vector reg, register offset, scale, S/D-form
+// Load vector reg, register offset, scale, H/Q-form
+// Load vector reg, register offset, extend
+// Load vector reg, register offset, extend, scale, S/D-form
+// Load vector reg, register offset, extend, scale, H/Q-form
+// (V3AEWrite_LdrHQ: 7c with an I unit when NeoverseHQForm, else 6c)
+def : InstRW<[V3AEWrite_LdrHQ, ReadAdrBase], (instregex "^LDR[BHSDQ]ro[WX]$")>;
+
+// Load vector pair, immed offset, S/D-form
+def : InstRW<[V3AEWrite_6c_1L, WriteLDHi], (instregex "^LDN?P[SD]i$")>;
+
+// Load vector pair, immed offset, Q-form
+def : InstRW<[V3AEWrite_6c_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>;
+
+// Load vector pair, immed post-index, S/D-form
+// Load vector pair, immed pre-index, S/D-form
+def : InstRW<[WriteAdr, V3AEWrite_6c_1I_1L, WriteLDHi],
+             (instregex "^LDP[SD](pre|post)$")>;
+
+// Load vector pair, immed post-index, Q-form
+// Load vector pair, immed pre-index, Q-form
+def : InstRW<[WriteAdr, V3AEWrite_6c_2I_2L, WriteLDHi], (instrs LDPQpost,
+                                                                LDPQpre)>;
+
+// §3.15 FP store instructions
+// -----------------------------------------------------------------------------
+
+// Store vector reg, unscaled immed, B/H/S/D-form
+// Store vector reg, unscaled immed, Q-form
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STUR[BHSDQ]i$")>;
+
+// Store vector reg, immed post-index, B/H/S/D-form
+// Store vector reg, immed post-index, Q-form
+// Store vector reg, immed pre-index, B/H/S/D-form
+// Store vector reg, immed pre-index, Q-form
+def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V_1I],
+             (instregex "^STR[BHSDQ](pre|post)$")>;
+
+// Store vector reg, unsigned immed, B/H/S/D-form
+// Store vector reg, unsigned immed, Q-form
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STR[BHSDQ]ui$")>;
+
+// Store vector reg, register offset, basic, B/H/S/D-form
+// Store vector reg, register offset, basic, Q-form
+// Store vector reg, register offset, scale, H-form
+// Store vector reg, register offset, scale, S/D-form
+// Store vector reg, register offset, scale, Q-form
+// Store vector reg, register offset, extend, B/H/S/D-form
+// Store vector reg, register offset, extend, Q-form
+// Store vector reg, register offset, extend, scale, H-form
+// Store vector reg, register offset, extend, scale, S/D-form
+// Store vector reg, register offset, extend, scale, Q-form
+def : InstRW<[V3AEWrite_StrHQ, ReadAdrBase],
+             (instregex "^STR[BHSDQ]ro[WX]$")>;
+
+// Store vector pair, immed offset, S-form
+// Store vector pair, immed offset, D-form
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STN?P[SD]i$")>;
+
+// Store vector pair, immed offset, Q-form
+def : InstRW<[V3AEWrite_2c_1SA_2V], (instrs STPQi, STNPQi)>;
+
+// Store vector pair, immed post-index, S-form
+// Store vector pair, immed post-index, D-form
+// Store vector pair, immed pre-index, S-form
+// Store vector pair, immed pre-index, D-form
+def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V_1I],
+             (instregex "^STP[SD](pre|post)$")>;
+
+// Store vector pair, immed post-index, Q-form
+def : InstRW<[V3AEWrite_2c_1SA_2V_1I], (instrs STPQpost)>; // NOTE(review): no WriteAdr, unlike S/D-form - confirm
+
+// Store vector pair, immed pre-index, Q-form
+def : InstRW<[V3AEWrite_2c_1SA_2V_2I], (instrs STPQpre)>; // NOTE(review): no WriteAdr, unlike S/D-form - confirm
+
+// §3.16 ASIMD integer instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD absolute diff
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD arith, pair-wise
+// ASIMD compare
+// ASIMD logical
+// ASIMD max/min, basic and pair-wise
+def : SchedAlias<WriteVd, V3AEWrite_2c_1V>; // default for the categories above
+def : SchedAlias<WriteVq, V3AEWrite_2c_1V>; // same write as WriteVd
+
+// ASIMD absolute diff accum
+// ASIMD absolute diff accum long
+def : InstRW<[V3AEWr_VA, V3AERd_VA], (instregex "^[SU]ABAL?v")>;
+
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[V3AEWrite_3c_1V1], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
+
+// ASIMD arith, reduce, 8B/8H
+def : InstRW<[V3AEWrite_5c_1V1_1V],
+             (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>;
+
+// ASIMD arith, reduce, 16B
+def : InstRW<[V3AEWrite_6c_2V1], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>;
+
+// ASIMD dot product
+// ASIMD dot product using signed and unsigned integers
+def : InstRW<[V3AEWr_VDOT, V3AERd_VDOT],
+             (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
+
+// ASIMD matrix multiply-accumulate
+def : InstRW<[V3AEWr_VMMA, V3AERd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
+
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[V3AEWrite_3c_1V1], (instregex "^[SU](MAX|MIN)Vv4i16v$",
+                                            "^[SU](MAX|MIN)Vv4i32v$")>;
+
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[V3AEWrite_5c_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
+                                               "^[SU](MAX|MIN)Vv8i16v$")>;
+
+// ASIMD max/min, reduce, 16B ([SU]MAXV / [SU]MINV on v16i8)
+def : InstRW<[V3AEWrite_6c_2V1], (instregex "^[SU](MAX|MIN)Vv16i8v$")>;
+
+// ASIMD multiply
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>;
+
+// ASIMD multiply accumulate
+def : InstRW<[V3AEWr_VMA, V3AERd_VMA], (instregex "^MLAv", "^MLSv")>;
+
+// ASIMD multiply accumulate high
+def : InstRW<[V3AEWr_VMAH, V3AERd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
+
+// ASIMD multiply accumulate long
+def : InstRW<[V3AEWr_VMAL, V3AERd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
+
+// ASIMD multiply accumulate saturating long (no read advance listed)
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SQDML[AS]L[iv]")>;
+
+// ASIMD multiply/multiply long (8x8) polynomial, D-form
+// ASIMD multiply/multiply long (8x8) polynomial, Q-form
+def : InstRW<[V3AEWrite_3c_1V], (instregex "^PMULL?(v8i8|v16i8)$")>;
+
+// ASIMD multiply long
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^[SU]MULLv", "^SQDMULL[iv]")>;
+
+// ASIMD pairwise add and accumulate long
+def : InstRW<[V3AEWr_VPA, V3AERd_VPA], (instregex "^[SU]ADALPv")>;
+
+// ASIMD shift accumulate
+def : InstRW<[V3AEWr_VSA, V3AERd_VSA], (instregex "^[SU]SRA[dv]", "^[SU]RSRA[dv]")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^SHL[dv]", "^SHLLv", "^SHRNv",
+                                           "^SSHLLv", "^SSHR[dv]", "^USHLLv",
+                                           "^USHR[dv]")>;
+
+// ASIMD shift by immed and insert, basic
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^SLI[dv]", "^SRI[dv]")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[V3AEWrite_4c_1V],
+             (instregex "^RSHRNv", "^SQRSHRU?N[bhsv]", "^(SQSHLU?|UQSHL)[bhsd]$",
+                        "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
+                        "^SQSHRU?N[bhsv]", "^SRSHR[dv]", "^UQRSHRN[bhsv]",
+                        "^UQSHRN[bhsv]", "^URSHR[dv]")>;
+
+// ASIMD shift by register, basic
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]SHLv")>;
+
+// ASIMD shift by register, complex
+def : InstRW<[V3AEWrite_4c_1V],
+             (instregex "^[SU]RSHLv", "^[SU]QRSHLv",
+                        "^[SU]QSHL(v1i8|v1i16|v1i32|v1i64|v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)$")>;
+
+// §3.17 ASIMD floating-point instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD FP absolute value/difference
+// ASIMD FP arith, normal
+// ASIMD FP compare
+// ASIMD FP complex add
+// ASIMD FP max/min, normal
+// ASIMD FP max/min, pairwise
+// ASIMD FP negate
+// Handled by SchedAlias<WriteV[dq], ...> (defined under §3.16)
+
+// ASIMD FP complex multiply add
+def : InstRW<[V3AEWr_VFCMA, V3AERd_VFCMA], (instregex "^FCMLAv")>;
+
+// ASIMD FP convert, long (F16 to F32)
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVTL(v4|v8)i16")>;
+
+// ASIMD FP convert, long (F32 to F64)
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FCVTL(v2|v4)i32")>;
+
+// ASIMD FP convert, narrow (F32 to F16)
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVTN(v4|v8)i16")>;
+
+// ASIMD FP convert, narrow (F64 to F32)
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FCVTN(v2|v4)i32",
+                                            "^FCVTXN(v2|v4)f32")>;
+
+// ASIMD FP convert, other, D-form F32 and Q-form F64
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FCVT[AMNPZ][SU]v2f(32|64)$",
+                                            "^FCVT[AMNPZ][SU]v2i(32|64)_shift$",
+                                            "^FCVT[AMNPZ][SU]v1i64$",
+                                            "^FCVTZ[SU]d$",
+                                            "^[SU]CVTFv2f(32|64)$",
+                                            "^[SU]CVTFv2i(32|64)_shift$",
+                                            "^[SU]CVTFv1i64$",
+                                            "^[SU]CVTFd$")>;
+
+// ASIMD FP convert, other, D-form F16 and Q-form F32
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVT[AMNPZ][SU]v4f(16|32)$",
+                                            "^FCVT[AMNPZ][SU]v4i(16|32)_shift$",
+                                            "^FCVT[AMNPZ][SU]v1i32$",
+                                            "^FCVTZ[SU]s$",
+                                            "^[SU]CVTFv4f(16|32)$",
+                                            "^[SU]CVTFv4i(16|32)_shift$",
+                                            "^[SU]CVTFv1i32$",
+                                            "^[SU]CVTFs$")>;
+
+// ASIMD FP convert, other, Q-form F16
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FCVT[AMNPZ][SU]v8f16$",
+                                            "^FCVT[AMNPZ][SU]v8i16_shift$",
+                                            "^FCVT[AMNPZ][SU]v1f16$",
+                                            "^FCVTZ[SU]h$",
+                                            "^[SU]CVTFv8f16$",
+                                            "^[SU]CVTFv8i16_shift$",
+                                            "^[SU]CVTFv1i16$",
+                                            "^[SU]CVTFh$")>;
+
+// ASIMD FP divide, D-form, F16
+def : InstRW<[V3AEWrite_9c_1V1_4rc], (instrs FDIVv4f16)>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[V3AEWrite_9c_1V1_2rc], (instrs FDIVv2f32)>;
+
+// ASIMD FP divide, Q-form, F16
+def : InstRW<[V3AEWrite_13c_1V1_8rc], (instrs FDIVv8f16)>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[V3AEWrite_11c_1V1_4rc], (instrs FDIVv4f32)>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[V3AEWrite_14c_1V1_2rc], (instrs FDIVv2f64)>;
+
+// ASIMD FP max/min, reduce, F32 and D-form F16
+def : InstRW<[V3AEWrite_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
+
+// ASIMD FP max/min, reduce, Q-form F16
+def : InstRW<[V3AEWrite_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
+
+// ASIMD FP multiply
+def : InstRW<[V3AEWr_VFM], (instregex "^FMULv", "^FMULXv")>;
+
+// ASIMD FP multiply accumulate
+def : InstRW<[V3AEWr_VFMA, V3AERd_VFMA], (instregex "^FMLAv", "^FMLSv")>;
+
+// ASIMD FP multiply accumulate long
+def : InstRW<[V3AEWr_VFMAL, V3AERd_VFMAL], (instregex "^FML[AS]L2?(lane)?v")>;
+
+// ASIMD FP round, D-form F32 and Q-form F64
+def : InstRW<[V3AEWrite_3c_1V0],
+             (instregex "^FRINT[AIMNPXZ]v2f(32|64)$",
+                        "^FRINT(32|64)[XZ]v2f(32|64)$")>;
+
+// ASIMD FP round, D-form F16 and Q-form F32
+def : InstRW<[V3AEWrite_4c_2V0],
+             (instregex "^FRINT[AIMNPXZ]v4f(16|32)$",
+                        "^FRINT(32|64)[XZ]v4f32$")>;
+
+// ASIMD FP round, Q-form F16
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
+
+// ASIMD FP square root, D-form, F16
+def : InstRW<[V3AEWrite_9c_1V1_4rc], (instrs FSQRTv4f16)>;
+
+// ASIMD FP square root, D-form, F32
+def : InstRW<[V3AEWrite_9c_1V1_2rc], (instrs FSQRTv2f32)>;
+
+// ASIMD FP square root, Q-form, F16
+def : InstRW<[V3AEWrite_13c_1V1_8rc], (instrs FSQRTv8f16)>;
+
+// ASIMD FP square root, Q-form, F32
+def : InstRW<[V3AEWrite_11c_1V1_4rc], (instrs FSQRTv4f32)>;
+
+// ASIMD FP square root, Q-form, F64
+def : InstRW<[V3AEWrite_14c_1V1_2rc], (instrs FSQRTv2f64)>;
+
+// §3.18 ASIMD BFloat16 (BF16) instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD convert, F32 to BF16
+def : InstRW<[V3AEWrite_4c_2V0], (instrs BFCVTN, BFCVTN2)>;
+
+// ASIMD dot product
+def : InstRW<[V3AEWr_VBFDOT, V3AERd_VBFDOT], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
+
+// ASIMD matrix multiply accumulate
+def : InstRW<[V3AEWr_VBFMMA, V3AERd_VBFMMA], (instrs BFMMLA)>;
+
+// ASIMD multiply accumulate long
+def : InstRW<[V3AEWr_VBFMAL, V3AERd_VBFMAL], (instrs BFMLALB, BFMLALBIdx, BFMLALT,
+                                                     BFMLALTIdx)>;
+
+// Scalar convert, F32 to BF16
+def : InstRW<[V3AEWrite_3c_1V0], (instrs BFCVT)>;
+
+// §3.19 ASIMD miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD bit reverse
+// ASIMD bitwise insert
+// ASIMD count
+// ASIMD duplicate, element
+// ASIMD extract
+// ASIMD extract narrow
+// ASIMD insert, element to element
+// ASIMD move, FP immed
+// ASIMD move, integer immed
+// ASIMD reverse
+// ASIMD table lookup extension, 1 table reg
+// ASIMD transpose
+// ASIMD unzip/zip
+// Handled by SchedAlias<WriteV[dq], ...>, except MOVID/MOVIv2d_ns (see below)
+def : InstRW<[V3AEWrite_0or2c_1V], (instrs MOVID, MOVIv2d_ns)>;
+
+// ASIMD duplicate, gen reg
+def : InstRW<[V3AEWrite_3c_1M0], (instregex "^DUPv.+gpr")>;
+
+// ASIMD extract narrow, saturating
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^[SU]QXTNv", "^SQXTUNv")>;
+
+// ASIMD reciprocal and square root estimate, D-form U32
+def : InstRW<[V3AEWrite_3c_1V0], (instrs URECPEv2i32, URSQRTEv2i32)>;
+
+// ASIMD reciprocal and square root estimate, Q-form U32
+def : InstRW<[V3AEWrite_4c_2V0], (instrs URECPEv4i32, URSQRTEv4i32)>;
+
+// ASIMD reciprocal and square root estimate, D-form F32 and scalar forms
+def : InstRW<[V3AEWrite_3c_1V0], (instrs FRECPEv1f16, FRECPEv1i32,
+                                         FRECPEv1i64, FRECPEv2f32,
+                                         FRSQRTEv1f16, FRSQRTEv1i32,
+                                         FRSQRTEv1i64, FRSQRTEv2f32)>;
+
+// ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32
+def : InstRW<[V3AEWrite_4c_2V0], (instrs FRECPEv4f16, FRECPEv4f32,
+                                         FRSQRTEv4f16, FRSQRTEv4f32)>;
+
+// ASIMD reciprocal and square root estimate, Q-form F16
+def : InstRW<[V3AEWrite_6c_4V0], (instrs FRECPEv8f16, FRSQRTEv8f16)>;
+
+// ASIMD reciprocal exponent
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FRECPXv")>;
+
+// ASIMD reciprocal step
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^FRECPS(32|64|v)",
+                                           "^FRSQRTS(32|64|v)")>;
+
+// ASIMD table lookup, 1 or 2 table regs
+def : InstRW<[V3AEWrite_2c_1V], (instrs TBLv8i8One, TBLv16i8One,
+                                        TBLv8i8Two, TBLv16i8Two)>;
+
+// ASIMD table lookup, 3 table regs
+def : InstRW<[V3AEWrite_4c_2V], (instrs TBLv8i8Three, TBLv16i8Three)>;
+
+// ASIMD table lookup, 4 table regs
+def : InstRW<[V3AEWrite_4c_3V], (instrs TBLv8i8Four, TBLv16i8Four)>;
+
+// ASIMD table lookup extension, 2 table reg
+def : InstRW<[V3AEWrite_4c_2V], (instrs TBXv8i8Two, TBXv16i8Two)>;
+
+// ASIMD table lookup extension, 3 table reg
+def : InstRW<[V3AEWrite_6c_3V], (instrs TBXv8i8Three, TBXv16i8Three)>;
+
+// ASIMD table lookup extension, 4 table reg
+def : InstRW<[V3AEWrite_6c_5V], (instrs TBXv8i8Four, TBXv16i8Four)>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[V3AEWrite_2c_2V], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, gen reg to element
+def : InstRW<[V3AEWrite_5c_1M0_1V], (instregex "^INSvi(8|16|32|64)gpr$")>;
+
+// §3.20 ASIMD load instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_1L],
+             (instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_1L],
+             (instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+def : InstRW<[V3AEWrite_6c_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_2L],
+             (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[V3AEWrite_6c_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_2L],
+             (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+def : InstRW<[V3AEWrite_6c_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_3L],
+             (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[V3AEWrite_6c_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_3L],
+             (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+def : InstRW<[V3AEWrite_7c_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_7c_4L],
+             (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[V3AEWrite_7c_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_7c_4L],
+             (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[V3AEWrite_8c_1L_1V],           (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_1V], (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+def : InstRW<[V3AEWrite_8c_1L_1V],           (instregex "^LD1Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_1V], (instregex "^LD1Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[V3AEWrite_8c_1L_1V],           (instregex "^LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_1V], (instregex "^LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+def : InstRW<[V3AEWrite_8c_1L_2V],           (instregex "^LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_2V], (instregex "^LD2Twov(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 2 element, multiple, Q-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[V3AEWrite_8c_2L_2V],           (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_2L_2V], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[V3AEWrite_8c_1L_2V],           (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_2V], (instregex "^LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+def : InstRW<[V3AEWrite_8c_1L_2V],           (instregex "^LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_2V], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[V3AEWrite_8c_1L_2V],           (instregex "^LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_1L_2V], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+def : InstRW<[V3AEWrite_8c_2L_3V],           (instregex "^LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_2L_3V], (instregex "^LD3Threev(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[V3AEWrite_8c_3L_3V],           (instregex "^LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_3L_3V], (instregex "^LD3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[V3AEWrite_8c_2L_3V],           (instregex "^LD3i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_2L_3V], (instregex "^LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+def : InstRW<[V3AEWrite_8c_2L_3V],           (instregex "^LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_2L_3V], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[V3AEWrite_8c_3L_3V],           (instregex "^LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_3L_3V], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+def : InstRW<[V3AEWrite_8c_3L_4V],           (instregex "^LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_3L_4V], (instregex "^LD4Fourv(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[V3AEWrite_9c_6L_4V],           (instregex "^LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_9c_6L_4V], (instregex "^LD4Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[V3AEWrite_8c_3L_4V],           (instregex "^LD4i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_3L_4V], (instregex "^LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+def : InstRW<[V3AEWrite_8c_3L_4V],           (instregex "^LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_3L_4V], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[V3AEWrite_8c_4L_4V],           (instregex "^LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_8c_4L_4V], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>;
+
+// §3.21 ASIMD store instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+def : InstRW<[V3AEWrite_2c_1SA_1V],           (instregex "^ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V], (instregex "^ST1Onev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[V3AEWrite_2c_1SA_1V],           (instregex "^ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V], (instregex "^ST1Onev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+def : InstRW<[V3AEWrite_2c_1SA_1V],           (instregex "^ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_1SA_1V], (instregex "^ST1Twov(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[V3AEWrite_2c_2SA_2V],           (instregex "^ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_2SA_2V], (instregex "^ST1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+def : InstRW<[V3AEWrite_2c_2SA_2V],           (instregex "^ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_2SA_2V], (instregex "^ST1Threev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[V3AEWrite_2c_3SA_3V],           (instregex "^ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_3SA_3V], (instregex "^ST1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+def : InstRW<[V3AEWrite_2c_2SA_2V],           (instregex "^ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_2SA_2V], (instregex "^ST1Fourv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[V3AEWrite_2c_4SA_4V],           (instregex "^ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_2c_4SA_4V], (instregex "^ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[V3AEWrite_4c_1SA_2V],           (instregex "^ST1i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_4c_1SA_2V], (instregex "^ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+def : InstRW<[V3AEWrite_4c_1SA_2V],           (instregex "^ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_4c_1SA_2V], (instregex "^ST2Twov(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[V3AEWrite_4c_2SA_4V],           (instregex "^ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_4c_2SA_4V], (instregex "^ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[V3AEWrite_4c_1SA_2V],           (instregex "^ST2i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_4c_1SA_2V], (instregex "^ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+def : InstRW<[V3AEWrite_5c_2SA_4V],           (instregex "^ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_5c_2SA_4V], (instregex "^ST3Threev(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[V3AEWrite_6c_3SA_6V],           (instregex "^ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_3SA_6V], (instregex "^ST3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[V3AEWrite_5c_2SA_4V],           (instregex "^ST3i(8|16|32|64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_5c_2SA_4V], (instregex "^ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+def : InstRW<[V3AEWrite_6c_2SA_6V],           (instregex "^ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_2SA_6V], (instregex "^ST4Fourv(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+def : InstRW<[V3AEWrite_7c_4SA_12V],           (instregex "^ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_7c_4SA_12V], (instregex "^ST4Fourv(16b|8h|4s)_POST$")>;
+
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[V3AEWrite_5c_4SA_8V],           (instregex "^ST4Fourv(2d)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_5c_4SA_8V], (instregex "^ST4Fourv(2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H/S
+def : InstRW<[V3AEWrite_6c_1SA_3V],           (instregex "^ST4i(8|16|32)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_6c_1SA_3V], (instregex "^ST4i(8|16|32)_POST$")>;
+
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[V3AEWrite_4c_2SA_4V],           (instregex "^ST4i(64)$")>;
+def : InstRW<[WriteAdr, V3AEWrite_4c_2SA_4V], (instregex "^ST4i(64)_POST$")>;
+
+// §3.22 Cryptography extensions
+// -----------------------------------------------------------------------------
+
+// Crypto AES ops
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^AES[DE]rr$", "^AESI?MCrr")>;
+
+// Crypto polynomial (64x64) multiply long
+def : InstRW<[V3AEWrite_2c_1V], (instrs PMULLv1i64, PMULLv2i64)>;
+
+// Crypto SHA1 hash acceleration op
+// Crypto SHA1 schedule acceleration ops
+def : InstRW<[V3AEWrite_2c_1V0], (instregex "^SHA1(H|SU0|SU1)")>;
+
+// Crypto SHA1 hash acceleration ops
+// Crypto SHA256 hash acceleration ops
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SHA1[CMP]", "^SHA256H2?")>;
+
+// Crypto SHA256 schedule acceleration ops
+def : InstRW<[V3AEWrite_2c_1V0], (instregex "^SHA256SU[01]")>;
+
+// Crypto SHA512 hash acceleration ops
+def : InstRW<[V3AEWrite_2c_1V0], (instregex "^SHA512(H|H2|SU0|SU1)")>;
+
+// Crypto SHA3 ops
+def : InstRW<[V3AEWrite_2c_1V], (instrs BCAX, EOR3, RAX1, XAR)>;
+
+// Crypto SM3 ops
+def : InstRW<[V3AEWrite_2c_1V0], (instregex "^SM3PARTW[12]$", "^SM3SS1$",
+                                            "^SM3TT[12][AB]$")>;
+
+// Crypto SM4 ops
+def : InstRW<[V3AEWrite_4c_1V0], (instrs SM4E, SM4ENCKEY)>;
+
+// §3.23 CRC
+// -----------------------------------------------------------------------------
+
+def : InstRW<[V3AEWr_CRC, V3AERd_CRC], (instregex "^CRC32")>;
+
+// §3.24 SVE Predicate instructions
+// -----------------------------------------------------------------------------
+
+// Loop control, based on predicate
+def : InstRW<[V3AEWrite_2or3c_1M], (instrs BRKA_PPmP, BRKA_PPzP,
+                                           BRKB_PPmP, BRKB_PPzP)>;
+
+// Loop control, based on predicate and flag setting
+def : InstRW<[V3AEWrite_2or3c_1M], (instrs BRKAS_PPzP, BRKBS_PPzP)>;
+
+// Loop control, propagating
+def : InstRW<[V3AEWrite_2or3c_1M], (instrs BRKN_PPzP, BRKPA_PPzPP,
+                                           BRKPB_PPzPP)>;
+
+// Loop control, propagating and flag setting
+def : InstRW<[V3AEWrite_2or3c_1M], (instrs BRKNS_PPzP, BRKPAS_PPzPP,
+                                           BRKPBS_PPzPP)>;
+
+// Loop control, based on GPR
+def : InstRW<[V3AEWrite_3c_2M],
+             (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>;
+def : InstRW<[V3AEWrite_3c_2M], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>;
+
+// Loop terminate
+def : InstRW<[V3AEWrite_1c_2M], (instregex "^CTERM(EQ|NE)_(WW|XX)")>;
+
+// Predicate counting scalar
+def : InstRW<[V3AEWrite_2c_1M], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>;
+def : InstRW<[V3AEWrite_2c_1M],
+             (instregex "^(CNT|SQDEC|SQINC|UQDEC|UQINC)[BHWD]_XPiI",
+                        "^SQ(DEC|INC)[BHWD]_XPiWdI",
+                        "^UQ(DEC|INC)[BHWD]_WPiI")>;
+
+// Predicate counting scalar, ALL, {1,2,4}
+def : InstRW<[V3AEWrite_IncDec], (instregex "^(DEC|INC)[BHWD]_XPiI")>;
+
+// Predicate counting scalar, active predicate
+def : InstRW<[V3AEWrite_2c_1M],
+             (instregex "^CNTP_XPP_[BHSD]",
+                        "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]",
+                        "^(UQDEC|UQINC)P_WP_[BHSD]",
+                        "^(SQDEC|SQINC)P_XPWd_[BHSD]")>;
+
+// Predicate counting vector, active predicate
+def : InstRW<[V3AEWrite_7c_1M_1M0_1V],
+             (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>;
+
+// Predicate logical
+def : InstRW<[V3AEWrite_1or2c_1M],
+             (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>;
+
+// Predicate logical, flag setting
+def : InstRW<[V3AEWrite_1or2c_1M],
+             (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>;
+
+// Predicate reverse
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^REV_PP_[BHSD]")>;
+
+// Predicate select
+def : InstRW<[V3AEWrite_1c_1M], (instrs SEL_PPPP)>;
+
+// Predicate set
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^PFALSE", "^PTRUE_[BHSD]")>;
+
+// Predicate set/initialize, set flags
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^PTRUES_[BHSD]")>;
+
+// Predicate find first/next
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>;
+
+// Predicate test
+def : InstRW<[V3AEWrite_1c_1M], (instrs PTEST_PP)>;
+
+// Predicate transpose
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^TRN[12]_PPP_[BHSD]")>;
+
+// Predicate unpack and widen
+def : InstRW<[V3AEWrite_2c_1M], (instrs PUNPKHI_PP, PUNPKLO_PP)>;
+
+// Predicate zip/unzip
+def : InstRW<[V3AEWrite_2c_1M], (instregex "^(ZIP|UZP)[12]_PPP_[BHSD]")>;
+
+// §3.25 SVE integer instructions
+// -----------------------------------------------------------------------------
+
+// Arithmetic, absolute diff
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]",
+                                           "^[SU]ABD_ZPZZ_[BHSD]")>;
+
+// Arithmetic, absolute diff accum
+def : InstRW<[V3AEWr_ZA, V3AERd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]")>;
+
+// Arithmetic, absolute diff accum long
+def : InstRW<[V3AEWr_ZA, V3AERd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>;
+
+// Arithmetic, absolute diff long
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>;
+
+// Arithmetic, basic
+def : InstRW<[V3AEWrite_2c_1V],
+             (instregex "^(ABS|ADD|CNOT|NEG|SUB|SUBR)_ZPmZ_[BHSD]",
+                        "^(ADD|SUB)_ZZZ_[BHSD]",
+                        "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]",
+                        "^(ADD|SUB|SUBR)_ZI_[BHSD]",
+                        "^ADR_[SU]XTW_ZZZ_D_[0123]",
+                        "^ADR_LSL_ZZZ_[SD]_[0123]",
+                        "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
+                        "^SADDLBT_ZZZ_[HSD]",
+                        "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
+                        "^SSUBL(BT|TB)_ZZZ_[HSD]")>;
+
+// Arithmetic, complex
+def : InstRW<[V3AEWrite_2c_1V],
+             (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]",
+                        "^SQ(ABS|ADD|NEG|SUB|SUBR)_ZPmZ_[BHSD]",
+                        "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]",
+                        "^[SU]Q(ADD|SUB)_ZI_[BHSD]",
+                        "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]",
+                        "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>;
+
+// Arithmetic, large integer
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>;
+
+// Arithmetic, pairwise add
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^ADDP_ZPmZ_[BHSD]")>;
+
+// Arithmetic, pairwise add and accum long
+def : InstRW<[V3AEWr_ZPA, ReadDefault, V3AERd_ZPA],
+             (instregex "^[SU]ADALP_ZPmZ_[HSD]")>;
+
+// Arithmetic, shift
+def : InstRW<[V3AEWrite_2c_1V1],
+             (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]",
+                        "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]",
+                        "^(ASR|LSL|LSR)_ZPmI_[BHSD]",
+                        "^(ASR|LSL|LSR)_ZPmZ_[BHSD]",
+                        "^(ASR|LSL|LSR)_ZZI_[BHSD]",
+                        "^(ASR|LSL|LSR)_ZPZ[IZ]_[BHSD]",
+                        "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
+
+// Arithmetic, shift and accumulate
+def : InstRW<[V3AEWr_ZSA, V3AERd_ZSA], (instregex "^[SU]R?SRA_ZZI_[BHSD]")>;
+
+// Arithmetic, shift by immediate
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^SHRN[BT]_ZZI_[BHS]",
+                                           "^[SU]SHLL[BT]_ZZI_[HSD]")>;
+
+// Arithmetic, shift by immediate and insert
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(SLI|SRI)_ZZI_[BHSD]")>;
+
+// Arithmetic, shift complex
+def : InstRW<[V3AEWrite_4c_1V],
+             (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]",
+                        "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_ZPmZ_[BHSD]",
+                        "^[SU]QR?SHL_ZPZZ_[BHSD]",
+                        "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]",
+                        "^SQSHRU?N[BT]_ZZI_[BHS]",
+                        "^UQR?SHRN[BT]_ZZI_[BHS]")>;
+
+// Arithmetic, shift right for divide
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^ASRD_(ZPmI|ZPZI)_[BHSD]")>;
+
+// Arithmetic, shift rounding
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^[SU]RSHLR?_ZPmZ_[BHSD]",
+                                           "^[SU]RSHL_ZPZZ_[BHSD]",
+                                           "^[SU]RSHR_(ZPmI|ZPZI)_[BHSD]")>;
+
+// Bit manipulation
+def : InstRW<[V3AEWrite_6c_2V1], (instregex "^(BDEP|BEXT|BGRP)_ZZZ_[BHSD]")>;
+
+// Bitwise select
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>;
+
+// Count/reverse bits
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(CLS|CLZ|CNT|RBIT)_ZPmZ_[BHSD]")>;
+
+// Broadcast logical bitmask immediate to vector
+def : InstRW<[V3AEWrite_2c_1V], (instrs DUPM_ZI)>;
+
+// Compare and set flags
+def : InstRW<[V3AEWrite_2or3c_1V0],
+             (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]",
+                        "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>;
+
+// Complex add
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]")>;
+
+// Complex dot product 8-bit element
+def : InstRW<[V3AEWr_ZDOTB, V3AERd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
+
+// Complex dot product 16-bit element
+def : InstRW<[V3AEWr_ZDOTH, V3AERd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
+
+// Complex multiply-add B, H, S element size
+def : InstRW<[V3AEWr_ZCMABHS, V3AERd_ZCMABHS], (instregex "^CMLA_ZZZ_[BHS]",
+                                                          "^CMLA_ZZZI_[HS]")>;
+
+// Complex multiply-add D element size
+def : InstRW<[V3AEWr_ZCMAD, V3AERd_ZCMAD], (instrs CMLA_ZZZ_D)>;
+
+// Conditional extract operations, scalar form
+def : InstRW<[V3AEWrite_8c_1M0_1V], (instregex "^CLAST[AB]_RPZ_[BHSD]")>;
+
+// Conditional extract operations, SIMD&FP scalar and vector forms
+def : InstRW<[V3AEWrite_3c_1V1], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]",
+                                            "^COMPACT_ZPZ_[SD]",
+                                            "^SPLICE_ZPZZ?_[BHSD]")>;
+
+// Convert to floating point, 64b to float or convert to double
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^[SU]CVTF_ZPmZ_Dto[HSD]",
+                                            "^[SU]CVTF_ZPmZ_StoD")>;
+
+// Convert to floating point, 32b to single or half
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>;
+
+// Convert to floating point, 16b to half
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^[SU]CVTF_ZPmZ_HtoH")>;
+
+// Copy, scalar
+def : InstRW<[V3AEWrite_5c_1M0_1V], (instregex "^CPY_ZPmR_[BHSD]")>;
+
+// Copy, scalar SIMD&FP or imm
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^CPY_ZPm[IV]_[BHSD]",
+                                           "^CPY_ZPzI_[BHSD]")>;
+
+// Divides, 32 bit
+def : InstRW<[V3AEWrite_12c_1V0], (instregex "^[SU]DIVR?_ZPmZ_S",
+                                             "^[SU]DIV_ZPZZ_S")>;
+
+// Divides, 64 bit
+def : InstRW<[V3AEWrite_20c_1V0], (instregex "^[SU]DIVR?_ZPmZ_D",
+                                             "^[SU]DIV_ZPZZ_D")>;
+
+// Dot product, 8 bit
+def : InstRW<[V3AEWr_ZDOTB, V3AERd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS")>;
+
+// Dot product, 8 bit, using signed and unsigned integers
+def : InstRW<[V3AEWr_ZDOTB, V3AERd_ZDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
+
+// Dot product, 16 bit
+def : InstRW<[V3AEWr_ZDOTH, V3AERd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD")>;
+
+// Duplicate, immediate and indexed form
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^DUP_ZI_[BHSD]",
+                                           "^DUP_ZZI_[BHSDQ]")>;
+
+// Duplicate, scalar form
+def : InstRW<[V3AEWrite_3c_1M0], (instregex "^DUP_ZR_[BHSD]")>;
+
+// Extend, sign or zero
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]XTB_ZPmZ_[HSD]",
+                                           "^[SU]XTH_ZPmZ_[SD]",
+                                           "^[SU]XTW_ZPmZ_[D]")>;
+
+// Extract
+def : InstRW<[V3AEWrite_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>;
+
+// Extract narrow saturating
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
+                                           "^SQXTUN[BT]_ZZ_[BHS]")>;
+
+// Extract operation, SIMD and FP scalar form
+def : InstRW<[V3AEWrite_3c_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]")>;
+
+// Extract operation, scalar
+def : InstRW<[V3AEWrite_6c_1V1_1M0], (instregex "^LAST[AB]_RPZ_[BHSD]")>;
+
+// Histogram operations
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^HISTCNT_ZPzZZ_[SD]",
+                                           "^HISTSEG_ZZZ")>;
+
+// Horizontal operations, B, H, S form, immediate operands only
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^INDEX_II_[BHS]")>;
+
+// Horizontal operations, B, H, S form, scalar, immediate operands / scalar
+// operands only / immediate, scalar operands
+def : InstRW<[V3AEWrite_7c_1M0_1V0], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>;
+
+// Horizontal operations, D form, immediate operands only
+def : InstRW<[V3AEWrite_5c_2V0], (instrs INDEX_II_D)>;
+
+// Horizontal operations, D form, scalar, immediate operands / scalar operands
+// only / immediate, scalar operands
+def : InstRW<[V3AEWrite_8c_2M0_2V0], (instregex "^INDEX_(IR|RI|RR)_D")>;
+
+// Insert operation, SIMD and FP scalar form
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^INSR_ZV_[BHSD]")>;
+
+// Insert operation, scalar
+def : InstRW<[V3AEWrite_5c_1V1_1M0], (instregex "^INSR_ZR_[BHSD]")>;
+
+// Logical
+def : InstRW<[V3AEWrite_2c_1V],
+             (instregex "^(AND|EOR|ORR)_ZI",
+                        "^(AND|BIC|EOR|ORR)_ZZZ",
+                        "^EOR(BT|TB)_ZZZ_[BHSD]",
+                        "^(AND|BIC|EOR|NOT|ORR)_(ZPmZ|ZPZZ)_[BHSD]",
+                        "^NOT_ZPmZ_[BHSD]")>;
+
+// Max/min, basic and pairwise
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
+                                           "^[SU](MAX|MIN)P?_ZPmZ_[BHSD]",
+                                           "^[SU](MAX|MIN)_ZPZZ_[BHSD]")>;
+
+// Matching operations
+// FIXME: SOG p. 44, n. 5: If the consuming instruction has a flag source, the
+// latency for this instruction is 4 cycles.
+def : InstRW<[V3AEWrite_2or3c_1V0_1M], (instregex "^N?MATCH_PPzZZ_[BH]")>;
+
+// Matrix multiply-accumulate
+def : InstRW<[V3AEWr_ZMMA, V3AERd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+
+// Move prefix
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]",
+                                           "^MOVPRFX_ZZ")>;
+
+// Multiply, B, H, S element size
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
+                                            "^MUL_ZPZZ_[BHS]",
+                                            "^[SU]MULH_(ZPmZ|ZZZ)_[BHS]",
+                                            "^[SU]MULH_ZPZZ_[BHS]")>;
+
+// Multiply, D element size
+def : InstRW<[V3AEWrite_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D",
+                                           "^MUL_ZPZZ_D",
+                                           "^[SU]MULH_(ZPmZ|ZZZ)_D",
+                                           "^[SU]MULH_ZPZZ_D")>;
+
+// Multiply long
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^[SU]MULL[BT]_ZZZI_[SD]",
+                                            "^[SU]MULL[BT]_ZZZ_[HSD]")>;
+
+// Multiply accumulate, B, H, S element size
+def : InstRW<[V3AEWr_ZMABHS, V3AERd_ZMABHS],
+             (instregex "^ML[AS]_ZZZI_[HS]", "^ML[AS]_ZPZZZ_[BHS]")>;
+def : InstRW<[V3AEWr_ZMABHS, ReadDefault, V3AERd_ZMABHS],
+             (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>;
+
+// Multiply accumulate, D element size
+def : InstRW<[V3AEWr_ZMAD, V3AERd_ZMAD],
+             (instregex "^ML[AS]_ZZZI_D", "^ML[AS]_ZPZZZ_D")>;
+def : InstRW<[V3AEWr_ZMAD, ReadDefault, V3AERd_ZMAD],
+             (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
+
+// Multiply accumulate long
+def : InstRW<[V3AEWr_ZMAL, V3AERd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]",
+                                                    "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>;
+
+// Multiply accumulate saturating doubling long regular
+def : InstRW<[V3AEWr_ZMASQL, V3AERd_ZMASQ],
+             (instregex "^SQDML[AS]L(B|T|BT)_ZZZ_[HSD]",
+                        "^SQDML[AS]L[BT]_ZZZI_[SD]")>;
+
+// Multiply saturating doubling high, B, H, S element size
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SQDMULH_ZZZ_[BHS]",
+                                            "^SQDMULH_ZZZI_[HS]")>;
+
+// Multiply saturating doubling high, D element size
+def : InstRW<[V3AEWrite_5c_2V0], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>;
+
+// Multiply saturating doubling long
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SQDMULL[BT]_ZZZ_[HSD]",
+                                            "^SQDMULL[BT]_ZZZI_[SD]")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, B, H, S
+// element size
+def : InstRW<[V3AEWr_ZMASQBHS, V3AERd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]",
+                                                         "^SQRDCMLAH_ZZZ_[BHS]",
+                                                         "^SQRDML[AS]H_ZZZI_[HS]",
+                                                         "^SQRDCMLAH_ZZZI_[HS]")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, D element
+// size
+def : InstRW<[V3AEWr_ZMASQD, V3AERd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D",
+                                                   "^SQRDCMLAH_ZZZ_D")>;
+
+// Multiply saturating rounding doubling regular/complex, B, H, S element size
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SQRDMULH_ZZZ_[BHS]",
+                                            "^SQRDMULH_ZZZI_[HS]")>;
+
+// Multiply saturating rounding doubling regular/complex, D element size
+def : InstRW<[V3AEWrite_5c_2V0], (instregex "^SQRDMULH_ZZZI?_D")>;
+
+// Multiply/multiply long, (8x8) polynomial
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^PMUL_ZZZ_B",
+                                           "^PMULL[BT]_ZZZ_[HDQ]")>;
+
+// Predicate counting vector
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^([SU]Q)?(DEC|INC)[HWD]_ZPiI")>;
+
+// Reciprocal estimate
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>;
+
+// Reduction, arithmetic, B form
+def : InstRW<[V3AEWrite_9c_2V_4V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>;
+
+// Reduction, arithmetic, H form
+def : InstRW<[V3AEWrite_8c_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>;
+
+// Reduction, arithmetic, S form
+def : InstRW<[V3AEWrite_6c_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>;
+
+// Reduction, arithmetic, D form
+def : InstRW<[V3AEWrite_4c_2V], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>;
+
+// Reduction, logical
+def : InstRW<[V3AEWrite_6c_1V_1V1], (instregex "^(AND|EOR|OR)V_VPZ_[BHSD]")>;
+
+// Reverse, vector
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^REV_ZZ_[BHSD]",
+                                           "^REVB_ZPmZ_[HSD]",
+                                           "^REVH_ZPmZ_[SD]",
+                                           "^REVW_ZPmZ_D")>;
+
+// Select, vector form
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^SEL_ZPZZ_[BHSD]")>;
+
+// Table lookup
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^TBL_ZZZZ?_[BHSD]")>;
+
+// Table lookup extension
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^TBX_ZZZ_[BHSD]")>;
+
+// Transpose, vector form
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>;
+
+// Unpack and extend
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>;
+
+// Zip/unzip
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>;
+
+// §3.26 SVE floating-point instructions
+// -----------------------------------------------------------------------------
+
+// Floating point absolute value/difference
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^FAB[SD]_ZPmZ_[HSD]",
+                                           "^FABD_ZPZZ_[HSD]",
+                                           "^FABS_ZPmZ_[HSD]")>;
+
+// Floating point arithmetic
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]",
+                                           "^F(ADD|SUB)_ZPZ[IZ]_[HSD]",
+                                           "^FADDP_ZPmZZ_[HSD]",
+                                           "^FNEG_ZPmZ_[HSD]",
+                                           "^FSUBR_ZPm[IZ]_[HSD]",
+                                           "^FSUBR_(ZPZI|ZPZZ)_[HSD]")>;
+
+// Floating point associative add, F16
+def : InstRW<[V3AEWrite_10c_1V1_9rc], (instrs FADDA_VPZ_H)>;
+
+// Floating point associative add, F32
+def : InstRW<[V3AEWrite_6c_1V1_5rc], (instrs FADDA_VPZ_S)>;
+
+// Floating point associative add, F64
+def : InstRW<[V3AEWrite_4c_1V], (instrs FADDA_VPZ_D)>;
+
+// Floating point compare
+def : InstRW<[V3AEWrite_2c_1V0], (instregex "^FACG[ET]_PPzZZ_[HSD]",
+                                            "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
+                                            "^FCM(LE|LT)_PPzZ0_[HSD]",
+                                            "^FCMUO_PPzZZ_[HSD]")>;
+
+// Floating point complex add
+def : InstRW<[V3AEWrite_3c_1V], (instregex "^FCADD_ZPmZ_[HSD]")>;
+
+// Floating point complex multiply add
+def : InstRW<[V3AEWr_ZFCMA, ReadDefault, V3AERd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>;
+def : InstRW<[V3AEWr_ZFCMA, V3AERd_ZFCMA],              (instregex "^FCMLA_ZZZI_[HS]")>;
+
+// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
+                                            "^FCVTLT_ZPmZ_HtoS",
+                                            "^FCVTNT_ZPmZ_StoH")>;
+
+// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32
+// or F64 to F16)
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)",
+                                            "^FCVTLT_ZPmZ_StoD",
+                                            "^FCVTNT_ZPmZ_DtoS")>;
+
+// Floating point convert, round to odd
+def : InstRW<[V3AEWrite_3c_1V0], (instrs FCVTX_ZPmZ_DtoS, FCVTXNT_ZPmZ_DtoS)>;
+
+// Floating point base2 log, F16
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>;
+
+// Floating point base2 log, F32
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>;
+
+// Floating point base2 log, F64
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>;
+
+// Floating point convert to integer, F16
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>;
+
+// Floating point convert to integer, F32
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>;
+
+// Floating point convert to integer, F64
+def : InstRW<[V3AEWrite_3c_1V0],
+             (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>;
+
+// Floating point copy
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^FCPY_ZPmI_[HSD]",
+                                           "^FDUP_ZI_[HSD]")>;
+
+// Floating point divide, F16
+def : InstRW<[V3AEWrite_13c_1V1_8rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>;
+
+// Floating point divide, F32
+def : InstRW<[V3AEWrite_11c_1V1_4rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>;
+
+// Floating point divide, F64
+def : InstRW<[V3AEWrite_14c_1V1_2rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>;
+
+// Floating point min/max pairwise
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>;
+
+// Floating point min/max
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]",
+                                           "^F(MAX|MIN)(NM)?_ZPZ[IZ]_[HSD]")>;
+
+// Floating point multiply
+def : InstRW<[V3AEWrite_3c_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
+                                           "^FMULX_ZPZZ_[HSD]",
+                                           "^FMUL_(ZPm[IZ]|ZZZI?)_[HSD]",
+                                           "^FMUL_ZPZ[IZ]_[HSD]")>;
+
+// Floating point multiply accumulate
+def : InstRW<[V3AEWr_ZFMA, ReadDefault, V3AERd_ZFMA],
+             (instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
+                        "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
+def : InstRW<[V3AEWr_ZFMA, V3AERd_ZFMA],
+             (instregex "^FML[AS]_ZZZI_[HSD]",
+                        "^FN?ML[AS]_ZPZZZ_[HSD]")>;
+
+// Floating point multiply add/sub accumulate long
+def : InstRW<[V3AEWr_ZFMAL, V3AERd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>;
+
+// Floating point reciprocal estimate, F16
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>;
+
+// Floating point reciprocal estimate, F32
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FR(ECP|SQRT)E_ZZ_S", "^FRECPX_ZPmZ_S")>;
+
+// Floating point reciprocal estimate, F64
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FR(ECP|SQRT)E_ZZ_D", "^FRECPX_ZPmZ_D")>;
+
+// Floating point reciprocal step
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
+
+// Floating point reduction, F16
+def : InstRW<[V3AEWrite_8c_4V],
+             (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_H")>;
+
+// Floating point reduction, F32
+def : InstRW<[V3AEWrite_6c_3V],
+             (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_S")>;
+
+// Floating point reduction, F64
+def : InstRW<[V3AEWrite_4c_2V],
+             (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_D")>;
+
+// Floating point round to integral, F16
+def : InstRW<[V3AEWrite_6c_4V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>;
+
+// Floating point round to integral, F32
+def : InstRW<[V3AEWrite_4c_2V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>;
+
+// Floating point round to integral, F64
+def : InstRW<[V3AEWrite_3c_1V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>;
+
+// Floating point square root, F16
+def : InstRW<[V3AEWrite_13c_1V1_8rc], (instregex "^FSQRT_ZPmZ_H")>;
+
+// Floating point square root, F32
+def : InstRW<[V3AEWrite_11c_1V1_4rc], (instregex "^FSQRT_ZPmZ_S")>;
+
+// Floating point square root, F64
+def : InstRW<[V3AEWrite_14c_1V1_2rc], (instregex "^FSQRT_ZPmZ_D")>;
+
+// Floating point trigonometric exponentiation
+def : InstRW<[V3AEWrite_3c_1V1], (instregex "^FEXPA_ZZ_[HSD]")>;
+
+// Floating point trigonometric multiply add
+def : InstRW<[V3AEWrite_4c_1V], (instregex "^FTMAD_ZZI_[HSD]")>;
+
+// Floating point trigonometric, miscellaneous
+def : InstRW<[V3AEWrite_3c_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]")>;
+
+// §3.27 SVE BFloat16 (BF16) instructions
+// -----------------------------------------------------------------------------
+
+// Convert, F32 to BF16
+def : InstRW<[V3AEWrite_4c_1V], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
+
+// Dot product
+def : InstRW<[V3AEWr_ZBFDOT, V3AERd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
+
+// Matrix multiply accumulate
+def : InstRW<[V3AEWr_ZBFMMA, V3AERd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>;
+
+// Multiply accumulate long
+def : InstRW<[V3AEWr_ZBFMAL, V3AERd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZI?")>;
+
+// §3.28 SVE Load instructions
+// -----------------------------------------------------------------------------
+
+// Load vector
+def : InstRW<[V3AEWrite_6c_1L], (instrs LDR_ZXI)>;
+
+// Load predicate
+def : InstRW<[V3AEWrite_6c_1L_1M], (instrs LDR_PXI)>;
+
+// Contiguous load, scalar + imm
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1[BHWD]_IMM$",
+                                           "^LD1S?B_[HSD]_IMM$",
+                                           "^LD1S?H_[SD]_IMM$",
+                                           "^LD1S?W_D_IMM$" )>;
+// Contiguous load, scalar + scalar
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1[BHWD]$",
+                                           "^LD1S?B_[HSD]$",
+                                           "^LD1S?H_[SD]$",
+                                           "^LD1S?W_D$" )>;
+
+// Contiguous load broadcast, scalar + imm
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1R[BHWD]_IMM$",
+                                           "^LD1RS?B_[HSD]_IMM$",
+                                           "^LD1RS?H_[SD]_IMM$",
+                                           "^LD1RW_D_IMM$",
+                                           "^LD1RSW_IMM$",
+                                           "^LD1RQ_[BHWD]_IMM$")>;
+
+// Contiguous load broadcast, scalar + scalar
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LD1RQ_[BHWD]$")>;
+
+// Non temporal load, scalar + imm
+// Non temporal load, scalar + scalar
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LDNT1[BHWD]_ZR[IR]$")>;
+
+// Non temporal gather load, vector + scalar 32-bit element size
+def : InstRW<[V3AEWrite_9c_2L_4V], (instregex "^LDNT1[BHW]_ZZR_S$",
+                                              "^LDNT1S[BH]_ZZR_S$")>;
+
+// Non temporal gather load, vector + scalar 64-bit element size
+def : InstRW<[V3AEWrite_9c_2L_2V], (instregex "^LDNT1S?[BHW]_ZZR_D$")>;
+def : InstRW<[V3AEWrite_9c_2L_2V], (instrs LDNT1D_ZZR_D)>;
+
+// Contiguous first faulting load, scalar + scalar
+def : InstRW<[V3AEWrite_6c_1L_1I], (instregex "^LDFF1[BHWD]$",
+                                              "^LDFF1S?B_[HSD]$",
+                                              "^LDFF1S?H_[SD]$",
+                                              "^LDFF1S?W_D$")>;
+
+// Contiguous non faulting load, scalar + imm
+def : InstRW<[V3AEWrite_6c_1L], (instregex "^LDNF1[BHWD]_IMM$",
+                                           "^LDNF1S?B_[HSD]_IMM$",
+                                           "^LDNF1S?H_[SD]_IMM$",
+                                           "^LDNF1S?W_D_IMM$")>;
+
+// Contiguous Load two structures to two vectors, scalar + imm
+def : InstRW<[V3AEWrite_8c_2L_2V], (instregex "^LD2[BHWD]_IMM$")>;
+
+// Contiguous Load two structures to two vectors, scalar + scalar
+def : InstRW<[V3AEWrite_9c_2L_2V_2I], (instregex "^LD2[BHWD]$")>;
+
+// Contiguous Load three structures to three vectors, scalar + imm
+def : InstRW<[V3AEWrite_9c_3L_3V], (instregex "^LD3[BHWD]_IMM$")>;
+
+// Contiguous Load three structures to three vectors, scalar + scalar
+def : InstRW<[V3AEWrite_10c_3V_3L_3I], (instregex "^LD3[BHWD]$")>;
+
+// Contiguous Load four structures to four vectors, scalar + imm
+def : InstRW<[V3AEWrite_9c_4L_8V], (instregex "^LD4[BHWD]_IMM$")>;
+
+// Contiguous Load four structures to four vectors, scalar + scalar
+def : InstRW<[V3AEWrite_10c_4L_8V_4I], (instregex "^LD4[BHWD]$")>;
+
+// Gather load, vector + imm, 32-bit element size
+def : InstRW<[V3AEWrite_9c_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_IMM$",
+                                              "^GLD(FF)?1W_IMM$")>;
+
+// Gather load, vector + imm, 64-bit element size
+def : InstRW<[V3AEWrite_9c_1L_4V], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$",
+                                              "^GLD(FF)?1D_IMM$")>;
+
+// Gather load, 32-bit scaled offset
+def : InstRW<[V3AEWrite_10c_1L_8V],
+             (instregex "^GLD(FF)?1S?H_S_[SU]XTW_SCALED$",
+                        "^GLD(FF)?1W_[SU]XTW_SCALED")>;
+
+// Gather load, 64-bit scaled offset
+// NOTE: These instructions are not specified in the SOG.
+def : InstRW<[V3AEWrite_10c_1L_4V],
+             (instregex "^GLD(FF)?1S?[HW]_D_([SU]XTW_)?SCALED$",
+                        "^GLD(FF)?1D_([SU]XTW_)?SCALED$")>;
+
+// Gather load, 32-bit unpacked unscaled offset
+def : InstRW<[V3AEWrite_9c_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$",
+                                            "^GLD(FF)?1W_[SU]XTW$")>;
+
+// Gather load, 64-bit unpacked unscaled offset
+// NOTE: These instructions are not specified in the SOG.
+def : InstRW<[V3AEWrite_9c_1L_2V],
+             (instregex "^GLD(FF)?1S?[BHW]_D(_[SU]XTW)?$",
+                        "^GLD(FF)?1D(_[SU]XTW)?$")>;
+
+// §3.29 SVE Store instructions
+// -----------------------------------------------------------------------------
+
+// Store from predicate reg
+def : InstRW<[V3AEWrite_1c_1SA], (instrs STR_PXI)>;
+
+// Store from vector reg
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instrs STR_ZXI)>;
+
+// Contiguous store, scalar + imm
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^ST1[BHWD]_IMM$",
+                                               "^ST1B_[HSD]_IMM$",
+                                               "^ST1H_[SD]_IMM$",
+                                               "^ST1W_D_IMM$")>;
+
+// Contiguous store, scalar + scalar
+def : InstRW<[V3AEWrite_2c_1SA_1I_1V], (instregex "^ST1H(_[SD])?$")>;
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^ST1[BWD]$",
+                                               "^ST1B_[HSD]$",
+                                               "^ST1W_D$")>;
+
+// Contiguous store two structures from two vectors, scalar + imm
+def : InstRW<[V3AEWrite_4c_1SA_1V], (instregex "^ST2[BHWD]_IMM$")>;
+
+// Contiguous store two structures from two vectors, scalar + scalar
+def : InstRW<[V3AEWrite_4c_2SA_2I_2V], (instrs ST2H)>;
+def : InstRW<[V3AEWrite_4c_2SA_2V], (instregex "^ST2[BWD]$")>;
+
+// Contiguous store three structures from three vectors, scalar + imm
+def : InstRW<[V3AEWrite_7c_9SA_9V], (instregex "^ST3[BHWD]_IMM$")>;
+
+// Contiguous store three structures from three vectors, scalar + scalar
+def : InstRW<[V3AEWrite_7c_9SA_9I_9V], (instregex "^ST3[BHWD]$")>;
+
+// Contiguous store four structures from four vectors, scalar + imm
+def : InstRW<[V3AEWrite_11c_18SA_18V], (instregex "^ST4[BHWD]_IMM$")>;
+
+// Contiguous store four structures from four vectors, scalar + scalar
+def : InstRW<[V3AEWrite_11c_18SA_18I_18V], (instregex "^ST4[BHWD]$")>;
+
+// Non temporal store, scalar + imm
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STNT1[BHWD]_ZRI$")>;
+
+// Non temporal store, scalar + scalar
+def : InstRW<[V3AEWrite_2c_1SA_1I_1V], (instrs STNT1H_ZRR)>;
+def : InstRW<[V3AEWrite_2c_1SA_1V], (instregex "^STNT1[BWD]_ZRR$")>;
+
+// Scatter non temporal store, vector + scalar 32-bit element size
+def : InstRW<[V3AEWrite_4c_4SA_4V], (instregex "^STNT1[BHW]_ZZR_S")>;
+
+// Scatter non temporal store, vector + scalar 64-bit element size
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^STNT1[BHWD]_ZZR_D")>;
+
+// Scatter store vector + imm 32-bit element size
+def : InstRW<[V3AEWrite_4c_4SA_4V], (instregex "^SST1[BH]_S_IMM$",
+                                               "^SST1W_IMM$")>;
+
+// Scatter store vector + imm 64-bit element size
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[BHW]_D_IMM$",
+                                               "^SST1D_IMM$")>;
+
+// Scatter store, 32-bit scaled offset
+def : InstRW<[V3AEWrite_4c_4SA_4V],
+             (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>;
+
+// Scatter store, 32-bit unpacked unscaled offset
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[BHW]_D_[SU]XTW$",
+                                               "^SST1D_[SU]XTW$")>;
+
+// Scatter store, 32-bit unpacked scaled offset
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
+                                               "^SST1D_[SU]XTW_SCALED$")>;
+
+// Scatter store, 32-bit unscaled offset
+def : InstRW<[V3AEWrite_4c_4SA_4V], (instregex "^SST1[BH]_S_[SU]XTW$",
+                                               "^SST1W_[SU]XTW$")>;
+
+// Scatter store, 64-bit scaled offset
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[HW]_D_SCALED$",
+                                               "^SST1D_SCALED$")>;
+
+// Scatter store, 64-bit unscaled offset
+def : InstRW<[V3AEWrite_2c_2SA_2V], (instregex "^SST1[BHW]_D$",
+                                               "^SST1D$")>;
+
+// §3.30 SVE Miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// Read first fault register, unpredicated
+def : InstRW<[V3AEWrite_2c_1M0], (instrs RDFFR_P)>;
+
+// Read first fault register, predicated
+def : InstRW<[V3AEWrite_3or4c_1M0_1M], (instrs RDFFR_PPz)>;
+
+// Read first fault register and set flags
+def : InstRW<[V3AEWrite_3or4c_1M0_1M], (instrs RDFFRS_PPz)>;
+
+// Set first fault register
+// Write to first fault register
+def : InstRW<[V3AEWrite_2c_1M0], (instrs SETFFR, WRFFR)>;
+
+// Prefetch
+// NOTE: This is not specified in the SOG.
+def : InstRW<[V3AEWrite_4c_1L], (instregex "^PRF[BHWD]")>;
+
+// §3.31 SVE Cryptographic instructions
+// -----------------------------------------------------------------------------
+
+// Crypto AES ops
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^AES[DE]_ZZZ_B$",
+                                         "^AESI?MC_ZZ_B$")>;
+
+// Crypto SHA3 ops
+def : InstRW<[V3AEWrite_2c_1V], (instregex "^(BCAX|EOR3)_ZZZZ$",
+                                         "^RAX1_ZZZ_D$",
+                                         "^XAR_ZZZI_[BHSD]$")>;
+
+// Crypto SM4 ops
+def : InstRW<[V3AEWrite_4c_1V0], (instregex "^SM4E(KEY)?_ZZZ_S$")>;
+
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 5b80b08..068954f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -764,8 +764,8 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
 }
 
 void AArch64PassConfig::addMachineSSAOptimization() {
-  if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None)
-    addPass(createMachineSMEABIPass());
+  if (TM->getOptLevel() != CodeGenOptLevel::None && EnableNewSMEABILowering)
+    addPass(createMachineSMEABIPass(TM->getOptLevel()));
 
   if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt)
     addPass(createSMEPeepholeOptPass());
@@ -798,7 +798,7 @@ bool AArch64PassConfig::addILPOpts() {
 
 void AArch64PassConfig::addPreRegAlloc() {
   if (TM->getOptLevel() == CodeGenOptLevel::None && EnableNewSMEABILowering)
-    addPass(createMachineSMEABIPass());
+    addPass(createMachineSMEABIPass(CodeGenOptLevel::None));
 
   // Change dead register definitions to refer to the zero register.
   if (TM->getOptLevel() != CodeGenOptLevel::None &&
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 5c3e26e..4cd51d6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -1114,7 +1114,6 @@ bool AArch64InstPrinter::printSyslAlias(const MCInst *MI,
   } else
     return false;
 
-  std::string Str;
   llvm::transform(Name, Name.begin(), ::tolower);
 
   O << '\t' << Ins << '\t' << Reg.str() << ", " << Name;
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 434ea67..7cb5003 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -121,8 +121,10 @@ struct InstInfo {
 /// Contains the needed ZA state for each instruction in a block. Instructions
 /// that do not require a ZA state are not recorded.
 struct BlockInfo {
-  ZAState FixedEntryState{ZAState::ANY};
   SmallVector<InstInfo> Insts;
+  ZAState FixedEntryState{ZAState::ANY};
+  ZAState DesiredIncomingState{ZAState::ANY};
+  ZAState DesiredOutgoingState{ZAState::ANY};
   LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
   LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
 };
@@ -175,10 +177,15 @@ private:
   Register AgnosticZABufferPtr = AArch64::NoRegister;
 };
 
+/// Checks if \p State is a legal edge bundle state. For a state to be a legal
+/// bundle state, it must be possible to transition from it to any other bundle
+/// state without losing any ZA state. This is the case for ACTIVE/LOCAL_SAVED,
+/// as you can transition between those states by saving/restoring ZA. The OFF
+/// state would not be legal, as transitioning to it drops the content of ZA.
 static bool isLegalEdgeBundleZAState(ZAState State) {
   switch (State) {
-  case ZAState::ACTIVE:
-  case ZAState::LOCAL_SAVED:
+  case ZAState::ACTIVE:      // ZA state within the accumulator/ZT0.
+  case ZAState::LOCAL_SAVED: // ZA state is saved on the stack.
     return true;
   default:
     return false;
@@ -238,7 +245,8 @@ getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
 struct MachineSMEABI : public MachineFunctionPass {
   inline static char ID = 0;
 
-  MachineSMEABI() : MachineFunctionPass(ID) {}
+  MachineSMEABI(CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
+      : MachineFunctionPass(ID), OptLevel(OptLevel) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -267,6 +275,11 @@ struct MachineSMEABI : public MachineFunctionPass {
                           const EdgeBundles &Bundles,
                           ArrayRef<ZAState> BundleStates);
 
+  /// Propagates desired states forwards (from predecessors -> successors) if
+  /// \p Forwards, otherwise, propagates backwards (from successors ->
+  /// predecessors).
+  void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true);
+
   // Emission routines for private and shared ZA functions (using lazy saves).
   void emitNewZAPrologue(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI);
@@ -335,12 +348,15 @@ struct MachineSMEABI : public MachineFunctionPass {
                          MachineBasicBlock::iterator MBBI, DebugLoc DL);
 
 private:
+  CodeGenOptLevel OptLevel = CodeGenOptLevel::Default;
+
   MachineFunction *MF = nullptr;
   const AArch64Subtarget *Subtarget = nullptr;
   const AArch64RegisterInfo *TRI = nullptr;
   const AArch64FunctionInfo *AFI = nullptr;
   const TargetInstrInfo *TII = nullptr;
   MachineRegisterInfo *MRI = nullptr;
+  MachineLoopInfo *MLI = nullptr;
 };
 
 static LiveRegs getPhysLiveRegs(LiveRegUnits const &LiveUnits) {
@@ -422,12 +438,69 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
 
     // Reverse vector (as we had to iterate backwards for liveness).
     std::reverse(Block.Insts.begin(), Block.Insts.end());
+
+    // Record the desired states on entry/exit of this block. These are the
+    // states that would not incur a state transition.
+    if (!Block.Insts.empty()) {
+      Block.DesiredIncomingState = Block.Insts.front().NeededState;
+      Block.DesiredOutgoingState = Block.Insts.back().NeededState;
+    }
   }
 
   return FunctionInfo{std::move(Blocks), AfterSMEProloguePt,
                       PhysLiveRegsAfterSMEPrologue};
 }
 
+void MachineSMEABI::propagateDesiredStates(FunctionInfo &FnInfo,
+                                           bool Forwards) {
+  // A forwards pass derives desired incoming states from the desired outgoing
+  // states of predecessors; a backwards pass derives desired outgoing states
+  // from the desired incoming states of successors.
+  auto StateOf = [](BlockInfo &Info, bool Incoming) -> ZAState & {
+    return Incoming ? Info.DesiredIncomingState : Info.DesiredOutgoingState;
+  };
+
+  // Seed the worklist with every block that lacks a legal state on that side.
+  SmallVector<MachineBasicBlock *> Pending;
+  for (auto [Number, Info] : enumerate(FnInfo.Blocks))
+    if (!isLegalEdgeBundleZAState(StateOf(Info, Forwards)))
+      Pending.push_back(MF->getBlockNumbered(Number));
+
+  while (!Pending.empty()) {
+    MachineBasicBlock *MBB = Pending.pop_back_val();
+    BlockInfo &Current = FnInfo.Blocks[MBB->getNumber()];
+
+    // Let each neighbour on the source side vote for its desired state; only
+    // legal edge bundle states are counted.
+    int Votes[ZAState::NUM_ZA_STATE] = {0};
+    for (MachineBasicBlock *Source :
+         Forwards ? predecessors(MBB) : successors(MBB)) {
+      ZAState Vote = StateOf(FnInfo.Blocks[Source->getNumber()], !Forwards);
+      if (isLegalEdgeBundleZAState(Vote))
+        Votes[Vote]++;
+    }
+
+    // The most popular state wins; max_element keeps the first maximum, so a
+    // tie (or no votes at all) resolves to the lowest-numbered state.
+    ZAState Winner = ZAState(max_element(Votes) - Votes);
+    ZAState &Mine = StateOf(Current, Forwards);
+    if (Winner == Mine)
+      continue;
+    Mine = Winner;
+    ZAState &Opposite = StateOf(Current, !Forwards);
+    // Also adopt the state on the opposite side if that is still "ANY".
+    if (Opposite == ZAState::ANY)
+      Opposite = Winner;
+    // Re-queue downstream blocks that still lack a legal state.
+    for (MachineBasicBlock *Sink :
+         Forwards ? successors(MBB) : predecessors(MBB)) {
+      if (!isLegalEdgeBundleZAState(
+              StateOf(FnInfo.Blocks[Sink->getNumber()], Forwards)))
+        Pending.push_back(Sink);
+    }
+  }
+}
+
 /// Assigns each edge bundle a ZA state based on the needed states of blocks
 /// that have incoming or outgoing edges in that bundle.
 SmallVector<ZAState>
@@ -440,40 +513,36 @@ MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles,
     // Attempt to assign a ZA state for this bundle that minimizes state
     // transitions. Edges within loops are given a higher weight as we assume
     // they will be executed more than once.
-    // TODO: We should propagate desired incoming/outgoing states through blocks
-    // that have the "ANY" state first to make better global decisions.
     int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
     for (unsigned BlockID : Bundles.getBlocks(I)) {
       LLVM_DEBUG(dbgs() << "- bb." << BlockID);
 
       const BlockInfo &Block = FnInfo.Blocks[BlockID];
-      if (Block.Insts.empty()) {
-        LLVM_DEBUG(dbgs() << " (no state preference)\n");
-        continue;
-      }
       bool InEdge = Bundles.getBundle(BlockID, /*Out=*/false) == I;
       bool OutEdge = Bundles.getBundle(BlockID, /*Out=*/true) == I;
 
-      ZAState DesiredIncomingState = Block.Insts.front().NeededState;
-      if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
-        EdgeStateCounts[DesiredIncomingState]++;
+      bool LegalInEdge =
+          InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState);
+      bool LegalOutEdge =
+          OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState);
+      if (LegalInEdge) {
         LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
-                          << getZAStateString(DesiredIncomingState));
+                          << getZAStateString(Block.DesiredIncomingState));
+        EdgeStateCounts[Block.DesiredIncomingState]++;
       }
-      ZAState DesiredOutgoingState = Block.Insts.back().NeededState;
-      if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
-        EdgeStateCounts[DesiredOutgoingState]++;
+      if (LegalOutEdge) {
         LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
-                          << getZAStateString(DesiredOutgoingState));
+                          << getZAStateString(Block.DesiredOutgoingState));
+        EdgeStateCounts[Block.DesiredOutgoingState]++;
       }
+      if (!LegalInEdge && !LegalOutEdge)
+        LLVM_DEBUG(dbgs() << " (no state preference)");
       LLVM_DEBUG(dbgs() << '\n');
     }
 
     ZAState BundleState =
         ZAState(max_element(EdgeStateCounts) - EdgeStateCounts);
 
-    // Force ZA to be active in bundles that don't have a preferred state.
-    // TODO: Something better here (to avoid extra mode switches).
     if (BundleState == ZAState::ANY)
       BundleState = ZAState::ACTIVE;
 
@@ -918,6 +987,43 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
       getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
 
   FunctionInfo FnInfo = collectNeededZAStates(SMEFnAttrs);
+
+  if (OptLevel != CodeGenOptLevel::None) {
+    // Propagate desired states forward, then backwards. Most of the propagation
+    // should be done in the forward step, and backwards propagation is then
+    // used to fill in the gaps. Note: Doing both in one step can give poor
+    // results. For example, consider this subgraph:
+    //
+    //    ┌─────┐
+    //  ┌─┤ BB0 ◄───┐
+    //  │ └─┬───┘   │
+    //  │ ┌─▼───◄──┐│
+    //  │ │ BB1 │  ││
+    //  │ └─┬┬──┘  ││
+    //  │   │└─────┘│
+    //  │ ┌─▼───┐   │
+    //  │ │ BB2 ├───┘
+    //  │ └─┬───┘
+    //  │ ┌─▼───┐
+    //  └─► BB3 │
+    //    └─────┘
+    //
+    // If:
+    // - "BB0" and "BB2" (outer loop) have no state preference
+    // - "BB1" (inner loop) desires the ACTIVE state on entry/exit
+    // - "BB3" desires the LOCAL_SAVED state on entry
+    //
+    // If we propagate forwards first, ACTIVE is propagated from BB1 to BB2,
+    // then from BB2 to BB0, which results in the inner and outer loops having
+    // the "ACTIVE" state. This avoids any state changes in the loops.
+    //
+    // If we propagate backwards first, we _could_ propagate LOCAL_SAVED from
+    // BB3 to BB0, which would result in a transition from ACTIVE -> LOCAL_SAVED
+    // in the outer loop.
+    for (bool Forwards : {true, false})
+      propagateDesiredStates(FnInfo, Forwards);
+  }
+
   SmallVector<ZAState> BundleStates = assignBundleZAStates(Bundles, FnInfo);
 
   EmitContext Context;
@@ -941,4 +1047,6 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
   return true;
 }
 
-FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); }
+FunctionPass *llvm::createMachineSMEABIPass(CodeGenOptLevel OptLevel) {
+  return new MachineSMEABI(OptLevel);
+}