Diffstat (limited to 'llvm/lib/Target/ARM')
42 files changed, 1046 insertions, 973 deletions
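A recurring pattern in the ARMAsmPrinter.cpp hunks below is replacing queries on a cached per-function Subtarget with queries on the module-level Triple, which the TargetMachine can answer even when no function is being printed. A minimal sketch of that distinction, assuming an LLVM development install to compile and link against (the usesTarget1Reloc helper is hypothetical; Triple::isOSBinFormatELF() is the real query the emitXXStructor hunk uses):

    #include "llvm/TargetParser/Triple.h"
    #include <cassert>

    // Object-format facts are a property of the module's target triple, not
    // of any particular function's subtarget, so they can be answered before
    // runOnMachineFunction() has ever set a subtarget up.
    static bool usesTarget1Reloc(const llvm::Triple &TT) {
      return TT.isOSBinFormatELF(); // ELF structor entries use the target1 specifier
    }

    int main() {
      assert(usesTarget1Reloc(llvm::Triple("armv7-unknown-linux-gnueabihf")));
      assert(!usesTarget1Reloc(llvm::Triple("thumbv7-apple-ios")));
    }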
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 570aae9..1f71d81 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -38,6 +38,14 @@ include "ARMSchedule.td"
 //===----------------------------------------------------------------------===//
 
 include "ARMInstrInfo.td"
+
+def Thumb1OnlyMode : HwMode<[IsThumb1Only]>;
+def arm_ptr_rc : RegClassByHwMode<
+    [DefaultMode, Thumb1OnlyMode],
+    [GPR, tGPR]>;
+
+defm : RemapAllTargetPseudoPointerOperands<arm_ptr_rc>;
+
 def ARMInstrInfo : InstrInfo;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 36b9908..458a3f0 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -51,8 +51,8 @@ using namespace llvm;
 
 ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM,
                              std::unique_ptr<MCStreamer> Streamer)
-    : AsmPrinter(TM, std::move(Streamer), ID), Subtarget(nullptr), AFI(nullptr),
-      MCP(nullptr), InConstantPool(false), OptimizationGoals(-1) {}
+    : AsmPrinter(TM, std::move(Streamer), ID), AFI(nullptr), MCP(nullptr),
+      InConstantPool(false), OptimizationGoals(-1) {}
 
 const ARMBaseTargetMachine &ARMAsmPrinter::getTM() const {
   return static_cast<const ARMBaseTargetMachine &>(TM);
@@ -97,11 +97,41 @@ void ARMAsmPrinter::emitXXStructor(const DataLayout &DL, const Constant *CV) {
 
   const MCExpr *E = MCSymbolRefExpr::create(
       GetARMGVSymbol(GV, ARMII::MO_NO_FLAG),
-      (Subtarget->isTargetELF() ? ARM::S_TARGET1 : ARM::S_None), OutContext);
+      (TM.getTargetTriple().isOSBinFormatELF() ? ARM::S_TARGET1 : ARM::S_None),
+      OutContext);
 
   OutStreamer->emitValue(E, Size);
 }
 
+// An alias to a CMSE entry function should also emit a `__acle_se_` symbol.
+void ARMAsmPrinter::emitCMSEVeneerAlias(const GlobalAlias &GA) {
+  const Function *BaseFn = dyn_cast_or_null<Function>(GA.getAliaseeObject());
+  if (!BaseFn || !BaseFn->hasFnAttribute("cmse_nonsecure_entry"))
+    return;
+
+  MCSymbol *AliasSym = getSymbol(&GA);
+  MCSymbol *FnSym = getSymbol(BaseFn);
+
+  MCSymbol *SEAliasSym =
+      OutContext.getOrCreateSymbol(Twine("__acle_se_") + AliasSym->getName());
+  MCSymbol *SEBaseSym =
+      OutContext.getOrCreateSymbol(Twine("__acle_se_") + FnSym->getName());
+
+  // Mirror alias linkage/visibility onto the veneer-alias symbol.
+  emitLinkage(&GA, SEAliasSym);
+  OutStreamer->emitSymbolAttribute(SEAliasSym, MCSA_ELF_TypeFunction);
+  emitVisibility(SEAliasSym, GA.getVisibility());
+
+  // Emit "__acle_se_<alias> = __acle_se_<aliasee>".
+  const MCExpr *SEExpr = MCSymbolRefExpr::create(SEBaseSym, OutContext);
+  OutStreamer->emitAssignment(SEAliasSym, SEExpr);
+}
+
+void ARMAsmPrinter::emitGlobalAlias(const Module &M, const GlobalAlias &GA) {
+  AsmPrinter::emitGlobalAlias(M, GA);
+  emitCMSEVeneerAlias(GA);
+}
+
 void ARMAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
   if (PromotedGlobals.count(GV))
     // The global was promoted into a constant pool. It should not be emitted.
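A rough illustration of what the new emitCMSEVeneerAlias() hook reacts to (the names are made up, and the resulting assembly is a sketch rather than verified compiler output): compiling something like the following with -mcmse for an ARMv8-M target now gives the alias its own __acle_se_-prefixed symbol, mirroring the one a CMSE entry function already receives.

    // Hypothetical translation unit; requires an ARMv8-M target and -mcmse.
    extern "C" __attribute__((cmse_nonsecure_entry)) int secure_read(int idx) {
      return idx + 1;
    }

    // An alias to the entry function. With this patch the asm printer also
    // emits
    //   __acle_se_secure_read_v1 = __acle_se_secure_read
    // so secure-gateway veneer generation treats the alias like its aliasee.
    extern "C" int secure_read_v1(int idx) __attribute__((alias("secure_read")));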
@@ -115,7 +145,6 @@ void ARMAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
 bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   AFI = MF.getInfo<ARMFunctionInfo>();
   MCP = MF.getConstantPool();
-  Subtarget = &MF.getSubtarget<ARMSubtarget>();
   SetupMachineFunction(MF);
   const Function &F = MF.getFunction();
@@ -153,7 +182,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   else if (OptimizationGoals != (int)OptimizationGoal) // conflicting goals
     OptimizationGoals = 0;
 
-  if (Subtarget->isTargetCOFF()) {
+  if (TM.getTargetTriple().isOSBinFormatCOFF()) {
     bool Local = F.hasLocalLinkage();
     COFF::SymbolStorageClass Scl =
         Local ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL;
@@ -259,8 +288,8 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     break;
   }
   case MachineOperand::MO_ConstantPoolIndex:
-    if (Subtarget->genExecuteOnly())
-      llvm_unreachable("execute-only should not generate constant pools");
+    assert(!MF->getSubtarget<ARMSubtarget>().genExecuteOnly() &&
+           "execute-only should not generate constant pools");
     GetCPISymbol(MO.getIndex())->print(O, MAI);
     break;
   }
@@ -595,8 +624,7 @@ void ARMAsmPrinter::emitEndOfAsmFile(Module &M) {
   ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
 
   if (OptimizationGoals > 0 &&
-      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
-       Subtarget->isTargetMuslAEABI()))
+      (TT.isTargetAEABI() || TT.isTargetGNUAEABI() || TT.isTargetMuslAEABI()))
     ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals);
   OptimizationGoals = -1;
@@ -884,9 +912,10 @@ static uint8_t getModifierSpecifier(ARMCP::ARMCPModifier Modifier) {
 
 MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
                                         unsigned char TargetFlags) {
-  if (Subtarget->isTargetMachO()) {
+  const Triple &TT = TM.getTargetTriple();
+  if (TT.isOSBinFormatMachO()) {
     bool IsIndirect =
-        (TargetFlags & ARMII::MO_NONLAZY) && Subtarget->isGVIndirectSymbol(GV);
+        (TargetFlags & ARMII::MO_NONLAZY) && getTM().isGVIndirectSymbol(GV);
 
     if (!IsIndirect)
       return getSymbol(GV);
@@ -903,9 +932,8 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
       StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
                                                    !GV->hasInternalLinkage());
     return MCSym;
-  } else if (Subtarget->isTargetCOFF()) {
-    assert(Subtarget->isTargetWindows() &&
-           "Windows is the only supported COFF target");
+  } else if (TT.isOSBinFormatCOFF()) {
+    assert(TT.isOSWindows() && "Windows is the only supported COFF target");
 
     bool IsIndirect =
         (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB));
@@ -932,7 +960,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
     }
 
     return MCSym;
-  } else if (Subtarget->isTargetELF()) {
+  } else if (TT.isOSBinFormatELF()) {
    return getSymbolPreferLocal(*GV);
   }
   llvm_unreachable("unexpected target");
@@ -978,7 +1006,8 @@ void ARMAsmPrinter::emitMachineConstantPoolValue(
 
     // On Darwin, const-pool entries may get the "FOO$non_lazy_ptr" mangling, so
    // flag the global as MO_NONLAZY.
-    unsigned char TF = Subtarget->isTargetMachO() ? ARMII::MO_NONLAZY : 0;
+    unsigned char TF =
+        TM.getTargetTriple().isOSBinFormatMachO() ? ARMII::MO_NONLAZY : 0;
     MCSym = GetARMGVSymbol(GV, TF);
   } else if (ACPV->isMachineBasicBlock()) {
     const MachineBasicBlock *MBB = cast<ARMConstantPoolMBB>(ACPV)->getMBB();
@@ -1047,7 +1076,8 @@ void ARMAsmPrinter::emitJumpTableAddrs(const MachineInstr *MI) {
     // .word (LBB1 - LJTI_0_0)
     const MCExpr *Expr = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
 
-    if (isPositionIndependent() || Subtarget->isROPI())
+    const ARMSubtarget &STI = MF->getSubtarget<ARMSubtarget>();
+    if (isPositionIndependent() || STI.isROPI())
       Expr = MCBinaryExpr::createSub(Expr,
                                      MCSymbolRefExpr::create(JTISymbol, OutContext),
                                      OutContext);
@@ -1096,7 +1126,8 @@ void ARMAsmPrinter::emitJumpTableTBInst(const MachineInstr *MI,
   const MachineOperand &MO1 = MI->getOperand(1);
   unsigned JTI = MO1.getIndex();
 
-  if (Subtarget->isThumb1Only())
+  const ARMSubtarget &STI = MF->getSubtarget<ARMSubtarget>();
+  if (STI.isThumb1Only())
     emitAlignment(Align(4));
 
   MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
@@ -1904,6 +1935,7 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
   ARM_MC::verifyInstructionPredicates(MI->getOpcode(),
                                       getSubtargetInfo().getFeatureBits());
 
+  const ARMSubtarget &STI = MF->getSubtarget<ARMSubtarget>();
   const DataLayout &DL = getDataLayout();
   MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
   ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
@@ -1915,8 +1947,8 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
   }
 
   // Emit unwinding stuff for frame-related instructions
-  if (Subtarget->isTargetEHABICompatible() &&
-      MI->getFlag(MachineInstr::FrameSetup))
+  if (TM.getTargetTriple().isTargetEHABICompatible() &&
+      MI->getFlag(MachineInstr::FrameSetup))
     EmitUnwindingInstruction(MI);
 
   // Do any auto-generated pseudo lowerings.
@@ -1982,14 +2014,13 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
                                      // Add 's' bit operand (always reg0 for this)
                                      .addReg(0));
 
-    assert(Subtarget->hasV4TOps());
-    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::BX)
-                                     .addReg(MI->getOperand(0).getReg()));
+    assert(STI.hasV4TOps() && "Expected V4TOps for BX call");
+    EmitToStreamer(*OutStreamer,
+                   MCInstBuilder(ARM::BX).addReg(MI->getOperand(0).getReg()));
     return;
   }
   case ARM::tBX_CALL: {
-    if (Subtarget->hasV5TOps())
-      llvm_unreachable("Expected BLX to be selected for v5t+");
+    assert(!STI.hasV5TOps() && "Expected BLX to be selected for v5t+");
 
     // On ARM v4t, when doing a call from thumb mode, we need to ensure
     // that the saved lr has its LSB set correctly (the arch doesn't
@@ -2278,8 +2309,8 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
     return;
   }
   case ARM::CONSTPOOL_ENTRY: {
-    if (Subtarget->genExecuteOnly())
-      llvm_unreachable("execute-only should not generate constant pools");
+    assert(!STI.genExecuteOnly() &&
+           "execute-only should not generate constant pools");
 
     /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
    /// in the function. The first operand is the ID# for this instruction, the
@@ -2485,7 +2516,7 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case ARM::TRAP: {
     // Non-Darwin binutils don't yet support the "trap" mnemonic.
     // FIXME: Remove this special case when they do.
-    if (!Subtarget->isTargetMachO()) {
+    if (!TM.getTargetTriple().isOSBinFormatMachO()) {
       uint32_t Val = 0xe7ffdefeUL;
       OutStreamer->AddComment("trap");
       ATS.emitInst(Val);
@@ -2496,7 +2527,7 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case ARM::tTRAP: {
     // Non-Darwin binutils don't yet support the "trap" mnemonic.
     // FIXME: Remove this special case when they do.
-    if (!Subtarget->isTargetMachO()) {
+    if (!TM.getTargetTriple().isOSBinFormatMachO()) {
       uint16_t Val = 0xdefe;
       OutStreamer->AddComment("trap");
       ATS.emitInst(Val, 'n');
@@ -2656,9 +2687,6 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
                                        .addImm(ARMCC::AL)
                                        .addReg(0));
 
-    const MachineFunction &MF = *MI->getParent()->getParent();
-    const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
-
     if (STI.isTargetDarwin() || STI.isTargetWindows()) {
       // These platforms always use the same frame register
       EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12)
@@ -2687,7 +2715,7 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
                                        .addReg(0));
     }
 
-    assert(Subtarget->hasV4TOps());
+    assert(STI.hasV4TOps());
     EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::BX)
                                      .addReg(ScratchReg)
                                      // Predicate.
@@ -2704,9 +2732,6 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
     Register SrcReg = MI->getOperand(0).getReg();
     Register ScratchReg = MI->getOperand(1).getReg();
 
-    const MachineFunction &MF = *MI->getParent()->getParent();
-    const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
-
     EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
                                      .addReg(ScratchReg)
                                      .addReg(SrcReg)
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.h b/llvm/lib/Target/ARM/ARMAsmPrinter.h
index 9e92b5a..12e20d7 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.h
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -9,13 +9,13 @@
 #ifndef LLVM_LIB_TARGET_ARM_ARMASMPRINTER_H
 #define LLVM_LIB_TARGET_ARM_ARMASMPRINTER_H
 
-#include "ARMSubtarget.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
 
 class ARMFunctionInfo;
+class ARMBaseTargetMachine;
 class MCOperand;
 class MachineConstantPool;
 class MachineOperand;
@@ -33,10 +33,6 @@ public:
   static char ID;
 
 private:
-  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
-  /// make the right decision when printing asm code for different targets.
-  const ARMSubtarget *Subtarget;
-
   /// AFI - Keep a pointer to ARMFunctionInfo for the current
   /// MachineFunction.
   ARMFunctionInfo *AFI;
@@ -108,6 +104,7 @@ public:
   void emitEndOfAsmFile(Module &M) override;
   void emitXXStructor(const DataLayout &DL, const Constant *CV) override;
   void emitGlobalVariable(const GlobalVariable *GV) override;
+  void emitGlobalAlias(const Module &M, const GlobalAlias &GA) override;
 
   MCSymbol *GetCPISymbol(unsigned CPID) const override;
@@ -163,6 +160,8 @@ private:
 
   MCSymbol *GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags);
 
+  void emitCMSEVeneerAlias(const GlobalAlias &GA);
+
 public:
   /// EmitMachineConstantPoolValue - Print a machine constantpool value to
   /// the .s file.
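The ARMBaseInstrInfo changes that follow thread the register info into the generated ARMGenInstrInfo base, turning getRegisterInfo() into a plain accessor instead of a pure-virtual hook, which in turn lets helpers such as AddDReg drop the TargetRegisterInfo parameter at every call site. A minimal sketch of the pattern with simplified stand-in types (none of these are the actual LLVM classes):

    #include <cassert>

    struct RegInfo { // stand-in for ARMBaseRegisterInfo
      unsigned getSubReg(unsigned Reg, unsigned Idx) const { return Reg * 8 + Idx; }
    };

    class InstrInfoBase { // stand-in for the TargetInstrInfo base
      const RegInfo &RI;
    public:
      explicit InstrInfoBase(const RegInfo &RI) : RI(RI) {}
      // Stored once at construction, so derived helpers no longer need a
      // register-info argument threaded through every call.
      const RegInfo &getRegisterInfo() const { return RI; }
    };

    class ARMishInstrInfo : public InstrInfoBase {
    public:
      using InstrInfoBase::InstrInfoBase;
      unsigned addDReg(unsigned Reg, unsigned SubIdx) const {
        return getRegisterInfo().getSubReg(Reg, SubIdx); // was: TRI->getSubReg
      }
    };

    int main() {
      RegInfo RI;
      ARMishInstrInfo TII(RI);
      assert(TII.addDReg(2, 1) == 17);
    }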
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 22769db..02887ce 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -107,8 +107,9 @@ static const ARM_MLxEntry ARM_MLxTable[] = {
   { ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true },
 };
 
-ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI)
-    : ARMGenInstrInfo(STI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI,
+                                   const ARMBaseRegisterInfo &TRI)
+    : ARMGenInstrInfo(STI, TRI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
       Subtarget(STI) {
   for (unsigned i = 0, e = std::size(ARM_MLxTable); i != e; ++i) {
     if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second)
@@ -928,15 +929,15 @@ ARMBaseInstrInfo::describeLoadedValue(const MachineInstr &MI,
   return TargetInstrInfo::describeLoadedValue(MI, Reg);
 }
 
-const MachineInstrBuilder &
-ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
-                          unsigned SubIdx, unsigned State,
-                          const TargetRegisterInfo *TRI) const {
+const MachineInstrBuilder &ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB,
+                                                     unsigned Reg,
+                                                     unsigned SubIdx,
+                                                     unsigned State) const {
   if (!SubIdx)
     return MIB.addReg(Reg, State);
 
   if (Register::isPhysicalRegister(Reg))
-    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
+    return MIB.addReg(getRegisterInfo().getSubReg(Reg, SubIdx), State);
   return MIB.addReg(Reg, State, SubIdx);
 }
 
@@ -944,18 +945,18 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator I,
                                            Register SrcReg, bool isKill, int FI,
                                            const TargetRegisterClass *RC,
-                                           const TargetRegisterInfo *TRI,
                                            Register VReg,
                                            MachineInstr::MIFlag Flags) const {
   MachineFunction &MF = *MBB.getParent();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   Align Alignment = MFI.getObjectAlign(FI);
+  const ARMBaseRegisterInfo &TRI = getRegisterInfo();
 
   MachineMemOperand *MMO = MF.getMachineMemOperand(
       MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
       MFI.getObjectSize(FI), Alignment);
 
-  switch (TRI->getSpillSize(*RC)) {
+  switch (TRI.getSpillSize(*RC)) {
  case 2:
     if (ARM::HPRRegClass.hasSubClassEq(RC)) {
       BuildMI(MBB, I, DebugLoc(), get(ARM::VSTRH))
@@ -1010,8 +1011,8 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
       if (Subtarget.hasV5TEOps()) {
         MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::STRD));
-        AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
-        AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+        AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill));
+        AddDReg(MIB, SrcReg, ARM::gsub_1, 0);
         MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)
            .add(predOps(ARMCC::AL));
       } else {
@@ -1021,8 +1022,8 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
             .addFrameIndex(FI)
             .addMemOperand(MMO)
             .add(predOps(ARMCC::AL));
-        AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
-        AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+        AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill));
+        AddDReg(MIB, SrcReg, ARM::gsub_1, 0);
       }
     } else
       llvm_unreachable("Unknown reg class!");
@@ -1072,9 +1073,9 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
               .addFrameIndex(FI)
               .add(predOps(ARMCC::AL))
               .addMemOperand(MMO);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
-      AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+      MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill));
+      MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0);
+      AddDReg(MIB, SrcReg, ARM::dsub_2, 0);
     }
   } else
     llvm_unreachable("Unknown reg class!");
@@ -1104,10 +1105,10 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
               .addFrameIndex(FI)
               .add(predOps(ARMCC::AL))
               .addMemOperand(MMO);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
-      AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+      MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill));
+      MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0);
+      MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0);
+      AddDReg(MIB, SrcReg, ARM::dsub_3, 0);
     }
   } else
     llvm_unreachable("Unknown reg class!");
@@ -1124,14 +1125,14 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
           .addFrameIndex(FI)
           .add(predOps(ARMCC::AL))
           .addMemOperand(MMO);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI);
-      AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI);
+      MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill));
+      MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0);
+      MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0);
+      MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0);
+      MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0);
+      MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0);
+      MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0);
+      AddDReg(MIB, SrcReg, ARM::dsub_7, 0);
     } else
       llvm_unreachable("Unknown reg class!");
     break;
@@ -1207,10 +1208,12 @@ Register ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
   return false;
 }
 
-void ARMBaseInstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void ARMBaseInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator I,
+                                            Register DestReg, int FI,
+                                            const TargetRegisterClass *RC,
+                                            Register VReg,
+                                            MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
   if (I != MBB.end()) DL = I->getDebugLoc();
   MachineFunction &MF = *MBB.getParent();
@@ -1220,7 +1223,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(
       MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
       MFI.getObjectSize(FI), Alignment);
 
-  switch (TRI->getSpillSize(*RC)) {
+  const ARMBaseRegisterInfo &TRI = getRegisterInfo();
+  switch (TRI.getSpillSize(*RC)) {
   case 2:
     if (ARM::HPRRegClass.hasSubClassEq(RC)) {
       BuildMI(MBB, I, DL, get(ARM::VLDRH), DestReg)
@@ -1271,8 +1275,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(
 
     if (Subtarget.hasV5TEOps()) {
       MIB = BuildMI(MBB, I, DL, get(ARM::LDRD));
-      AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
-      AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+      AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead);
+      AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead);
       MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)
          .add(predOps(ARMCC::AL));
     } else {
@@ -1282,8 +1286,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(
           .addFrameIndex(FI)
           .addMemOperand(MMO)
           .add(predOps(ARMCC::AL));
-      MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead);
     }
 
     if (DestReg.isPhysical())
@@ -1329,9 +1333,9 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(
           .addFrameIndex(FI)
           .addMemOperand(MMO)
           .add(predOps(ARMCC::AL));
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead);
       if (DestReg.isPhysical())
         MIB.addReg(DestReg, RegState::ImplicitDefine);
     }
@@ -1358,10 +1362,10 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(
           .addFrameIndex(FI)
           .add(predOps(ARMCC::AL))
           .addMemOperand(MMO);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead);
       if (DestReg.isPhysical())
         MIB.addReg(DestReg, RegState::ImplicitDefine);
     }
@@ -1379,14 +1383,14 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(
           .addFrameIndex(FI)
           .add(predOps(ARMCC::AL))
           .addMemOperand(MMO);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead);
       if (DestReg.isPhysical())
         MIB.addReg(DestReg, RegState::ImplicitDefine);
     } else
@@ -1652,8 +1656,7 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
 void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I,
                                      Register DestReg, unsigned SubIdx,
-                                     const MachineInstr &Orig,
-                                     const TargetRegisterInfo &TRI) const {
+                                     const MachineInstr &Orig) const {
   unsigned Opcode = Orig.getOpcode();
   switch (Opcode) {
   default: {
@@ -6548,7 +6551,7 @@ class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
   static int constexpr LAST_IS_USE = MAX_STAGES;
   static int constexpr SEEN_AS_LIVE = MAX_STAGES + 1;
   typedef std::bitset<MAX_STAGES + 2> IterNeed;
-  typedef std::map<unsigned, IterNeed> IterNeeds;
+  typedef std::map<Register, IterNeed> IterNeeds;
 
   void bumpCrossIterationPressure(RegPressureTracker &RPT,
                                   const IterNeeds &CIN);
@@ -6622,14 +6625,14 @@ void ARMPipelinerLoopInfo::bumpCrossIterationPressure(RegPressureTracker &RPT,
   for (const auto &N : CIN) {
     int Cnt = N.second.count() - N.second[SEEN_AS_LIVE] * 2;
     for (int I = 0; I < Cnt; ++I)
-      RPT.increaseRegPressure(Register(N.first), LaneBitmask::getNone(),
+      RPT.increaseRegPressure(VirtRegOrUnit(N.first), LaneBitmask::getNone(),
                               LaneBitmask::getAll());
   }
   // Decrease pressure by the amounts in CrossIterationNeeds
   for (const auto &N : CIN) {
     int Cnt = N.second.count() - N.second[SEEN_AS_LIVE] * 2;
     for (int I = 0; I < Cnt; ++I)
-      RPT.decreaseRegPressure(Register(N.first), LaneBitmask::getAll(),
+      RPT.decreaseRegPressure(VirtRegOrUnit(N.first), LaneBitmask::getAll(),
                               LaneBitmask::getNone());
   }
 }
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 2869e7f..04e2ab0 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -44,7 +44,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
 
 protected:
   // Can be only subclassed.
-  explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
+  explicit ARMBaseInstrInfo(const ARMSubtarget &STI,
+                            const ARMBaseRegisterInfo &TRI);
 
   void expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
                                 unsigned LoadImmOpc, unsigned LoadOpc) const;
@@ -125,7 +126,11 @@ public:
   // if there is not such an opcode.
   virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0;
 
-  virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
+  const ARMBaseRegisterInfo &getRegisterInfo() const {
+    return static_cast<const ARMBaseRegisterInfo &>(
+        TargetInstrInfo::getRegisterInfo());
+  }
+
   const ARMSubtarget &getSubtarget() const { return Subtarget; }
 
   ScheduleHazardRecognizer *
@@ -211,14 +216,13 @@ public:
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
       int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
@@ -227,16 +231,14 @@ public:
 
   void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                      Register DestReg, unsigned SubIdx,
-                     const MachineInstr &Orig,
-                     const TargetRegisterInfo &TRI) const override;
+                     const MachineInstr &Orig) const override;
 
   MachineInstr &
   duplicate(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
             const MachineInstr &Orig) const override;
 
   const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
-                                     unsigned SubIdx, unsigned State,
-                                     const TargetRegisterInfo *TRI) const;
+                                     unsigned SubIdx, unsigned State) const;
 
   bool produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1,
                         const MachineRegisterInfo *MRI) const override;
diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index ce1cdb3..80921ce 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -708,7 +708,7 @@ ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const MCInstrDesc &MCID = TII.get(ADDriOpc);
   Register BaseReg = MRI.createVirtualRegister(&ARM::GPRRegClass);
-  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this));
+  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0));
 
   MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg)
     .addFrameIndex(FrameIdx).addImm(Offset);
@@ -881,8 +881,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   Register PredReg = (PIdx == -1) ? Register() : MI.getOperand(PIdx+1).getReg();
 
   const MCInstrDesc &MCID = MI.getDesc();
-  const TargetRegisterClass *RegClass =
-      TII.getRegClass(MCID, FIOperandNum, this);
+  const TargetRegisterClass *RegClass = TII.getRegClass(MCID, FIOperandNum);
 
   if (Offset == 0 && (FrameReg.isVirtual() || RegClass->contains(FrameReg)))
     // Must be addrmode4/6.
diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index f43ec73..80494d9 100644
--- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -51,7 +51,6 @@
 #include <cassert>
 #include <cstdint>
 #include <iterator>
-#include <utility>
 #include <vector>
 
 using namespace llvm;
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index fffb6373..d69c09f 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -932,6 +932,7 @@ static bool IsAnAddressOperand(const MachineOperand &MO) {
     return true;
   case MachineOperand::MO_RegisterMask:
   case MachineOperand::MO_RegisterLiveOut:
+  case MachineOperand::MO_LaneMask:
     return false;
   case MachineOperand::MO_Metadata:
   case MachineOperand::MO_MCSymbol:
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 138981a..c19eed1 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -2342,7 +2342,6 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
   const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   const ARMBaseInstrInfo &TII =
       *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   unsigned Limit = (1 << 12) - 1;
   for (auto &MBB : MF) {
     for (auto &MI : MBB) {
@@ -2364,7 +2363,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
           break;
 
         const MCInstrDesc &MCID = MI.getDesc();
-        const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI);
+        const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i);
         if (RegClass && !RegClass->contains(ARM::SP))
           HasNonSPFrameIndex = true;
 
@@ -2537,7 +2536,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   (void)TRI; // Silence unused warning in non-assert builds.
-  Register FramePtr = RegInfo->getFrameRegister(MF);
+  Register FramePtr = STI.getFramePointerReg();
 
   ARMSubtarget::PushPopSplitVariation PushPopSplit =
       STI.getPushPopSplitVariation(MF);
@@ -2784,7 +2783,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
       !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
     AFI->setHasStackFrame(true);
 
-    if (HasFP) {
+    // Save the FP if:
+    // 1. We currently need it (HasFP), OR
+    // 2. We might need it later due to stack realignment from aligned DPRCS2
+    //    saves (which will make hasFP() become true in emitPrologue).
+    if (HasFP || (isFPReserved(MF) && AFI->getNumAlignedDPRCS2Regs() > 0)) {
       SavedRegs.set(FramePtr);
       // If the frame pointer is required by the ABI, also spill LR so that we
       // emit a complete frame record.
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 847b7af..26b5e5a 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -3965,31 +3965,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
       return;
     // Other cases are autogenerated.
     break;
-  case ARMISD::WLSSETUP: {
-    SDNode *New = CurDAG->getMachineNode(ARM::t2WhileLoopSetup, dl, MVT::i32,
-                                         N->getOperand(0));
-    ReplaceUses(N, New);
-    CurDAG->RemoveDeadNode(N);
-    return;
-  }
-  case ARMISD::WLS: {
-    SDNode *New = CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl, MVT::Other,
-                                         N->getOperand(1), N->getOperand(2),
-                                         N->getOperand(0));
-    ReplaceUses(N, New);
-    CurDAG->RemoveDeadNode(N);
-    return;
-  }
-  case ARMISD::LE: {
-    SDValue Ops[] = { N->getOperand(1),
-                      N->getOperand(2),
-                      N->getOperand(0) };
-    unsigned Opc = ARM::t2LoopEnd;
-    SDNode *New = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
-    ReplaceUses(N, New);
-    CurDAG->RemoveDeadNode(N);
-    return;
-  }
   case ARMISD::LDRD: {
     if (Subtarget->isThumb2())
       break; // TableGen handles isel in this case.
@@ -4043,17 +4018,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     CurDAG->RemoveDeadNode(N);
     return;
   }
-  case ARMISD::LOOP_DEC: {
-    SDValue Ops[] = { N->getOperand(1),
-                      N->getOperand(2),
-                      N->getOperand(0) };
-    SDNode *Dec =
-        CurDAG->getMachineNode(ARM::t2LoopDec, dl,
-                               CurDAG->getVTList(MVT::i32, MVT::Other), Ops);
-    ReplaceUses(N, Dec);
-    CurDAG->RemoveDeadNode(N);
-    return;
-  }
   case ARMISD::BRCOND: {
     // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
     // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 92fae71..2d26c67 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -508,7 +508,7 @@ const ARMBaseTargetMachine &ARMTargetLowering::getTM() const {
 
 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
                                      const ARMSubtarget &STI)
-    : TargetLowering(TM_), Subtarget(&STI),
+    : TargetLowering(TM_, STI), Subtarget(&STI),
      RegInfo(Subtarget->getRegisterInfo()),
      Itins(Subtarget->getInstrItineraryData()) {
   const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
@@ -518,74 +518,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
 
   const Triple &TT = TM.getTargetTriple();
 
-  if (TT.isOSBinFormatMachO()) {
-    // Uses VFP for Thumb libfuncs if available.
-    if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
-        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
-      // clang-format off
-      static const struct {
-        const RTLIB::Libcall Op;
-        const RTLIB::LibcallImpl Impl;
-      } LibraryCalls[] = {
-        // Single-precision floating-point arithmetic.
-        { RTLIB::ADD_F32, RTLIB::impl___addsf3vfp },
-        { RTLIB::SUB_F32, RTLIB::impl___subsf3vfp },
-        { RTLIB::MUL_F32, RTLIB::impl___mulsf3vfp },
-        { RTLIB::DIV_F32, RTLIB::impl___divsf3vfp },
-
-        // Double-precision floating-point arithmetic.
-        { RTLIB::ADD_F64, RTLIB::impl___adddf3vfp },
-        { RTLIB::SUB_F64, RTLIB::impl___subdf3vfp },
-        { RTLIB::MUL_F64, RTLIB::impl___muldf3vfp },
-        { RTLIB::DIV_F64, RTLIB::impl___divdf3vfp },
-
-        // Single-precision comparisons.
-        { RTLIB::OEQ_F32, RTLIB::impl___eqsf2vfp },
-        { RTLIB::UNE_F32, RTLIB::impl___nesf2vfp },
-        { RTLIB::OLT_F32, RTLIB::impl___ltsf2vfp },
-        { RTLIB::OLE_F32, RTLIB::impl___lesf2vfp },
-        { RTLIB::OGE_F32, RTLIB::impl___gesf2vfp },
-        { RTLIB::OGT_F32, RTLIB::impl___gtsf2vfp },
-        { RTLIB::UO_F32, RTLIB::impl___unordsf2vfp },
-
-        // Double-precision comparisons.
-        { RTLIB::OEQ_F64, RTLIB::impl___eqdf2vfp },
-        { RTLIB::UNE_F64, RTLIB::impl___nedf2vfp },
-        { RTLIB::OLT_F64, RTLIB::impl___ltdf2vfp },
-        { RTLIB::OLE_F64, RTLIB::impl___ledf2vfp },
-        { RTLIB::OGE_F64, RTLIB::impl___gedf2vfp },
-        { RTLIB::OGT_F64, RTLIB::impl___gtdf2vfp },
-        { RTLIB::UO_F64, RTLIB::impl___unorddf2vfp },
-
-        // Floating-point to integer conversions.
-        // i64 conversions are done via library routines even when generating VFP
-        // instructions, so use the same ones.
-        { RTLIB::FPTOSINT_F64_I32, RTLIB::impl___fixdfsivfp },
-        { RTLIB::FPTOUINT_F64_I32, RTLIB::impl___fixunsdfsivfp },
-        { RTLIB::FPTOSINT_F32_I32, RTLIB::impl___fixsfsivfp },
-        { RTLIB::FPTOUINT_F32_I32, RTLIB::impl___fixunssfsivfp },
-
-        // Conversions between floating types.
-        { RTLIB::FPROUND_F64_F32, RTLIB::impl___truncdfsf2vfp },
-        { RTLIB::FPEXT_F32_F64, RTLIB::impl___extendsfdf2vfp },
-
-        // Integer to floating-point conversions.
-        // i64 conversions are done via library routines even when generating VFP
-        // instructions, so use the same ones.
-        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
-        // e.g., __floatunsidf vs. __floatunssidfvfp.
-        { RTLIB::SINTTOFP_I32_F64, RTLIB::impl___floatsidfvfp },
-        { RTLIB::UINTTOFP_I32_F64, RTLIB::impl___floatunssidfvfp },
-        { RTLIB::SINTTOFP_I32_F32, RTLIB::impl___floatsisfvfp },
-        { RTLIB::UINTTOFP_I32_F32, RTLIB::impl___floatunssisfvfp },
-      };
-      // clang-format on
-
-      for (const auto &LC : LibraryCalls)
-        setLibcallImpl(LC.Op, LC.Impl);
-    }
-  }
-
   if (Subtarget->isThumb1Only())
     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
   else
@@ -614,16 +546,24 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
       for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
                       ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
         setOperationAction(Op, MVT::f64, Legal);
+
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
     }
   }
 
   if (Subtarget->hasFullFP16()) {
+    for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
+                    ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
+      setOperationAction(Op, MVT::f16, Legal);
+
     addRegisterClass(MVT::f16, &ARM::HPRRegClass);
     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
     setOperationAction(ISD::BITCAST, MVT::f16, Custom);
 
     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+    setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Legal);
+    setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Legal);
   }
 
   if (Subtarget->hasBF16()) {
@@ -933,13 +873,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
     setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
     setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
   }
 
+  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+
   if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
     setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
@@ -947,11 +888,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
       setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
     }
+  } else {
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
   }
 
   if (!Subtarget->hasFP16()) {
     setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
+  } else {
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal);
   }
 
   computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -1291,16 +1237,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
     setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
     setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
-    setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, LibCall);
-    setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, LibCall);
+    setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, Expand);
+    setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, Expand);
   }
 
   // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
   if (!Subtarget->hasFP16()) {
     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
-    setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, LibCall);
-    setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, LibCall);
+    setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, Expand);
+    setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, Expand);
   }
 
   // Strict floating-point comparisons need custom lowering.
@@ -1316,34 +1262,26 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 
   // FP-ARMv8 implements a lot of rounding-like FP operations.
-  if (Subtarget->hasFPARMv8Base()) {
-    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
-    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
-    setOperationAction(ISD::FROUND, MVT::f32, Legal);
-    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
-    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
-    setOperationAction(ISD::FRINT, MVT::f32, Legal);
-    setOperationAction(ISD::FROUNDEVEN, MVT::f32, Legal);
-    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
-    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+  if (Subtarget->hasFPARMv8Base()) {
+    for (auto Op :
+         {ISD::FFLOOR, ISD::FCEIL, ISD::FROUND,
+          ISD::FTRUNC, ISD::FNEARBYINT, ISD::FRINT,
+          ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
+          ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FROUND,
+          ISD::STRICT_FTRUNC, ISD::STRICT_FNEARBYINT, ISD::STRICT_FRINT,
+          ISD::STRICT_FROUNDEVEN, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM}) {
+      setOperationAction(Op, MVT::f32, Legal);
+
+      if (Subtarget->hasFP64())
+        setOperationAction(Op, MVT::f64, Legal);
+    }
+
     if (Subtarget->hasNEON()) {
       setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
       setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
       setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
       setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
     }
-
-    if (Subtarget->hasFP64()) {
-      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
-      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
-      setOperationAction(ISD::FROUND, MVT::f64, Legal);
-      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
-      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
-      setOperationAction(ISD::FRINT, MVT::f64, Legal);
-      setOperationAction(ISD::FROUNDEVEN, MVT::f64, Legal);
-      setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
-      setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
-    }
   }
 
   // FP16 often need to be promoted to call lib functions
@@ -1498,6 +1436,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
         Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
 
   setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
+
+  IsStrictFPEnabled = true;
 }
 
 bool ARMTargetLowering::useSoftFloat() const {
@@ -1556,220 +1496,6 @@ ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
   return std::make_pair(RRC, Cost);
 }
 
-const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
-#define MAKE_CASE(V) \
-  case V: \
-    return #V;
-  switch ((ARMISD::NodeType)Opcode) {
-  case ARMISD::FIRST_NUMBER:
-    break;
-    MAKE_CASE(ARMISD::Wrapper)
-    MAKE_CASE(ARMISD::WrapperPIC)
-    MAKE_CASE(ARMISD::WrapperJT)
-    MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL)
-    MAKE_CASE(ARMISD::CALL)
-    MAKE_CASE(ARMISD::CALL_PRED)
-    MAKE_CASE(ARMISD::CALL_NOLINK)
-    MAKE_CASE(ARMISD::tSECALL)
-    MAKE_CASE(ARMISD::t2CALL_BTI)
-    MAKE_CASE(ARMISD::BRCOND)
-    MAKE_CASE(ARMISD::BR_JT)
-    MAKE_CASE(ARMISD::BR2_JT)
-    MAKE_CASE(ARMISD::RET_GLUE)
-    MAKE_CASE(ARMISD::SERET_GLUE)
-    MAKE_CASE(ARMISD::INTRET_GLUE)
-    MAKE_CASE(ARMISD::PIC_ADD)
-    MAKE_CASE(ARMISD::CMP)
-    MAKE_CASE(ARMISD::CMN)
-    MAKE_CASE(ARMISD::CMPZ)
-    MAKE_CASE(ARMISD::CMPFP)
-    MAKE_CASE(ARMISD::CMPFPE)
-    MAKE_CASE(ARMISD::CMPFPw0)
-    MAKE_CASE(ARMISD::CMPFPEw0)
-    MAKE_CASE(ARMISD::BCC_i64)
-    MAKE_CASE(ARMISD::FMSTAT)
-    MAKE_CASE(ARMISD::CMOV)
-    MAKE_CASE(ARMISD::SSAT)
-    MAKE_CASE(ARMISD::USAT)
-    MAKE_CASE(ARMISD::ASRL)
-    MAKE_CASE(ARMISD::LSRL)
-    MAKE_CASE(ARMISD::LSLL)
-    MAKE_CASE(ARMISD::LSLS)
-    MAKE_CASE(ARMISD::LSRS1)
-    MAKE_CASE(ARMISD::ASRS1)
-    MAKE_CASE(ARMISD::RRX)
-    MAKE_CASE(ARMISD::ADDC)
-    MAKE_CASE(ARMISD::ADDE)
-    MAKE_CASE(ARMISD::SUBC)
-    MAKE_CASE(ARMISD::SUBE)
-    MAKE_CASE(ARMISD::VMOVRRD)
-    MAKE_CASE(ARMISD::VMOVDRR)
-    MAKE_CASE(ARMISD::VMOVhr)
-    MAKE_CASE(ARMISD::VMOVrh)
-    MAKE_CASE(ARMISD::VMOVSR)
-    MAKE_CASE(ARMISD::EH_SJLJ_SETJMP)
-    MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP)
-    MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH)
-    MAKE_CASE(ARMISD::TC_RETURN)
-    MAKE_CASE(ARMISD::THREAD_POINTER)
-    MAKE_CASE(ARMISD::DYN_ALLOC)
-    MAKE_CASE(ARMISD::MEMBARRIER_MCR)
-    MAKE_CASE(ARMISD::PRELOAD)
-    MAKE_CASE(ARMISD::LDRD)
-    MAKE_CASE(ARMISD::STRD)
-    MAKE_CASE(ARMISD::WIN__CHKSTK)
-    MAKE_CASE(ARMISD::WIN__DBZCHK)
-    MAKE_CASE(ARMISD::PREDICATE_CAST)
-    MAKE_CASE(ARMISD::VECTOR_REG_CAST)
-    MAKE_CASE(ARMISD::MVESEXT)
-    MAKE_CASE(ARMISD::MVEZEXT)
-    MAKE_CASE(ARMISD::MVETRUNC)
-    MAKE_CASE(ARMISD::VCMP)
-    MAKE_CASE(ARMISD::VCMPZ)
-    MAKE_CASE(ARMISD::VTST)
-    MAKE_CASE(ARMISD::VSHLs)
-    MAKE_CASE(ARMISD::VSHLu)
-    MAKE_CASE(ARMISD::VSHLIMM)
-    MAKE_CASE(ARMISD::VSHRsIMM)
-    MAKE_CASE(ARMISD::VSHRuIMM)
-    MAKE_CASE(ARMISD::VRSHRsIMM)
-    MAKE_CASE(ARMISD::VRSHRuIMM)
-    MAKE_CASE(ARMISD::VRSHRNIMM)
-    MAKE_CASE(ARMISD::VQSHLsIMM)
-    MAKE_CASE(ARMISD::VQSHLuIMM)
-    MAKE_CASE(ARMISD::VQSHLsuIMM)
-    MAKE_CASE(ARMISD::VQSHRNsIMM)
-    MAKE_CASE(ARMISD::VQSHRNuIMM)
-    MAKE_CASE(ARMISD::VQSHRNsuIMM)
-    MAKE_CASE(ARMISD::VQRSHRNsIMM)
-    MAKE_CASE(ARMISD::VQRSHRNuIMM)
-    MAKE_CASE(ARMISD::VQRSHRNsuIMM)
-    MAKE_CASE(ARMISD::VSLIIMM)
-    MAKE_CASE(ARMISD::VSRIIMM)
-    MAKE_CASE(ARMISD::VGETLANEu)
-    MAKE_CASE(ARMISD::VGETLANEs)
-    MAKE_CASE(ARMISD::VMOVIMM)
-    MAKE_CASE(ARMISD::VMVNIMM)
-    MAKE_CASE(ARMISD::VMOVFPIMM)
-    MAKE_CASE(ARMISD::VDUP)
-    MAKE_CASE(ARMISD::VDUPLANE)
-    MAKE_CASE(ARMISD::VEXT)
-    MAKE_CASE(ARMISD::VREV64)
-    MAKE_CASE(ARMISD::VREV32)
-    MAKE_CASE(ARMISD::VREV16)
-    MAKE_CASE(ARMISD::VZIP)
-    MAKE_CASE(ARMISD::VUZP)
-    MAKE_CASE(ARMISD::VTRN)
-    MAKE_CASE(ARMISD::VTBL1)
-    MAKE_CASE(ARMISD::VTBL2)
-    MAKE_CASE(ARMISD::VMOVN)
-    MAKE_CASE(ARMISD::VQMOVNs)
-    MAKE_CASE(ARMISD::VQMOVNu)
-    MAKE_CASE(ARMISD::VCVTN)
-    MAKE_CASE(ARMISD::VCVTL)
-    MAKE_CASE(ARMISD::VIDUP)
-    MAKE_CASE(ARMISD::VMULLs)
-    MAKE_CASE(ARMISD::VMULLu)
-    MAKE_CASE(ARMISD::VQDMULH)
-    MAKE_CASE(ARMISD::VADDVs)
-    MAKE_CASE(ARMISD::VADDVu)
-    MAKE_CASE(ARMISD::VADDVps)
-    MAKE_CASE(ARMISD::VADDVpu)
-    MAKE_CASE(ARMISD::VADDLVs)
-    MAKE_CASE(ARMISD::VADDLVu)
-    MAKE_CASE(ARMISD::VADDLVAs)
-    MAKE_CASE(ARMISD::VADDLVAu)
-    MAKE_CASE(ARMISD::VADDLVps)
-    MAKE_CASE(ARMISD::VADDLVpu)
-    MAKE_CASE(ARMISD::VADDLVAps)
-    MAKE_CASE(ARMISD::VADDLVApu)
-    MAKE_CASE(ARMISD::VMLAVs)
-    MAKE_CASE(ARMISD::VMLAVu)
-    MAKE_CASE(ARMISD::VMLAVps)
-    MAKE_CASE(ARMISD::VMLAVpu)
-    MAKE_CASE(ARMISD::VMLALVs)
-    MAKE_CASE(ARMISD::VMLALVu)
-    MAKE_CASE(ARMISD::VMLALVps)
-    MAKE_CASE(ARMISD::VMLALVpu)
-    MAKE_CASE(ARMISD::VMLALVAs)
-    MAKE_CASE(ARMISD::VMLALVAu)
-    MAKE_CASE(ARMISD::VMLALVAps)
-    MAKE_CASE(ARMISD::VMLALVApu)
-    MAKE_CASE(ARMISD::VMINVu)
-    MAKE_CASE(ARMISD::VMINVs)
-    MAKE_CASE(ARMISD::VMAXVu)
-    MAKE_CASE(ARMISD::VMAXVs)
-    MAKE_CASE(ARMISD::UMAAL)
-    MAKE_CASE(ARMISD::UMLAL)
-    MAKE_CASE(ARMISD::SMLAL)
-    MAKE_CASE(ARMISD::SMLALBB)
-    MAKE_CASE(ARMISD::SMLALBT)
-    MAKE_CASE(ARMISD::SMLALTB)
-    MAKE_CASE(ARMISD::SMLALTT)
-    MAKE_CASE(ARMISD::SMULWB)
-    MAKE_CASE(ARMISD::SMULWT)
-    MAKE_CASE(ARMISD::SMLALD)
-    MAKE_CASE(ARMISD::SMLALDX)
-    MAKE_CASE(ARMISD::SMLSLD)
-    MAKE_CASE(ARMISD::SMLSLDX)
-    MAKE_CASE(ARMISD::SMMLAR)
-    MAKE_CASE(ARMISD::SMMLSR)
-    MAKE_CASE(ARMISD::QADD16b)
-    MAKE_CASE(ARMISD::QSUB16b)
-    MAKE_CASE(ARMISD::QADD8b)
-    MAKE_CASE(ARMISD::QSUB8b)
-    MAKE_CASE(ARMISD::UQADD16b)
-    MAKE_CASE(ARMISD::UQSUB16b)
-    MAKE_CASE(ARMISD::UQADD8b)
-    MAKE_CASE(ARMISD::UQSUB8b)
-    MAKE_CASE(ARMISD::BUILD_VECTOR)
-    MAKE_CASE(ARMISD::BFI)
-    MAKE_CASE(ARMISD::VORRIMM)
-    MAKE_CASE(ARMISD::VBICIMM)
-    MAKE_CASE(ARMISD::VBSP)
-    MAKE_CASE(ARMISD::MEMCPY)
-    MAKE_CASE(ARMISD::VLD1DUP)
-    MAKE_CASE(ARMISD::VLD2DUP)
-    MAKE_CASE(ARMISD::VLD3DUP)
-    MAKE_CASE(ARMISD::VLD4DUP)
-    MAKE_CASE(ARMISD::VLD1_UPD)
-    MAKE_CASE(ARMISD::VLD2_UPD)
-    MAKE_CASE(ARMISD::VLD3_UPD)
-    MAKE_CASE(ARMISD::VLD4_UPD)
-    MAKE_CASE(ARMISD::VLD1x2_UPD)
-    MAKE_CASE(ARMISD::VLD1x3_UPD)
-    MAKE_CASE(ARMISD::VLD1x4_UPD)
-    MAKE_CASE(ARMISD::VLD2LN_UPD)
-    MAKE_CASE(ARMISD::VLD3LN_UPD)
-    MAKE_CASE(ARMISD::VLD4LN_UPD)
-    MAKE_CASE(ARMISD::VLD1DUP_UPD)
-    MAKE_CASE(ARMISD::VLD2DUP_UPD)
-    MAKE_CASE(ARMISD::VLD3DUP_UPD)
-    MAKE_CASE(ARMISD::VLD4DUP_UPD)
-    MAKE_CASE(ARMISD::VST1_UPD)
-    MAKE_CASE(ARMISD::VST2_UPD)
-    MAKE_CASE(ARMISD::VST3_UPD)
-    MAKE_CASE(ARMISD::VST4_UPD)
-    MAKE_CASE(ARMISD::VST1x2_UPD)
-    MAKE_CASE(ARMISD::VST1x3_UPD)
-    MAKE_CASE(ARMISD::VST1x4_UPD)
-    MAKE_CASE(ARMISD::VST2LN_UPD)
-    MAKE_CASE(ARMISD::VST3LN_UPD)
-    MAKE_CASE(ARMISD::VST4LN_UPD)
-    MAKE_CASE(ARMISD::WLS)
-    MAKE_CASE(ARMISD::WLSSETUP)
-    MAKE_CASE(ARMISD::LE)
-    MAKE_CASE(ARMISD::LOOP_DEC)
-    MAKE_CASE(ARMISD::CSINV)
-    MAKE_CASE(ARMISD::CSNEG)
-    MAKE_CASE(ARMISD::CSINC)
-    MAKE_CASE(ARMISD::MEMCPYLOOP)
-    MAKE_CASE(ARMISD::MEMSETLOOP)
-#undef MAKE_CASE
-  }
-  return nullptr;
-}
-
 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                           EVT VT) const {
   if (!VT.isVector())
@@ -2510,9 +2236,44 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
       Chain = DAG.getStackArgumentTokenFactor(Chain);
-      if (ByValTempChain)
+      if (ByValTempChain) {
+        // In case of large byval copies, reusing the stack frame for tail
+        // calls can lead to overwriting incoming arguments on the stack.
+        // Force loading these stack arguments before the copy to avoid that.
+        SmallVector<SDValue, 8> IncomingLoad;
+        for (unsigned I = 0; I < OutVals.size(); ++I) {
+          if (Outs[I].Flags.isByVal())
+            continue;
+
+          SDValue OutVal = OutVals[I];
+          LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal);
+          if (!OutLN)
+            continue;
+
+          FrameIndexSDNode *FIN =
+              dyn_cast_or_null<FrameIndexSDNode>(OutLN->getBasePtr());
+          if (!FIN)
+            continue;
+
+          if (!MFI.isFixedObjectIndex(FIN->getIndex()))
+            continue;
+
+          for (const CCValAssign &VA : ArgLocs) {
+            if (VA.isMemLoc())
+              IncomingLoad.push_back(OutVal.getValue(1));
+          }
+        }
+
+        // Update the chain to force loads for potentially clobbered argument
+        // loads to happen before the byval copy.
+        if (!IncomingLoad.empty()) {
+          IncomingLoad.push_back(Chain);
+          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad);
+        }
+
         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
                             ByValTempChain);
+      }
 
       AfterFormalArgLoads = true;
     }
@@ -3309,8 +3070,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     return LowerInterruptReturn(RetOps, dl, DAG);
   }
 
-  ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE :
-                                                            ARMISD::RET_GLUE;
+  unsigned RetNode =
+      AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE : ARMISD::RET_GLUE;
   return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
 }
 
@@ -4826,7 +4587,7 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     }
   }
 
-  ARMISD::NodeType CompareType;
+  unsigned CompareType;
   switch (CondCode) {
   default:
     CompareType = ARMISD::CMP;
@@ -20904,7 +20665,7 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
 /// specified in the intrinsic calls.
 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
-                                           const CallInst &I,
+                                           const CallBase &I,
                                            MachineFunction &MF,
                                            unsigned Intrinsic) const {
   switch (Intrinsic) {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index bc2fec3..d0fb58c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -51,319 +51,6 @@ class TargetMachine;
 class TargetRegisterInfo;
 class VectorType;
 
-  namespace ARMISD {
-
-  // ARM Specific DAG Nodes
-  enum NodeType : unsigned {
-    // Start the numbering where the builtin ops and target ops leave off.
-    FIRST_NUMBER = ISD::BUILTIN_OP_END,
-
-    Wrapper,    // Wrapper - A wrapper node for TargetConstantPool,
-                // TargetExternalSymbol, and TargetGlobalAddress.
-    WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in
-                // PIC mode.
-    WrapperJT,  // WrapperJT - A wrapper node for TargetJumpTable
-
-    // Add pseudo op to model memcpy for struct byval.
-    COPY_STRUCT_BYVAL,
-
-    CALL,        // Function call.
-    CALL_PRED,   // Function call that's predicable.
-    CALL_NOLINK, // Function call with branch not branch-and-link.
-    tSECALL,     // CMSE non-secure function call.
-    t2CALL_BTI,  // Thumb function call followed by BTI instruction.
-    BRCOND,      // Conditional branch.
-    BR_JT,       // Jumptable branch.
-    BR2_JT,      // Jumptable branch (2 level - jumptable entry is a jump).
-    RET_GLUE,    // Return with a flag operand.
-    SERET_GLUE,  // CMSE Entry function return with a flag operand.
-    INTRET_GLUE, // Interrupt return with an LR-offset and a flag operand.
-
-    PIC_ADD, // Add with a PC operand and a PIC label.
-
-    ASRL, // MVE long arithmetic shift right.
-    LSRL, // MVE long shift right.
-    LSLL, // MVE long shift left.
-
-    CMP, // ARM compare instructions.
-    CMN, // ARM CMN instructions.
-    CMPZ,     // ARM compare that sets only Z flag.
-    CMPFP,    // ARM VFP compare instruction, sets FPSCR.
-    CMPFPE,   // ARM VFP signalling compare instruction, sets FPSCR.
-    CMPFPw0,  // ARM VFP compare against zero instruction, sets FPSCR.
-    CMPFPEw0, // ARM VFP signalling compare against zero instruction, sets
-              // FPSCR.
-    FMSTAT,   // ARM fmstat instruction.
-
-    CMOV, // ARM conditional move instructions.
-
-    SSAT, // Signed saturation
-    USAT, // Unsigned saturation
-
-    BCC_i64,
-
-    LSLS,  // Flag-setting shift left.
-    LSRS1, // Flag-setting logical shift right by one bit.
-    ASRS1, // Flag-setting arithmetic shift right by one bit.
-    RRX,   // Shift right one bit with carry in.
-
-    ADDC, // Add with carry
-    ADDE, // Add using carry
-    SUBC, // Sub with carry
-    SUBE, // Sub using carry
-
-    VMOVRRD, // double to two gprs.
-    VMOVDRR, // Two gprs to double.
-    VMOVSR,  // move gpr to single, used for f32 literal constructed in a gpr
-
-    EH_SJLJ_SETJMP,         // SjLj exception handling setjmp.
-    EH_SJLJ_LONGJMP,        // SjLj exception handling longjmp.
-    EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch.
-
-    TC_RETURN, // Tail call return pseudo.
-
-    THREAD_POINTER,
-
-    DYN_ALLOC, // Dynamic allocation on the stack.
-
-    MEMBARRIER_MCR, // Memory barrier (MCR)
-
-    PRELOAD, // Preload
-
-    WIN__CHKSTK, // Windows' __chkstk call to do stack probing.
-    WIN__DBZCHK, // Windows' divide by zero check
-
-    WLS,      // Low-overhead loops, While Loop Start branch. See t2WhileLoopStart
-    WLSSETUP, // Setup for the iteration count of a WLS. See t2WhileLoopSetup.
-    LOOP_DEC, // Really a part of LE, performs the sub
-    LE,       // Low-overhead loops, Loop End
-
-    PREDICATE_CAST,  // Predicate cast for MVE i1 types
-    VECTOR_REG_CAST, // Reinterpret the current contents of a vector register
-
-    MVESEXT,  // Legalization aids for extending a vector into two/four vectors.
-    MVEZEXT,  // or truncating two/four vectors into one. Eventually becomes
-    MVETRUNC, // stack store/load sequence, if not optimized to anything else.
-
-    VCMP,  // Vector compare.
-    VCMPZ, // Vector compare to zero.
-    VTST,  // Vector test bits.
-
-    // Vector shift by vector
-    VSHLs, // ...left/right by signed
-    VSHLu, // ...left/right by unsigned
-
-    // Vector shift by immediate:
-    VSHLIMM,  // ...left
-    VSHRsIMM, // ...right (signed)
-    VSHRuIMM, // ...right (unsigned)
-
-    // Vector rounding shift by immediate:
-    VRSHRsIMM, // ...right (signed)
-    VRSHRuIMM, // ...right (unsigned)
-    VRSHRNIMM, // ...right narrow
-
-    // Vector saturating shift by immediate:
-    VQSHLsIMM,   // ...left (signed)
-    VQSHLuIMM,   // ...left (unsigned)
-    VQSHLsuIMM,  // ...left (signed to unsigned)
-    VQSHRNsIMM,  // ...right narrow (signed)
-    VQSHRNuIMM,  // ...right narrow (unsigned)
-    VQSHRNsuIMM, // ...right narrow (signed to unsigned)
-
-    // Vector saturating rounding shift by immediate:
-    VQRSHRNsIMM,  // ...right narrow (signed)
-    VQRSHRNuIMM,  // ...right narrow (unsigned)
-    VQRSHRNsuIMM, // ...right narrow (signed to unsigned)
-
-    // Vector shift and insert:
-    VSLIIMM, // ...left
-    VSRIIMM, // ...right
-
-    // Vector get lane (VMOV scalar to ARM core register)
-    // (These are used for 8- and 16-bit element types only.)
-    VGETLANEu, // zero-extend vector extract element
-    VGETLANEs, // sign-extend vector extract element
-
-    // Vector move immediate and move negated immediate:
-    VMOVIMM,
-    VMVNIMM,
-
-    // Vector move f32 immediate:
-    VMOVFPIMM,
-
-    // Move H <-> R, clearing top 16 bits
-    VMOVrh,
-    VMOVhr,
-
-    // Vector duplicate:
-    VDUP,
-    VDUPLANE,
-
-    // Vector shuffles:
-    VEXT,   // extract
-    VREV64, // reverse elements within 64-bit doublewords
-    VREV32, // reverse elements within 32-bit words
-    VREV16, // reverse elements within 16-bit halfwords
-    VZIP,   // zip (interleave)
-    VUZP,   // unzip (deinterleave)
-    VTRN,   // transpose
-    VTBL1,  // 1-register shuffle with mask
-    VTBL2,  // 2-register shuffle with mask
-    VMOVN,  // MVE vmovn
-
-    // MVE Saturating truncates
-    VQMOVNs, // Vector (V) Saturating (Q) Move and Narrow (N), signed (s)
-    VQMOVNu, // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u)
-
-    // MVE float <> half converts
-    VCVTN, // MVE vcvt f32 -> f16, truncating into either the bottom or top
-           // lanes
-    VCVTL, // MVE vcvt f16 -> f32, extending from either the bottom or top lanes
-
-    // MVE VIDUP instruction, taking a start value and increment.
-    VIDUP,
-
-    // Vector multiply long:
-    VMULLs, // ...signed
-    VMULLu, // ...unsigned
-
-    VQDMULH, // MVE vqdmulh instruction
-
-    // MVE reductions
-    VADDVs,    // sign- or zero-extend the elements of a vector to i32,
-    VADDVu,    // add them all together, and return an i32 of their sum
-    VADDVps,   // Same as VADDV[su] but with a v4i1 predicate mask
-    VADDVpu,
-    VADDLVs,   // sign- or zero-extend elements to i64 and sum, returning
-    VADDLVu,   // the low and high 32-bit halves of the sum
-    VADDLVAs,  // Same as VADDLV[su] but also add an input accumulator
-    VADDLVAu,  // provided as low and high halves
-    VADDLVps,  // Same as VADDLV[su] but with a v4i1 predicate mask
-    VADDLVpu,
-    VADDLVAps, // Same as VADDLVp[su] but with a v4i1 predicate mask
-    VADDLVApu,
-    VMLAVs,    // sign- or zero-extend the elements of two vectors to i32, multiply
-    VMLAVu,    // them and add the results together, returning an i32 of the sum
-    VMLAVps,   // Same as VMLAV[su] with a v4i1 predicate mask
-    VMLAVpu,
-    VMLALVs,   // Same as VMLAV but with i64, returning the low and
-    VMLALVu,   // high 32-bit halves of the sum
-    VMLALVps,  // Same as VMLALV[su] with a v4i1 predicate mask
-    VMLALVpu,
-    VMLALVAs,  // Same as VMLALV but also add an input accumulator
-    VMLALVAu,  // provided as low and high halves
-    VMLALVAps, // Same as VMLALVA[su] with a v4i1 predicate mask
-    VMLALVApu,
-    VMINVu,    // Find minimum unsigned value of a vector and register
-    VMINVs,    // Find minimum signed value of a vector and register
-    VMAXVu,    // Find maximum unsigned value of a vector and register
-    VMAXVs,    // Find maximum signed value of a vector and register
-
-    SMULWB,  // Signed multiply word by half word, bottom
-    SMULWT,  // Signed multiply word by half word, top
-    UMLAL,   // 64bit Unsigned Accumulate Multiply
-    SMLAL,   // 64bit Signed Accumulate Multiply
-    UMAAL,   // 64-bit Unsigned Accumulate Accumulate Multiply
-    SMLALBB, // 64-bit signed accumulate multiply bottom, bottom 16
-    SMLALBT, // 64-bit signed accumulate multiply bottom, top 16
-    SMLALTB, // 64-bit signed accumulate multiply top, bottom 16
-    SMLALTT, // 64-bit signed accumulate multiply top, top 16
-    SMLALD,  // Signed multiply accumulate long dual
-    SMLALDX, // Signed multiply accumulate long dual exchange
-    SMLSLD,  // Signed multiply subtract long dual
-    SMLSLDX, // Signed multiply subtract long dual exchange
-    SMMLAR,  // Signed multiply long, round and add
-    SMMLSR,  // Signed multiply long, subtract and round
-
-    // Single Lane QADD8 and QADD16. Only the bottom lane. That's what the b
-    // stands for.
-    QADD8b,
-    QSUB8b,
-    QADD16b,
-    QSUB16b,
-    UQADD8b,
-    UQSUB8b,
-    UQADD16b,
-    UQSUB16b,
-
-    // Operands of the standard BUILD_VECTOR node are not legalized, which
-    // is fine if BUILD_VECTORs are always lowered to shuffles or other
-    // operations, but for ARM some BUILD_VECTORs are legal as-is and their
-    // operands need to be legalized. Define an ARM-specific version of
-    // BUILD_VECTOR for this purpose.
-    BUILD_VECTOR,
-
-    // Bit-field insert
-    BFI,
-
-    // Vector OR with immediate
-    VORRIMM,
-    // Vector AND with NOT of immediate
-    VBICIMM,
-
-    // Pseudo vector bitwise select
-    VBSP,
-
-    // Pseudo-instruction representing a memory copy using ldm/stm
-    // instructions.
-    MEMCPY,
-
-    // Pseudo-instruction representing a memory copy using a tail predicated
-    // loop
-    MEMCPYLOOP,
-    // Pseudo-instruction representing a memset using a tail predicated
-    // loop
-    MEMSETLOOP,
-
-    // V8.1MMainline condition select
-    CSINV, // Conditional select invert.
-    CSNEG, // Conditional select negate.
-    CSINC, // Conditional select increment.
-
-    // Vector load N-element structure to all lanes:
-    FIRST_MEMORY_OPCODE,
-    VLD1DUP = FIRST_MEMORY_OPCODE,
-    VLD2DUP,
-    VLD3DUP,
-    VLD4DUP,
-
-    // NEON loads with post-increment base updates:
-    VLD1_UPD,
-    VLD2_UPD,
-    VLD3_UPD,
-    VLD4_UPD,
-    VLD2LN_UPD,
-    VLD3LN_UPD,
-    VLD4LN_UPD,
-    VLD1DUP_UPD,
-    VLD2DUP_UPD,
-    VLD3DUP_UPD,
-    VLD4DUP_UPD,
-    VLD1x2_UPD,
-    VLD1x3_UPD,
-    VLD1x4_UPD,
-
-    // NEON stores with post-increment base updates:
-    VST1_UPD,
-    VST2_UPD,
-    VST3_UPD,
-    VST4_UPD,
-    VST2LN_UPD,
-    VST3LN_UPD,
-    VST4LN_UPD,
-    VST1x2_UPD,
-    VST1x3_UPD,
-    VST1x4_UPD,
-
-    // Load/Store of dual registers
-    LDRD,
-    STRD,
-    LAST_MEMORY_OPCODE = STRD,
-  };
-
-  } // end namespace ARMISD
-
   namespace ARM {
   /// Possible values of current rounding mode, which is specified in bits
   /// 23:22 of FPSCR.
@@ -427,8 +114,6 @@ class VectorType;
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
 
-  const char *getTargetNodeName(unsigned Opcode) const override;
-
   bool isSelectSupported(SelectSupportKind Kind) const override {
     // ARM does not support scalar condition selects on vectors.
return (Kind != ScalarCondVectorVal); @@ -630,8 +315,7 @@ class VectorType; bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize = false) const override; - bool getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override; diff --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td index f4326de..5d4e3ac 100644 --- a/llvm/lib/Target/ARM/ARMInstrCDE.td +++ b/llvm/lib/Target/ARM/ARMInstrCDE.td @@ -115,6 +115,7 @@ class CDE_CX1_Instr<string iname, CX_Params params> !con(params.Iops1, (ins imm_13b:$imm), params.PredOp), !strconcat(iname, params.PAsm, "\t$coproc, $Rd, $imm"), params.Cstr> { + bits<0> p; bits<13> imm; bits<4> Rd; @@ -131,6 +132,7 @@ class CDE_CX2_Instr<string iname, CX_Params params> !con(params.Iops2, (ins imm_9b:$imm), params.PredOp), !strconcat(iname, params.PAsm, "\t$coproc, $Rd, $Rn, $imm"), params.Cstr> { + bits<0> p; bits<9> imm; bits<4> Rd; bits<4> Rn; @@ -149,6 +151,7 @@ class CDE_CX3_Instr<string iname, CX_Params params> !con(params.Iops3, (ins imm_6b:$imm), params.PredOp), !strconcat(iname, params.PAsm, "\t$coproc, $Rd, $Rn, $Rm, $imm"), params.Cstr> { + bits<0> p; bits<6> imm; bits<4> Rd; bits<4> Rn; diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td index 1ad2485..1cd1a9a 100644 --- a/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -1220,6 +1220,7 @@ class Thumb1sI<dag oops, dag iops, AddrMode am, int sz, string opc, string asm, string cstr, list<dag> pattern> : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { bits<0> s; + bits<0> p; let OutOperandList = !con(oops, (outs s_cc_out:$s)); let InOperandList = !con(iops, (ins pred:$p)); let AsmString = !strconcat(opc, "${s}${p}", asm); @@ -1244,6 +1245,7 @@ class Thumb1pI<dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + bits<0> p; let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); let AsmString = !strconcat(opc, "${p}", asm); @@ -1343,6 +1345,7 @@ class Thumb2I<dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + bits<0> p; let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); let AsmString = !strconcat(opc, "${p}", asm); @@ -1361,6 +1364,7 @@ class Thumb2sI<dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + bits<0> p; bits<1> s; // condition-code set flag ('1' if the insn should set the flags) let Inst{20} = s; @@ -2221,6 +2225,7 @@ class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> : InstARM<am, 4, im, f, NeonDomain, cstr, itin> { + bits<0> p; let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm); @@ -2234,6 +2239,7 @@ class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> : InstARM<am, 4, im, f, NeonDomain, cstr, itin> { + bits<0> 
p; let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); let AsmString = !strconcat(opc, "${p}", "\t", asm); diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/llvm/lib/Target/ARM/ARMInstrInfo.cpp index c684de7..f370547 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -25,7 +25,8 @@ #include "llvm/MC/MCInst.h" using namespace llvm; -ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI) {} +ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI, RI) {} /// Return the noop instruction to use for a noop. MCInst ARMInstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.h b/llvm/lib/Target/ARM/ARMInstrInfo.h index 178d7a2..9feaf14 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -35,7 +35,7 @@ public: /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// - const ARMRegisterInfo &getRegisterInfo() const override { return RI; } + const ARMRegisterInfo &getRegisterInfo() const { return RI; } private: void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index f7176a6..ddc8941 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -40,7 +40,7 @@ def SDT_ARMCMov : SDTypeProfile<1, 4, [ SDTCisVT<4, FlagsVT>, // in flags ]>; -def SDT_ARMBrcond : SDTypeProfile<0, 2, [ +def SDT_ARMBrcond : SDTypeProfile<0, 3, [ SDTCisVT<0, OtherVT>, // target basic block SDTCisVT<1, CondCodeVT>, // condition code SDTCisVT<2, FlagsVT>, // in flags @@ -133,9 +133,16 @@ def SDT_ARMIntShiftParts : SDTypeProfile<2, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<4>]>; +// Signed multiply accumulate long dual def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>; + +// Signed multiply accumulate long dual exchange def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>; + +// Signed multiply subtract long dual def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>; + +// Signed multiply subtract long dual exchange def ARMSmlsldx : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>; def SDT_ARMCSel : SDTypeProfile<1, 4, [ @@ -146,8 +153,13 @@ def SDT_ARMCSel : SDTypeProfile<1, 4, [ SDTCisVT<3, FlagsVT> // in flags ]>; +// Conditional select invert. def ARMcsinv : SDNode<"ARMISD::CSINV", SDT_ARMCSel>; + +// Conditional select negate. def ARMcsneg : SDNode<"ARMISD::CSNEG", SDT_ARMCSel>; + +// Conditional select increment. def ARMcsinc : SDNode<"ARMISD::CSINC", SDT_ARMCSel>; def SDT_MulHSR : SDTypeProfile<1, 3, [SDTCisVT<0,i32>, @@ -155,110 +167,197 @@ def SDT_MulHSR : SDTypeProfile<1, 3, [SDTCisVT<0,i32>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; +// Signed multiply long, round and add def ARMsmmlar : SDNode<"ARMISD::SMMLAR", SDT_MulHSR>; + +// Signed multiply long, subtract and round def ARMsmmlsr : SDNode<"ARMISD::SMMLSR", SDT_MulHSR>; -// Node definitions. + +// Wrapper - A wrapper node for TargetConstantPool, +// TargetExternalSymbol, and TargetGlobalAddress. def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; + +// WrapperPIC - A wrapper node for TargetGlobalAddress in +// PIC mode. 
def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>; + +// WrapperJT - A wrapper node for TargetJumpTable def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntUnaryOp>; def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart, [SDNPHasChain, SDNPOutGlue]>; def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +// Add pseudo op to model memcpy for struct byval. def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" , SDT_ARMStructByVal, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad]>; +// Function call. def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; + +// Function call that's predicable. def ARMcall_pred : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; + +// Function call with branch not branch-and-link. def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +// Return with a flag operand. def ARMretglue : SDNode<"ARMISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +// CMSE Entry function return with a flag operand. def ARMseretglue : SDNode<"ARMISD::SERET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +// Interrupt return with an LR-offset and a flag operand. def ARMintretglue : SDNode<"ARMISD::INTRET_GLUE", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +// ARM conditional move instructions. def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov>; +// Signed saturation def ARMssat : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>; +// Unsigned saturation def ARMusat : SDNode<"ARMISD::USAT", SDTIntSatNoShOp, []>; +// Conditional branch. def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond, [SDNPHasChain]>; +// Jumptable branch. def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT, [SDNPHasChain]>; + +// Jumptable branch (2 level - jumptable entry is a jump). def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT, [SDNPHasChain]>; def ARMBcci64 : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64, [SDNPHasChain]>; +// ARM compare instructions. def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp>; +// ARM CMN instructions. def ARMcmn : SDNode<"ARMISD::CMN", SDT_ARMCmp>; +// ARM compare that sets only Z flag. def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp, [SDNPCommutative]>; +// Add with a PC operand and a PIC label. def ARMpic_add : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>; +// MVE long arithmetic shift right. def ARMasrl : SDNode<"ARMISD::ASRL", SDT_ARMIntShiftParts, []>; + +// MVE long shift right. def ARMlsrl : SDNode<"ARMISD::LSRL", SDT_ARMIntShiftParts, []>; + +// MVE long shift left. def ARMlsll : SDNode<"ARMISD::LSLL", SDT_ARMIntShiftParts, []>; +// Flag-setting logical shift right by one bit. def ARMlsrs1 : SDNode<"ARMISD::LSRS1", SDTIntUnaryOpWithFlagsOut>; + +// Flag-setting arithmetic shift right by one bit. def ARMasrs1 : SDNode<"ARMISD::ASRS1", SDTIntUnaryOpWithFlagsOut>; + +// Shift right one bit with carry in. def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOpWithFlagsIn>; +// Add with carry def ARMaddc : SDNode<"ARMISD::ADDC", SDTBinaryArithWithFlags, [SDNPCommutative]>; + +// Sub with carry def ARMsubc : SDNode<"ARMISD::SUBC", SDTBinaryArithWithFlags>; + +// Flag-setting shift left. 
def ARMlsls : SDNode<"ARMISD::LSLS", SDTBinaryArithWithFlags>; + +// Add using carry def ARMadde : SDNode<"ARMISD::ADDE", SDTBinaryArithWithFlagsInOut>; + +// Sub using carry def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>; def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>; + +// SjLj exception handling setjmp. def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp, [SDNPHasChain, SDNPSideEffect]>; + +// SjLj exception handling longjmp. def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP", SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain, SDNPSideEffect]>; + +// SjLj exception handling setup_dispatch. def ARMeh_sjlj_setup_dispatch: SDNode<"ARMISD::EH_SJLJ_SETUP_DISPATCH", SDT_ARMEH_SJLJ_SetupDispatch, [SDNPHasChain, SDNPSideEffect]>; +// Memory barrier (MCR) def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER, [SDNPHasChain, SDNPSideEffect]>; + +// Preload def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH, [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>; +// Tail call return pseudo. def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +// Bit-field insert def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; +// Pseudo-instruction representing a memory copy using ldm/stm instructions. def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad]>; +// Signed multiply word by half word, bottom def ARMsmulwb : SDNode<"ARMISD::SMULWB", SDTIntBinOp, []>; + +// Signed multiply word by half word, top def ARMsmulwt : SDNode<"ARMISD::SMULWT", SDTIntBinOp, []>; + +// 64bit Unsigned Accumulate Multiply +def ARMumlal : SDNode<"ARMISD::UMLAL", SDT_LongMac>; + +// 64bit Signed Accumulate Multiply +def ARMsmlal : SDNode<"ARMISD::SMLAL", SDT_LongMac>; + +// 64-bit Unsigned Accumulate Accumulate Multiply +def ARMumaal : SDNode<"ARMISD::UMAAL", SDT_LongMac>; + +// 64-bit signed accumulate multiply bottom, bottom 16 def ARMsmlalbb : SDNode<"ARMISD::SMLALBB", SDT_LongMac, []>; + +// 64-bit signed accumulate multiply bottom, top 16 def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>; + +// 64-bit signed accumulate multiply top, bottom 16 def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>; + +// 64-bit signed accumulate multiply top, top 16 def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>; +// Single Lane QADD8 and QADD16. Only the bottom lane. That's what the b +// stands for. 
def ARMqadd8b : SDNode<"ARMISD::QADD8b", SDT_ARMAnd, []>; def ARMqsub8b : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>; def ARMqadd16b : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>; @@ -270,13 +369,15 @@ def ARMuqadd16b : SDNode<"ARMISD::UQADD16b", SDT_ARMAnd, []>; def ARMuqsub16b : SDNode<"ARMISD::UQSUB16b", SDT_ARMAnd, []>; def SDT_ARMldrd : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; -def ARMldrd : SDNode<"ARMISD::LDRD", SDT_ARMldrd, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; - def SDT_ARMstrd : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; + +// Load/Store of dual registers +def ARMldrd : SDNode<"ARMISD::LDRD", SDT_ARMldrd, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def ARMstrd : SDNode<"ARMISD::STRD", SDT_ARMstrd, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; // Vector operations shared between NEON and MVE +// Vector duplicate def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; // VDUPLANE can produce a quad-register result from a double-register source, @@ -287,40 +388,65 @@ def ARMvduplane : SDNode<"ARMISD::VDUPLANE", def SDTARMVIDUP : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; + +// MVE VIDUP instruction, taking a start value and increment. def ARMvidup : SDNode<"ARMISD::VIDUP", SDTARMVIDUP>; def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; + +// reverse elements within 64-bit doublewords def ARMvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>; + +// reverse elements within 32-bit words def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>; + +// reverse elements within 16-bit halfwords def ARMvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>; def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisVT<2, i32>]>; + +// Vector get lane (VMOV scalar to ARM core register) +// (These are used for 8- and 16-bit element types only.) 
def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; + +// Vector move immediate and move negated immediate def ARMvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>; def ARMvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>; + +// Vector move f32 immediate def ARMvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>; def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>; + +// Vector OR with immediate def ARMvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>; + +// Vector AND with NOT of immediate def ARMvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>; def SDTARMVSHIMM : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>; def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,]>; + +// Vector shift by immediate def ARMvshlImm : SDNode<"ARMISD::VSHLIMM", SDTARMVSHIMM>; def ARMvshrsImm : SDNode<"ARMISD::VSHRsIMM", SDTARMVSHIMM>; def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>; + +// Vector shift by vector def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>; def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>; def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>]>; + +// Vector multiply long def ARMvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>; def ARMvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>; @@ -328,9 +454,13 @@ def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisInt<3>]>; def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>; +// Vector compare. def ARMvcmp : SDNode<"ARMISD::VCMP", SDTARMVCMP>; + +// Vector compare to zero. def ARMvcmpz : SDNode<"ARMISD::VCMPZ", SDTARMVCMPZ>; +// Reinterpret the current contents of a vector register // 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a // vector register as a different vector type, without changing the contents of // the register. It differs from 'bitconvert' in that bitconvert reinterprets @@ -5894,13 +6024,17 @@ def MSRbanked : ABI<0b0001, (outs), (ins banked_reg:$banked, GPRnopc:$Rn), // The main point of having separate instructions is the extra unmodelled effects // (compared to ordinary calls), like the stack pointer change. +// Windows' __chkstk call to do stack probing.
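+// (A node with effects an ordinary call would not capture: the pseudo below
+// reads its argument from R4 and clobbers both R4 and SP.)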
def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone, [SDNPHasChain, SDNPSideEffect]>; + let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP], hasNoSchedulingInfo = 1 in def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>; +// Windows' divide by zero check def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK, [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; + let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary, [(win__dbzchk tGPR:$divisor)]>; diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index e244134..0973187 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -384,6 +384,22 @@ multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, SDPatternOperator Op, Intrin (VTI.Vec MQPR:$inactive)))>; } +def vadd : PatFrags<(ops node:$lhs, node:$rhs), + [(fadd node:$lhs, node:$rhs), + (int_arm_mve_vadd node:$lhs, node:$rhs)]>; +def vsub : PatFrags<(ops node:$lhs, node:$rhs), + [(fsub node:$lhs, node:$rhs), + (int_arm_mve_vsub node:$lhs, node:$rhs)]>; +def vmul : PatFrags<(ops node:$lhs, node:$rhs), + [(fmul node:$lhs, node:$rhs), + (int_arm_mve_vmul node:$lhs, node:$rhs)]>; +def vminnm : PatFrags<(ops node:$lhs, node:$rhs), + [(fminnum node:$lhs, node:$rhs), + (int_arm_mve_vminnm node:$lhs, node:$rhs)]>; +def vmaxnm : PatFrags<(ops node:$lhs, node:$rhs), + [(fmaxnum node:$lhs, node:$rhs), + (int_arm_mve_vmaxnm node:$lhs, node:$rhs)]>; + // --------- Start of base classes for the instructions themselves class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm, @@ -683,8 +699,13 @@ class MVE_VADDV<string iname, string suffix, dag iops, string cstr, def SDTVecReduceP : SDTypeProfile<1, 2, [ // VADDLVp SDTCisInt<0>, SDTCisVec<1>, SDTCisVec<2> ]>; + +// sign- or zero-extend the elements of a vector to i32, +// add them all together, and return an i32 of their sum def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>; def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>; + +// Same as VADDV[su] but with a v4i1 predicate mask def ARMVADDVps : SDNode<"ARMISD::VADDVps", SDTVecReduceP>; def ARMVADDVpu : SDNode<"ARMISD::VADDVpu", SDTVecReduceP>; @@ -806,9 +827,19 @@ multiclass MVE_VADDLV_A<MVEVectorVTInfo VTI> { defvar InstN = !cast<Instruction>(NAME # "no_acc"); defvar letter = VTI.SuffixLetter; + + // sign- or zero-extend elements to i64 and sum, returning + // the low and high 32-bit halves of the sum defvar ARMVADDLV = SDNode<"ARMISD::VADDLV" # letter, SDTVecReduceL>; + + // Same as VADDLV[su] but also add an input accumulator + // provided as low and high halves defvar ARMVADDLVA = SDNode<"ARMISD::VADDLVA" # letter, SDTVecReduceLA>; + + // Same as VADDLV[su] but with a v4i1 predicate mask defvar ARMVADDLVp = SDNode<"ARMISD::VADDLVp" # letter, SDTVecReduceLP>; + + // Same as VADDLVp[su] but with a v4i1 predicate mask defvar ARMVADDLVAp = SDNode<"ARMISD::VADDLVAp" # letter, SDTVecReduceLPA>; let Predicates = [HasMVEInt] in { @@ -943,9 +974,17 @@ multiclass MVE_VMINMAXV_ty<string iname, bit isMin, string intrBaseName> { def SDTVecReduceR : SDTypeProfile<1, 2, [ // Reduction of an integer and vector into an integer SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2> ]>; + +// Find minimum unsigned value of a vector and register def ARMVMINVu : SDNode<"ARMISD::VMINVu", SDTVecReduceR>; + +// Find minimum signed value of a vector and register def ARMVMINVs : SDNode<"ARMISD::VMINVs", SDTVecReduceR>; + +// Find maximum 
unsigned value of a vector and register def ARMVMAXVu : SDNode<"ARMISD::VMAXVu", SDTVecReduceR>; + +// Find maximum signed value of a vector and register def ARMVMAXVs : SDNode<"ARMISD::VMAXVs", SDTVecReduceR>; defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 1, "int_arm_mve_minv">; @@ -1146,16 +1185,31 @@ def SDTVecReduce2LAP : SDTypeProfile<2, 5, [ // VMLALVA SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, SDTCisVec<4>, SDTCisVec<5>, SDTCisVec<6> ]>; + +// sign- or zero-extend the elements of two vectors to i32, multiply +// them and add the results together, returning an i32 of the sum def ARMVMLAVs : SDNode<"ARMISD::VMLAVs", SDTVecReduce2>; def ARMVMLAVu : SDNode<"ARMISD::VMLAVu", SDTVecReduce2>; + +// Same as VMLAV but with i64, returning the low and +// high 32-bit halves of the sum def ARMVMLALVs : SDNode<"ARMISD::VMLALVs", SDTVecReduce2L>; def ARMVMLALVu : SDNode<"ARMISD::VMLALVu", SDTVecReduce2L>; + +// Same as VMLALV but also add an input accumulator +// provided as low and high halves def ARMVMLALVAs : SDNode<"ARMISD::VMLALVAs", SDTVecReduce2LA>; def ARMVMLALVAu : SDNode<"ARMISD::VMLALVAu", SDTVecReduce2LA>; + +// Same as VMLAV[su] with a v4i1 predicate mask def ARMVMLAVps : SDNode<"ARMISD::VMLAVps", SDTVecReduce2P>; def ARMVMLAVpu : SDNode<"ARMISD::VMLAVpu", SDTVecReduce2P>; + +// Same as VMLALV[su] with a v4i1 predicate mask def ARMVMLALVps : SDNode<"ARMISD::VMLALVps", SDTVecReduce2LP>; def ARMVMLALVpu : SDNode<"ARMISD::VMLALVpu", SDTVecReduce2LP>; + +// Same as VMLALVA[su] with a v4i1 predicate mask def ARMVMLALVAps : SDNode<"ARMISD::VMLALVAps", SDTVecReduce2LAP>; def ARMVMLALVApu : SDNode<"ARMISD::VMLALVApu", SDTVecReduce2LAP>; @@ -1441,7 +1495,7 @@ class MVE_VMINMAXNM<string iname, string suffix, bits<2> sz, bit bit_21, let validForTailPredication = 1; } -multiclass MVE_VMINMAXNM_m<string iname, bit bit_4, MVEVectorVTInfo VTI, SDNode Op, Intrinsic PredInt> { +multiclass MVE_VMINMAXNM_m<string iname, bit bit_4, MVEVectorVTInfo VTI, SDPatternOperator Op, Intrinsic PredInt> { def "" : MVE_VMINMAXNM<iname, VTI.Suffix, VTI.Size, bit_4>; let Predicates = [HasMVEFloat] in { @@ -1449,10 +1503,10 @@ multiclass MVE_VMINMAXNM_m<string iname, bit bit_4, MVEVectorVTInfo VTI, SDNode } } -defm MVE_VMAXNMf32 : MVE_VMINMAXNM_m<"vmaxnm", 0b0, MVE_v4f32, fmaxnum, int_arm_mve_max_predicated>; -defm MVE_VMAXNMf16 : MVE_VMINMAXNM_m<"vmaxnm", 0b0, MVE_v8f16, fmaxnum, int_arm_mve_max_predicated>; -defm MVE_VMINNMf32 : MVE_VMINMAXNM_m<"vminnm", 0b1, MVE_v4f32, fminnum, int_arm_mve_min_predicated>; -defm MVE_VMINNMf16 : MVE_VMINMAXNM_m<"vminnm", 0b1, MVE_v8f16, fminnum, int_arm_mve_min_predicated>; +defm MVE_VMAXNMf32 : MVE_VMINMAXNM_m<"vmaxnm", 0b0, MVE_v4f32, vmaxnm, int_arm_mve_max_predicated>; +defm MVE_VMAXNMf16 : MVE_VMINMAXNM_m<"vmaxnm", 0b0, MVE_v8f16, vmaxnm, int_arm_mve_max_predicated>; +defm MVE_VMINNMf32 : MVE_VMINMAXNM_m<"vminnm", 0b1, MVE_v4f32, vminnm, int_arm_mve_min_predicated>; +defm MVE_VMINNMf16 : MVE_VMINMAXNM_m<"vminnm", 0b1, MVE_v8f16, vminnm, int_arm_mve_min_predicated>; class MVE_VMINMAX<string iname, string suffix, bit U, bits<2> size, @@ -1997,6 +2051,7 @@ class MVE_VQxDMULH_Base<string iname, string suffix, bits<2> size, bit rounding, let validForTailPredication = 1; } +// MVE vqdmulh instruction def MVEvqdmulh : SDNode<"ARMISD::VQDMULH", SDTIntBinOp>; multiclass MVE_VQxDMULH_m<string iname, MVEVectorVTInfo VTI, @@ -3566,7 +3621,7 @@ class MVE_VMUL_fp<string iname, string suffix, bits<2> size, list<dag> pattern=[ let validForTailPredication = 1; } -multiclass 
MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, SDNode Op, +multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, SDPatternOperator Op, Intrinsic PredInt, SDPatternOperator IdentityVec> { def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size>; defvar Inst = !cast<Instruction>(NAME); @@ -3577,7 +3632,7 @@ multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, SDNode Op, } multiclass MVE_VMUL_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> - : MVE_VMULT_fp_m<"vmul", VTI, fmul, int_arm_mve_mul_predicated, IdentityVec>; + : MVE_VMULT_fp_m<"vmul", VTI, vmul, int_arm_mve_mul_predicated, IdentityVec>; def ARMimmOneF: PatLeaf<(bitconvert (v4f32 (ARMvmovFPImm (i32 112))))>; // 1.0 float def ARMimmOneH: PatLeaf<(bitconvert (v8i16 (ARMvmovImm (i32 2620))))>; // 1.0 half @@ -3674,6 +3729,10 @@ multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> { if fms then { def : Pat<(VTI.Vec (fma (fneg m1), m2, add)), (Inst $add, $m1, $m2)>; + def : Pat<(VTI.Vec (int_arm_mve_fma (fneg m1), m2, add)), + (Inst $add, $m1, $m2)>; + def : Pat<(VTI.Vec (int_arm_mve_fma m1, (fneg m2), add)), + (Inst $add, $m1, $m2)>; def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), (VTI.Vec (fma (fneg m1), m2, add)), add)), @@ -3685,6 +3744,8 @@ multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> { } else { def : Pat<(VTI.Vec (fma m1, m2, add)), (Inst $add, $m1, $m2)>; + def : Pat<(VTI.Vec (int_arm_mve_fma m1, m2, add)), + (Inst $add, $m1, $m2)>; def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), (VTI.Vec (fma m1, m2, add)), add)), @@ -3701,7 +3762,7 @@ defm MVE_VFMSf32 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v4f32>; defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>; multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI, - SDNode Op, Intrinsic PredInt, SDPatternOperator IdentityVec> { + SDPatternOperator Op, Intrinsic PredInt, SDPatternOperator IdentityVec> { def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size, 0, 1, bit_21> { let validForTailPredication = 1; } @@ -3713,9 +3774,9 @@ multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI, } multiclass MVE_VADD_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> - : MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated, IdentityVec>; + : MVE_VADDSUB_fp_m<"vadd", 0, VTI, vadd, int_arm_mve_add_predicated, IdentityVec>; multiclass MVE_VSUB_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> - : MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated, IdentityVec>; + : MVE_VADDSUB_fp_m<"vsub", 1, VTI, vsub, int_arm_mve_sub_predicated, IdentityVec>; def ARMimmMinusZeroF: PatLeaf<(bitconvert (v4i32 (ARMvmovImm (i32 1664))))>; // -0.0 float def ARMimmMinusZeroH: PatLeaf<(bitconvert (v8i16 (ARMvmovImm (i32 2688))))>; // -0.0 half @@ -4093,7 +4154,7 @@ class MVE_VMAXMINNMA<string iname, string suffix, bits<2> size, bit bit_12, } multiclass MVE_VMAXMINNMA_m<string iname, MVEVectorVTInfo VTI, - SDNode unpred_op, Intrinsic pred_int, + SDPatternOperator unpred_op, Intrinsic pred_int, bit bit_12> { def "" : MVE_VMAXMINNMA<iname, VTI.Suffix, VTI.Size, bit_12>; defvar Inst = !cast<Instruction>(NAME); @@ -4113,13 +4174,13 @@ multiclass MVE_VMAXMINNMA_m<string iname, MVEVectorVTInfo VTI, } multiclass MVE_VMAXNMA<MVEVectorVTInfo VTI, bit bit_12> - : MVE_VMAXMINNMA_m<"vmaxnma", VTI, fmaxnum, int_arm_mve_vmaxnma_predicated, bit_12>; + : MVE_VMAXMINNMA_m<"vmaxnma", VTI, vmaxnm, int_arm_mve_vmaxnma_predicated, bit_12>; defm MVE_VMAXNMAf32 : MVE_VMAXNMA<MVE_v4f32, 0b0>; defm 
MVE_VMAXNMAf16 : MVE_VMAXNMA<MVE_v8f16, 0b0>; multiclass MVE_VMINNMA<MVEVectorVTInfo VTI, bit bit_12> - : MVE_VMAXMINNMA_m<"vminnma", VTI, fminnum, int_arm_mve_vminnma_predicated, bit_12>; + : MVE_VMAXMINNMA_m<"vminnma", VTI, vminnm, int_arm_mve_vminnma_predicated, bit_12>; defm MVE_VMINNMAf32 : MVE_VMINNMA<MVE_v4f32, 0b1>; defm MVE_VMINNMAf16 : MVE_VMINNMA<MVE_v8f16, 0b1>; @@ -4414,6 +4475,7 @@ let Predicates = [HasMVEInt] in { defm PEOR : two_predops<xor, t2EORrr>; } +// Predicate cast for MVE i1 types // Occasionally we need to cast between a i32 and a boolean vector, for // example when moving between rGPR and VPR.P0 as part of predicate vector // shuffles. We also sometimes need to cast between different predicate @@ -4810,6 +4872,7 @@ defm MVE_VQMOVNu32 : MVE_VxMOVxN_halves<"vqmovn", "u32", 0b1, 0b1, 0b01>; defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>; defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>; +// MVE vmovn def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>; multiclass MVE_VMOVN_p<Instruction Inst, bit top, @@ -4880,7 +4943,11 @@ defm : MVE_VQMOVN_p<MVE_VQMOVUNs16th, 1, 0, 1, MVE_v16i8, MVE_v8i16>; def SDTARMVMOVNQ : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVec<2>, SDTCisVT<3, i32>]>; + +// Vector (V) Saturating (Q) Move and Narrow (N), signed (s) def MVEvqmovns : SDNode<"ARMISD::VQMOVNs", SDTARMVMOVNQ>; + +// Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u) def MVEvqmovnu : SDNode<"ARMISD::VQMOVNu", SDTARMVMOVNQ>; let Predicates = [HasMVEInt] in { @@ -4938,7 +5005,11 @@ class MVE_VCVT_ff<string iname, string suffix, bit op, bit T, def SDTARMVCVTL : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i32>]>; + +// MVE vcvt f32 -> f16, truncating into either the bottom or top lanes def MVEvcvtn : SDNode<"ARMISD::VCVTN", SDTARMVMOVNQ>; + +// MVE vcvt f16 -> f32, extending from either the bottom or top lanes def MVEvcvtl : SDNode<"ARMISD::VCVTL", SDTARMVCVTL>; multiclass MVE_VCVT_f2h_m<string iname, int half> { @@ -5342,21 +5413,22 @@ defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16, subnuw, ARMvshruImm>; defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32, subnuw, ARMvshruImm>; multiclass MVE_VADDSUB_qr_f<string iname, MVEVectorVTInfo VTI, bit subtract, - SDNode Op, Intrinsic PredInt, SDPatternOperator IdentityVec> { + SDPatternOperator Op, Intrinsic PredInt, + SDPatternOperator IdentityVec> { def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, subtract, VTI.Size>; defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? 
), !cast<Instruction>(NAME), IdentityVec>; } let Predicates = [HasMVEFloat] in { - defm MVE_VADD_qr_f32 : MVE_VADDSUB_qr_f<"vadd", MVE_v4f32, 0b0, fadd, + defm MVE_VADD_qr_f32 : MVE_VADDSUB_qr_f<"vadd", MVE_v4f32, 0b0, vadd, int_arm_mve_add_predicated, ARMimmMinusZeroF>; - defm MVE_VADD_qr_f16 : MVE_VADDSUB_qr_f<"vadd", MVE_v8f16, 0b0, fadd, + defm MVE_VADD_qr_f16 : MVE_VADDSUB_qr_f<"vadd", MVE_v8f16, 0b0, vadd, int_arm_mve_add_predicated, ARMimmMinusZeroH>; - defm MVE_VSUB_qr_f32 : MVE_VADDSUB_qr_f<"vsub", MVE_v4f32, 0b1, fsub, + defm MVE_VSUB_qr_f32 : MVE_VADDSUB_qr_f<"vsub", MVE_v4f32, 0b1, vsub, int_arm_mve_sub_predicated, ARMimmAllZerosV>; - defm MVE_VSUB_qr_f16 : MVE_VADDSUB_qr_f<"vsub", MVE_v8f16, 0b1, fsub, + defm MVE_VSUB_qr_f16 : MVE_VADDSUB_qr_f<"vsub", MVE_v8f16, 0b1, vsub, int_arm_mve_sub_predicated, ARMimmAllZerosV>; } @@ -5539,7 +5611,7 @@ defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m<MVE_v4s32>; multiclass MVE_VxxMUL_qr_f_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> { let validForTailPredication = 1 in def "" : MVE_VxxMUL_qr<"vmul", VTI.Suffix, VTI.Size{0}, 0b11, VTI.Size>; - defm : MVE_TwoOpPatternDup<VTI, fmul, int_arm_mve_mul_predicated, (? ), + defm : MVE_TwoOpPatternDup<VTI, vmul, int_arm_mve_mul_predicated, (? ), !cast<Instruction>(NAME), IdentityVec>; } @@ -5612,6 +5684,8 @@ multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI, if scalar_addend then { def : Pat<(VTI.Vec (fma v1, v2, vs)), (VTI.Vec (Inst v1, v2, is))>; + def : Pat<(VTI.Vec (int_arm_mve_fma v1, v2, vs)), + (VTI.Vec (Inst v1, v2, is))>; def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), (VTI.Vec (fma v1, v2, vs)), v1)), @@ -5621,6 +5695,10 @@ multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI, (VTI.Vec (Inst v2, v1, is))>; def : Pat<(VTI.Vec (fma vs, v1, v2)), (VTI.Vec (Inst v2, v1, is))>; + def : Pat<(VTI.Vec (int_arm_mve_fma v1, vs, v2)), + (VTI.Vec (Inst v2, v1, is))>; + def : Pat<(VTI.Vec (int_arm_mve_fma vs, v1, v2)), + (VTI.Vec (Inst v2, v1, is))>; def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), (VTI.Vec (fma vs, v2, v1)), v1)), @@ -6865,6 +6943,9 @@ class MVE_WLSTP<string asm, bits<2> size> def SDT_MVEMEMCPYLOOPNODE : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>; + +// Pseudo-instruction representing a memory copy using a tail predicated +// loop def MVE_MEMCPYLOOPNODE : SDNode<"ARMISD::MEMCPYLOOP", SDT_MVEMEMCPYLOOPNODE, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; @@ -6877,6 +6958,9 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CPSR] in { def SDT_MVEMEMSETLOOPNODE : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisVT<1, v16i8>, SDTCisVT<2, i32>]>; + +// Pseudo-instruction representing a memset using a tail predicated +// loop def MVE_MEMSETLOOPNODE : SDNode<"ARMISD::MEMSETLOOP", SDT_MVEMEMSETLOOPNODE, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index 37f0103..90e74a5 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -475,6 +475,8 @@ def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr), //===----------------------------------------------------------------------===// def SDTARMVTST : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>; + +// Vector test bits. def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVTST>; // Types for vector shift by immediates. 
The "SHX" version is for long and @@ -487,10 +489,12 @@ def SDTARMVSHINSIMM : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, def NEONvshrnImm : SDNode<"ARMISD::VSHRNIMM", SDTARMVSHXIMM>; +// Vector rounding shift by immediate def NEONvrshrsImm : SDNode<"ARMISD::VRSHRsIMM", SDTARMVSHIMM>; def NEONvrshruImm : SDNode<"ARMISD::VRSHRuIMM", SDTARMVSHIMM>; def NEONvrshrnImm : SDNode<"ARMISD::VRSHRNIMM", SDTARMVSHXIMM>; +// Vector saturating shift by immediate def NEONvqshlsImm : SDNode<"ARMISD::VQSHLsIMM", SDTARMVSHIMM>; def NEONvqshluImm : SDNode<"ARMISD::VQSHLuIMM", SDTARMVSHIMM>; def NEONvqshlsuImm : SDNode<"ARMISD::VQSHLsuIMM", SDTARMVSHIMM>; @@ -498,13 +502,16 @@ def NEONvqshrnsImm : SDNode<"ARMISD::VQSHRNsIMM", SDTARMVSHXIMM>; def NEONvqshrnuImm : SDNode<"ARMISD::VQSHRNuIMM", SDTARMVSHXIMM>; def NEONvqshrnsuImm : SDNode<"ARMISD::VQSHRNsuIMM", SDTARMVSHXIMM>; +// Vector saturating rounding shift by immediate def NEONvqrshrnsImm : SDNode<"ARMISD::VQRSHRNsIMM", SDTARMVSHXIMM>; def NEONvqrshrnuImm : SDNode<"ARMISD::VQRSHRNuIMM", SDTARMVSHXIMM>; def NEONvqrshrnsuImm : SDNode<"ARMISD::VQRSHRNsuIMM", SDTARMVSHXIMM>; +// Vector shift and insert def NEONvsliImm : SDNode<"ARMISD::VSLIIMM", SDTARMVSHINSIMM>; def NEONvsriImm : SDNode<"ARMISD::VSRIIMM", SDTARMVSHINSIMM>; +// Pseudo vector bitwise select def NEONvbsp : SDNode<"ARMISD::VBSP", SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, @@ -518,15 +525,25 @@ def NEONvext : SDNode<"ARMISD::VEXT", SDTARMVEXT>; def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; + +// zip (interleave) def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>; + +// unzip (deinterleave) def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>; + +// transpose def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>; def SDTARMVTBL1 : SDTypeProfile<1, 2, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>, SDTCisVT<2, v8i8>]>; def SDTARMVTBL2 : SDTypeProfile<1, 3, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>, SDTCisVT<2, v8i8>, SDTCisVT<3, v8i8>]>; + +// 1-register shuffle with mask def NEONvtbl1 : SDNode<"ARMISD::VTBL1", SDTARMVTBL1>; + +// 2-register shuffle with mask def NEONvtbl2 : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>; diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td index 0c5ea3e..0ee98e6 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -14,6 +14,7 @@ // Thumb specific DAG Nodes. // +// CMSE non-secure function call. 
def ARMtsecall : SDNode<"ARMISD::tSECALL", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; @@ -483,6 +484,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>, T1Special<{1,1,0,?}>, Sched<[WriteBr]> { // A6.2.3 & A8.6.25 + bits<0> p; bits<4> Rm; let Inst{6-3} = Rm; let Inst{2-0} = 0b000; @@ -491,6 +493,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def tBXNS : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bxns${p}\t$Rm", []>, Requires<[IsThumb, Has8MSecExt]>, T1Special<{1,1,0,?}>, Sched<[WriteBr]> { + bits<0> p; bits<4> Rm; let Inst{6-3} = Rm; let Inst{2-0} = 0b100; @@ -523,6 +526,7 @@ let isCall = 1, "bl${p}\t$func", [(ARMcall tglobaladdr:$func)]>, Requires<[IsThumb]>, Sched<[WriteBrL]> { + bits<0> p; bits<24> func; let Inst{26} = func{23}; let Inst{25-16} = func{20-11}; @@ -536,6 +540,7 @@ let isCall = 1, (outs), (ins pred:$p, thumb_blx_target:$func), IIC_Br, "blx${p}\t$func", []>, Requires<[IsThumb, HasV5T, IsNotMClass]>, Sched<[WriteBrL]> { + bits<0> p; bits<24> func; let Inst{26} = func{23}; let Inst{25-16} = func{20-11}; @@ -550,6 +555,7 @@ let isCall = 1, "blx${p}\t$func", []>, Requires<[IsThumb, HasV5T]>, T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24; + bits<0> p; bits<4> func; let Inst{6-3} = func; let Inst{2-0} = 0b000; @@ -565,6 +571,7 @@ let isCall = 1, "blxns${p}\t$func", []>, Requires<[IsThumb, Has8MSecExt]>, T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { + bits<0> p; bits<4> func; let Inst{6-3} = func; let Inst{2-0} = 0b100; @@ -824,6 +831,7 @@ let hasSideEffects = 0 in { let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops), IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> { + bits<0> p; bits<3> Rn; bits<8> regs; let Inst{10-8} = Rn; @@ -854,6 +862,7 @@ def tSTMIA_UPD : Thumb1I<(outs tGPR:$wb), AddrModeNone, 2, IIC_iStore_mu, "stm${p}\t$Rn!, $regs", "$Rn = $wb", []>, T1Encoding<{1,1,0,0,0,?}> { + bits<0> p; bits<3> Rn; bits<8> regs; let Inst{10-8} = Rn; @@ -872,6 +881,7 @@ def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), IIC_iPop, "pop${p}\t$regs", []>, T1Misc<{1,1,0,?,?,?,?}>, Sched<[WriteLd]> { + bits<0> p; bits<16> regs; let Inst{8} = regs{15}; let Inst{7-0} = regs{7-0}; @@ -882,6 +892,7 @@ def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), IIC_iStore_m, "push${p}\t$regs", []>, T1Misc<{0,1,0,?,?,?,?}>, Sched<[WriteST]> { + bits<0> p; bits<16> regs; let Inst{8} = regs{14}; let Inst{7-0} = regs{7-0}; diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index c229c8e..596196c 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -2059,6 +2059,7 @@ multiclass thumb2_ld_mult<string asm, InstrItinClass itin, def IA : T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> { + bits<0> p; bits<4> Rn; bits<16> regs; @@ -2074,6 +2075,7 @@ multiclass thumb2_ld_mult<string asm, InstrItinClass itin, def IA_UPD : T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<0> p; bits<4> Rn; bits<16> regs; @@ -2089,6 +2091,7 @@ multiclass thumb2_ld_mult<string asm, InstrItinClass itin, def DB : T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, 
variable_ops), itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> { + bits<0> p; bits<4> Rn; bits<16> regs; @@ -2104,6 +2107,7 @@ multiclass thumb2_ld_mult<string asm, InstrItinClass itin, def DB_UPD : T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<0> p; bits<4> Rn; bits<16> regs; @@ -2128,6 +2132,7 @@ multiclass thumb2_st_mult<string asm, InstrItinClass itin, def IA : T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> { + bits<0> p; bits<4> Rn; bits<16> regs; @@ -2146,6 +2151,7 @@ multiclass thumb2_st_mult<string asm, InstrItinClass itin, def IA_UPD : T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<0> p; bits<4> Rn; bits<16> regs; @@ -2164,6 +2170,7 @@ multiclass thumb2_st_mult<string asm, InstrItinClass itin, def DB : T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> { + bits<0> p; bits<4> Rn; bits<16> regs; @@ -2182,6 +2189,7 @@ multiclass thumb2_st_mult<string asm, InstrItinClass itin, def DB_UPD : T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<0> p; bits<4> Rn; bits<16> regs; @@ -4030,9 +4038,11 @@ def t2TBH : T2I<(outs), (ins (addrmode_tbh $Rn, $Rm):$addr), IIC_Br, // FIXME: should be able to write a pattern for ARMBrcond, but can't use // a two-value operand where a dag node expects two operands. :( let isBranch = 1, isTerminator = 1 in -def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br, - "b", ".w\t$target", - [/*(ARMbrcond bb:$target, imm:$cc)*/]>, Sched<[WriteBr]> { +def t2Bcc : Thumb2XI<(outs), (ins brtarget:$target, pred:$p), + AddrModeNone, 4, IIC_Br, + "b${p}.w\t$target", "", + [/*(ARMbrcond bb:$target, imm:$cc)*/]>, + Sched<[WriteBr]> { let Inst{31-27} = 0b11110; let Inst{15-14} = 0b10; let Inst{12} = 0; @@ -5481,6 +5491,7 @@ class V8_1MI<dag oops, dag iops, AddrMode am, InstrItinClass itin, string asm, def t2CLRM : V8_1MI<(outs), (ins pred:$p, reglist_with_apsr:$regs, variable_ops), AddrModeNone, NoItinerary, "clrm${p}", "$regs", "", []> { + bits<0> p; bits<16> regs; let Inst{31-16} = 0b1110100010011111; @@ -5509,6 +5520,7 @@ def t2BF_LabelPseudo def t2BFi : t2BF<(ins bflabel_u4:$b_label, bflabel_s16:$label, pred:$p), !strconcat("bf", "${p}"), "$b_label, $label"> { + bits<0> p; bits<4> b_label; bits<16> label; @@ -5540,6 +5552,7 @@ def t2BFic : t2BF<(ins bflabel_u4:$b_label, bflabel_s12:$label, def t2BFr : t2BF<(ins bflabel_u4:$b_label, rGPR:$Rn, pred:$p), !strconcat("bfx", "${p}"), "$b_label, $Rn"> { + bits<0> p; bits<4> b_label; bits<4> Rn; @@ -5551,6 +5564,7 @@ def t2BFr : t2BF<(ins bflabel_u4:$b_label, rGPR:$Rn, pred:$p), def t2BFLi : t2BF<(ins bflabel_u4:$b_label, bflabel_s18:$label, pred:$p), !strconcat("bfl", "${p}"), "$b_label, $label"> { + bits<0> p; bits<4> b_label; bits<18> label; @@ -5563,6 +5577,7 @@ def t2BFLi : t2BF<(ins bflabel_u4:$b_label, bflabel_s18:$label, pred:$p), def t2BFLr : t2BF<(ins bflabel_u4:$b_label, rGPR:$Rn, pred:$p), !strconcat("bflx", "${p}"), "$b_label, $Rn"> { + bits<0> p; bits<4> b_label; bits<4> Rn; @@ -5581,6 +5596,25 @@ class t2LOL<dag oops, dag iops, string asm, string ops> let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB]; } +// Setup for the iteration count of a WLS. See t2WhileLoopSetup.
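+// A rough sketch of how the four nodes below pair up in a lowered
+// low-overhead loop (illustrative only):
+//   lr = WLSSETUP(tripcount)
+//   WLS lr, <exit>      ; skip the loop entirely if the count is zero
+// <loop>:
+//   ...
+//   lr = LOOP_DEC(lr, 1)
+//   LE lr, <loop>       ; branch back while iterations remain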
+def arm_wlssetup + : SDNode<"ARMISD::WLSSETUP", + SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<1, 0>]>, + [SDNPSideEffect]>; + +// Low-overhead loops, While Loop Start branch. See t2WhileLoopStart +def arm_wls : SDNode<"ARMISD::WLS", + SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>, + [SDNPHasChain]>; + +// Really a part of LE, performs the sub +def arm_loop_dec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>; + +// Low-overhead loops, Loop End +def arm_le : SDNode<"ARMISD::LE", + SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>, + [SDNPHasChain]>; + let isNotDuplicable = 1 in { def t2WLS : t2LOL<(outs GPRlr:$LR), (ins rGPR:$Rn, wlslabel_u11:$label), @@ -5650,16 +5684,19 @@ def t2DoLoopStartTP : // t2WhileLoopSetup to setup LR and t2WhileLoopStart to perform the branch. Not // valid after reg alloc, as it should be lowered during MVETPAndVPTOptimisations // into a t2WhileLoopStartLR (or expanded). +let hasSideEffects = 1 in def t2WhileLoopSetup : - t2PseudoInst<(outs GPRlr:$lr), (ins rGPR:$tc), 4, IIC_Br, []>; + t2PseudoInst<(outs GPRlr:$lr), (ins rGPR:$tc), 4, IIC_Br, + [(set i32:$lr, (arm_wlssetup i32:$tc))]>; // A pseudo to represent the decrement in a low overhead loop. A t2LoopDec and // t2LoopEnd together represent a LE instruction. Ideally these are converted // to a t2LoopEndDec which is lowered as a single instruction. let hasSideEffects = 0 in def t2LoopDec : - t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size), - 4, IIC_Br, []>, Sched<[WriteBr]>; + t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size), 4, IIC_Br, + [(set i32:$Rm, (arm_loop_dec i32:$Rn, timm:$size))]>, + Sched<[WriteBr]>; let isBranch = 1, isTerminator = 1, hasSideEffects = 1, Defs = [CPSR] in { // The branch in a t2WhileLoopSetup/t2WhileLoopStart pair, eventually turned @@ -5667,8 +5704,8 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 1, Defs = [CPSR] in { def t2WhileLoopStart : t2PseudoInst<(outs), (ins GPRlr:$tc, brtarget:$target), - 4, IIC_Br, []>, - Sched<[WriteBr]>; + 4, IIC_Br, [(arm_wls i32:$tc, bb:$target)]>, + Sched<[WriteBr]>; // WhileLoopStartLR that sets up LR and branches on zero, equivalent to WLS. It // is lowered in the ARMLowOverheadLoops pass providing the branches are within @@ -5690,8 +5727,9 @@ def t2WhileLoopStartTP : // t2LoopEnd - the branch half of a t2LoopDec/t2LoopEnd pair. def t2LoopEnd : - t2PseudoInst<(outs), (ins GPRlr:$tc, brtarget:$target), - 8, IIC_Br, []>, Sched<[WriteBr]>; + t2PseudoInst<(outs), (ins GPRlr:$tc, brtarget:$target), + 8, IIC_Br, [(arm_le i32:$tc, bb:$target)]>, + Sched<[WriteBr]>; // The combination of a t2LoopDec and t2LoopEnd, performing both the LR // decrement and branch as a single instruction. Is lowered to a LE or @@ -5803,6 +5841,7 @@ let Predicates = [IsThumb2, HasV8_1MMainline, HasPACBTI] in { def t2PACG : V8_1MI<(outs rGPR:$Rd), (ins pred:$p, GPRnopc:$Rn, GPRnopc:$Rm), AddrModeNone, NoItinerary, "pacg${p}", "$Rd, $Rn, $Rm", "", []> { + bits<0> p; bits<4> Rd; bits<4> Rn; bits<4> Rm; @@ -5818,6 +5857,7 @@ let hasSideEffects = 1 in { class PACBTIAut<dag iops, string asm, bit b> : V8_1MI<(outs), iops, AddrModeNone, NoItinerary, asm, "$Ra, $Rn, $Rm", "", []> { + bits<0> p; bits<4> Ra; bits<4> Rn; bits<4> Rm; @@ -5873,6 +5913,7 @@ def t2AUT : PACBTIHintSpaceUseInst<"aut", 0b00101101> { let hasSideEffects = 1; } +// Thumb function call followed by BTI instruction. 
def ARMt2CallBTI : SDNode<"ARMISD::t2CALL_BTI", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index e2cc97b..5f5f703 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -28,11 +28,20 @@ def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, def SDT_VMOVSR : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i32>]>; +// ARM VFP compare instruction, sets FPSCR. def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_CMPFP>; + +// ARM VFP compare against zero instruction, sets FPSCR. def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0>; + +// ARM VFP signalling compare instruction, sets FPSCR. def arm_cmpfpe : SDNode<"ARMISD::CMPFPE", SDT_CMPFP>; + +// ARM VFP signalling compare against zero instruction, sets +// FPSCR. def arm_cmpfpe0 : SDNode<"ARMISD::CMPFPEw0", SDT_CMPFP0>; +// ARM fmstat instruction. def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTypeProfile<1, 1, [ SDTCisVT<0, FlagsVT>, // out flags @@ -40,12 +49,19 @@ def arm_fmstat : SDNode<"ARMISD::FMSTAT", ]> >; +// Two gprs to double. def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>; + +// double to two gprs. def arm_fmrrd : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>; + +// move gpr to single, used for f32 literal constructed in a gpr def arm_vmovsr : SDNode<"ARMISD::VMOVSR", SDT_VMOVSR>; def SDT_VMOVhr : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, i32>] >; def SDT_VMOVrh : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisFP<1>] >; + +// Move H <-> R, clearing top 16 bits def arm_vmovhr : SDNode<"ARMISD::VMOVhr", SDT_VMOVhr>; def arm_vmovrh : SDNode<"ARMISD::VMOVrh", SDT_VMOVrh>; @@ -798,7 +814,7 @@ def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), def : FP16Pat<(f32 (any_fpextend (f16 HPR:$Sm))), (VCVTBHS (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>; -def : FP16Pat<(f16_to_fp GPR:$a), +def : FP16Pat<(any_f16_to_fp GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; let hasSideEffects = 0, mayRaiseFPException = 1, Uses = [FPSCR_RM] in @@ -810,7 +826,7 @@ def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda, def : FP16Pat<(f16 (any_fpround SPR:$Sm)), (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$Sm), HPR)>; -def : FP16Pat<(fp_to_f16 SPR:$a), +def : FP16Pat<(any_fp_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$a), GPR))>; def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane), (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), @@ -875,7 +891,7 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, def : FullFP16Pat<(f64 (any_fpextend (f16 HPR:$Sm))), (VCVTBHD (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>, Requires<[HasFPARMv8, HasDPVFP]>; -def : FP16Pat<(f64 (f16_to_fp GPR:$a)), +def : FP16Pat<(f64 (any_f16_to_fp GPR:$a)), (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>, Requires<[HasFPARMv8, HasDPVFP]>; @@ -901,7 +917,7 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, def : FullFP16Pat<(f16 (any_fpround DPR:$Dm)), (COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$Dm), HPR)>, Requires<[HasFPARMv8, HasDPVFP]>; -def : FP16Pat<(fp_to_f16 (f64 DPR:$a)), +def : FP16Pat<(any_fp_to_f16 (f64 DPR:$a)), (i32 (COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$a), GPR))>, Requires<[HasFPARMv8, HasDPVFP]>; diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index cd4299b..db37b76 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp 
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2424,7 +2424,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps( Ops.pop_back(); const MCInstrDesc &MCID = TII->get(NewOpc); - const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0); MRI->constrainRegClass(FirstReg, TRC); MRI->constrainRegClass(SecondReg, TRC); @@ -3014,7 +3014,7 @@ static void AdjustBaseAndOffset(MachineInstr *MI, Register NewBaseReg, MachineFunction *MF = MI->getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); const MCInstrDesc &MCID = TII->get(MI->getOpcode()); - const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp); MRI.constrainRegClass(NewBaseReg, TRC); int OldOffset = MI->getOperand(BaseOp + 1).getImm(); @@ -3071,10 +3071,10 @@ static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset, const MCInstrDesc &MCID = TII->get(NewOpcode); // Constrain the def register class - const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0); MRI.constrainRegClass(NewReg, TRC); // And do the same for the base operand - TRC = TII->getRegClass(MCID, 2, TRI); + TRC = TII->getRegClass(MCID, 2); MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC); unsigned AddrMode = (MCID.TSFlags & ARMII::AddrModeMask); diff --git a/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/llvm/lib/Target/ARM/ARMMCInstLower.cpp index f5d6597..c040904 100644 --- a/llvm/lib/Target/ARM/ARMMCInstLower.cpp +++ b/llvm/lib/Target/ARM/ARMMCInstLower.cpp @@ -112,8 +112,8 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO, MCOp = GetSymbolRef(MO, GetJTISymbol(MO.getIndex())); break; case MachineOperand::MO_ConstantPoolIndex: - if (Subtarget->genExecuteOnly()) - llvm_unreachable("execute-only should not generate constant pools"); + assert(!MF->getSubtarget<ARMSubtarget>().genExecuteOnly() && + "execute-only should not generate constant pools"); MCOp = GetSymbolRef(MO, GetCPISymbol(MO.getIndex())); break; case MachineOperand::MO_BlockAddress: diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 72eb3d0..b689760 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -19,7 +19,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/Support/ErrorHandling.h" -#include <utility> namespace llvm { diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index bf7c962f..501dce9 100644 --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -10,9 +10,14 @@ // //===----------------------------------------------------------------------===// +#include "ARMSelectionDAGInfo.h" #include "ARMTargetTransformInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Support/CommandLine.h" + +#define GET_SDNODE_DESC +#include "ARMGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "arm-selectiondag-info" @@ -30,9 +35,83 @@ static cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop( "Allow (may be subject to certain conditions) " "conversion of memcpy to TP loop."))); +ARMSelectionDAGInfo::ARMSelectionDAGInfo() + : SelectionDAGGenTargetInfo(ARMGenSDNodeInfo) {} + +const char *ARMSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { +#define MAKE_CASE(V) \ + case V: \ + return #V; + + 
// These nodes don't have corresponding entries in *.td files yet. + switch (static_cast<ARMISD::NodeType>(Opcode)) { + MAKE_CASE(ARMISD::DYN_ALLOC) + MAKE_CASE(ARMISD::MVESEXT) + MAKE_CASE(ARMISD::MVEZEXT) + MAKE_CASE(ARMISD::MVETRUNC) + MAKE_CASE(ARMISD::BUILD_VECTOR) + MAKE_CASE(ARMISD::VLD1DUP) + MAKE_CASE(ARMISD::VLD2DUP) + MAKE_CASE(ARMISD::VLD3DUP) + MAKE_CASE(ARMISD::VLD4DUP) + MAKE_CASE(ARMISD::VLD1_UPD) + MAKE_CASE(ARMISD::VLD2_UPD) + MAKE_CASE(ARMISD::VLD3_UPD) + MAKE_CASE(ARMISD::VLD4_UPD) + MAKE_CASE(ARMISD::VLD1x2_UPD) + MAKE_CASE(ARMISD::VLD1x3_UPD) + MAKE_CASE(ARMISD::VLD1x4_UPD) + MAKE_CASE(ARMISD::VLD2LN_UPD) + MAKE_CASE(ARMISD::VLD3LN_UPD) + MAKE_CASE(ARMISD::VLD4LN_UPD) + MAKE_CASE(ARMISD::VLD1DUP_UPD) + MAKE_CASE(ARMISD::VLD2DUP_UPD) + MAKE_CASE(ARMISD::VLD3DUP_UPD) + MAKE_CASE(ARMISD::VLD4DUP_UPD) + MAKE_CASE(ARMISD::VST1_UPD) + MAKE_CASE(ARMISD::VST3_UPD) + MAKE_CASE(ARMISD::VST1x2_UPD) + MAKE_CASE(ARMISD::VST1x3_UPD) + MAKE_CASE(ARMISD::VST1x4_UPD) + MAKE_CASE(ARMISD::VST2LN_UPD) + MAKE_CASE(ARMISD::VST3LN_UPD) + MAKE_CASE(ARMISD::VST4LN_UPD) + } +#undef MAKE_CASE + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); +} + bool ARMSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { - return Opcode >= ARMISD::FIRST_MEMORY_OPCODE && - Opcode <= ARMISD::LAST_MEMORY_OPCODE; + // These nodes don't have corresponding entries in *.td files yet. + if (Opcode >= ARMISD::FIRST_MEMORY_OPCODE && + Opcode <= ARMISD::LAST_MEMORY_OPCODE) + return true; + + return SelectionDAGGenTargetInfo::isTargetMemoryOpcode(Opcode); +} + +void ARMSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const { + switch (N->getOpcode()) { + default: + break; + case ARMISD::WIN__DBZCHK: + // invalid number of results; expected 2, got 1 + case ARMISD::WIN__CHKSTK: + // invalid number of results; expected 1, got 2 + case ARMISD::COPY_STRUCT_BYVAL: + // invalid number of operands; expected 6, got 5 + case ARMISD::MEMCPY: + // invalid number of operands; expected 5, got 4 + case ARMISD::VMOVRRD: + // operand #0 must have type f64, but has type v1i64/v4f16/v8i8 + case ARMISD::VMOVIMM: + // operand #0 must have type i32, but has type i16 + return; + } + + SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); } // Emit, if possible, a specialized version of the given Libcall. Typically this diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h index d68150e..38d2a65 100644 --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -17,7 +17,62 @@ #include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "ARMGenSDNodeInfo.inc" + namespace llvm { +namespace ARMISD { + +enum NodeType : unsigned { + DYN_ALLOC = GENERATED_OPCODE_END, // Dynamic allocation on the stack. + + MVESEXT, // Legalization aids for extending a vector into two/four vectors. + MVEZEXT, // or truncating two/four vectors into one. Eventually becomes + MVETRUNC, // stack store/load sequence, if not optimized to anything else. + + // Operands of the standard BUILD_VECTOR node are not legalized, which + // is fine if BUILD_VECTORs are always lowered to shuffles or other + // operations, but for ARM some BUILD_VECTORs are legal as-is and their + // operands need to be legalized. Define an ARM-specific version of + // BUILD_VECTOR for this purpose. 
+ BUILD_VECTOR, + + // Vector load N-element structure to all lanes: + FIRST_MEMORY_OPCODE, + VLD1DUP = FIRST_MEMORY_OPCODE, + VLD2DUP, + VLD3DUP, + VLD4DUP, + + // NEON loads with post-increment base updates: + VLD1_UPD, + VLD2_UPD, + VLD3_UPD, + VLD4_UPD, + VLD2LN_UPD, + VLD3LN_UPD, + VLD4LN_UPD, + VLD1DUP_UPD, + VLD2DUP_UPD, + VLD3DUP_UPD, + VLD4DUP_UPD, + VLD1x2_UPD, + VLD1x3_UPD, + VLD1x4_UPD, + + // NEON stores with post-increment base updates: + VST1_UPD, + VST3_UPD, + VST2LN_UPD, + VST3LN_UPD, + VST4LN_UPD, + VST1x2_UPD, + VST1x3_UPD, + VST1x4_UPD, + LAST_MEMORY_OPCODE = VST1x4_UPD, +}; + +} // namespace ARMISD namespace ARM_AM { static inline ShiftOpc getShiftOpcForNode(unsigned Opcode) { @@ -35,10 +90,17 @@ namespace ARM_AM { } } // end namespace ARM_AM -class ARMSelectionDAGInfo : public SelectionDAGTargetInfo { +class ARMSelectionDAGInfo : public SelectionDAGGenTargetInfo { public: + ARMSelectionDAGInfo(); + + const char *getTargetNodeName(unsigned Opcode) const override; + bool isTargetMemoryOpcode(unsigned Opcode) const override; + void verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const override; + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, @@ -66,6 +128,6 @@ public: RTLIB::Libcall LC) const; }; -} +} // namespace llvm #endif diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 58bc338..e6af32d 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -129,6 +129,76 @@ const RegisterBankInfo *ARMSubtarget::getRegBankInfo() const { return RegBankInfo.get(); } +void ARMSubtarget::initLibcallLoweringInfo(LibcallLoweringInfo &Info) const { + const Triple &TT = getTargetTriple(); + if (TT.isOSBinFormatMachO()) { + // Uses VFP for Thumb libfuncs if available. + if (isThumb() && hasVFP2Base() && hasARMOps() && !useSoftFloat()) { + // clang-format off + static const struct { + const RTLIB::Libcall Op; + const RTLIB::LibcallImpl Impl; + } LibraryCalls[] = { + // Single-precision floating-point arithmetic. + { RTLIB::ADD_F32, RTLIB::impl___addsf3vfp }, + { RTLIB::SUB_F32, RTLIB::impl___subsf3vfp }, + { RTLIB::MUL_F32, RTLIB::impl___mulsf3vfp }, + { RTLIB::DIV_F32, RTLIB::impl___divsf3vfp }, + + // Double-precision floating-point arithmetic. + { RTLIB::ADD_F64, RTLIB::impl___adddf3vfp }, + { RTLIB::SUB_F64, RTLIB::impl___subdf3vfp }, + { RTLIB::MUL_F64, RTLIB::impl___muldf3vfp }, + { RTLIB::DIV_F64, RTLIB::impl___divdf3vfp }, + + // Single-precision comparisons. + { RTLIB::OEQ_F32, RTLIB::impl___eqsf2vfp }, + { RTLIB::UNE_F32, RTLIB::impl___nesf2vfp }, + { RTLIB::OLT_F32, RTLIB::impl___ltsf2vfp }, + { RTLIB::OLE_F32, RTLIB::impl___lesf2vfp }, + { RTLIB::OGE_F32, RTLIB::impl___gesf2vfp }, + { RTLIB::OGT_F32, RTLIB::impl___gtsf2vfp }, + { RTLIB::UO_F32, RTLIB::impl___unordsf2vfp }, + + // Double-precision comparisons. + { RTLIB::OEQ_F64, RTLIB::impl___eqdf2vfp }, + { RTLIB::UNE_F64, RTLIB::impl___nedf2vfp }, + { RTLIB::OLT_F64, RTLIB::impl___ltdf2vfp }, + { RTLIB::OLE_F64, RTLIB::impl___ledf2vfp }, + { RTLIB::OGE_F64, RTLIB::impl___gedf2vfp }, + { RTLIB::OGT_F64, RTLIB::impl___gtdf2vfp }, + { RTLIB::UO_F64, RTLIB::impl___unorddf2vfp }, + + // Floating-point to integer conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. 
+ { RTLIB::FPTOSINT_F64_I32, RTLIB::impl___fixdfsivfp },
+ { RTLIB::FPTOUINT_F64_I32, RTLIB::impl___fixunsdfsivfp },
+ { RTLIB::FPTOSINT_F32_I32, RTLIB::impl___fixsfsivfp },
+ { RTLIB::FPTOUINT_F32_I32, RTLIB::impl___fixunssfsivfp },
+
+ // Conversions between floating types.
+ { RTLIB::FPROUND_F64_F32, RTLIB::impl___truncdfsf2vfp },
+ { RTLIB::FPEXT_F32_F64, RTLIB::impl___extendsfdf2vfp },
+
+ // Integer to floating-point conversions.
+ // i64 conversions are done via library routines even when generating VFP
+ // instructions, so use the same ones.
+ // FIXME: There appears to be some naming inconsistency in ARM libgcc:
+ // e.g., __floatunsidf vs. __floatunssidfvfp.
+ { RTLIB::SINTTOFP_I32_F64, RTLIB::impl___floatsidfvfp },
+ { RTLIB::UINTTOFP_I32_F64, RTLIB::impl___floatunssidfvfp },
+ { RTLIB::SINTTOFP_I32_F32, RTLIB::impl___floatsisfvfp },
+ { RTLIB::UINTTOFP_I32_F32, RTLIB::impl___floatunssisfvfp },
+ };
+ // clang-format on
+
+ for (const auto &LC : LibraryCalls)
+ Info.setLibcallImpl(LC.Op, LC.Impl);
+ }
+ }
+}
+
 bool ARMSubtarget::isXRaySupported() const {
 // We don't currently support Thumb, but Windows requires Thumb.
 return hasV6Ops() && hasARMOps() && !isTargetWindows();
@@ -309,26 +379,21 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { }
 bool ARMSubtarget::isROPI() const {
+ // FIXME: This should ideally come from a function attribute, to work
+ // correctly with LTO.
 return TM.getRelocationModel() == Reloc::ROPI ||
 TM.getRelocationModel() == Reloc::ROPI_RWPI;
 }
+
 bool ARMSubtarget::isRWPI() const {
+ // FIXME: This should ideally come from a function attribute, to work
+ // correctly with LTO.
 return TM.getRelocationModel() == Reloc::RWPI ||
 TM.getRelocationModel() == Reloc::ROPI_RWPI;
 }
 bool ARMSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
- if (!TM.shouldAssumeDSOLocal(GV))
- return true;
-
- // 32 bit macho has no relocation for a-b if a is undefined, even if b is in
- // the section that is being relocated. This means we have to use o load even
- // for GVs that are known to be local to the dso.
- if (isTargetMachO() && TM.isPositionIndependent() &&
- (GV->isDeclarationForLinker() || GV->hasCommonLinkage()))
- return true;
-
- return false;
+ return TM.isGVIndirectSymbol(GV);
 }
 bool ARMSubtarget::isGVInGOT(const GlobalValue *GV) const {
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 4a0883c..2a90f42 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -258,6 +258,7 @@ public:
 InstructionSelector *getInstructionSelector() const override;
 const LegalizerInfo *getLegalizerInfo() const override;
 const RegisterBankInfo *getRegBankInfo() const override;
+ void initLibcallLoweringInfo(LibcallLoweringInfo &Info) const override;
 private:
 ARMSelectionDAGInfo TSInfo;
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.h b/llvm/lib/Target/ARM/ARMTargetMachine.h
index c417c4c..1f74e9f 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.h
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -98,6 +98,20 @@ public:
 return true;
 }
+ bool isGVIndirectSymbol(const GlobalValue *GV) const {
+ if (!shouldAssumeDSOLocal(GV))
+ return true;
+
+ // 32 bit macho has no relocation for a-b if a is undefined, even if b is in
+ // the section that is being relocated. This means we have to use a load
+ // even for GVs that are known to be local to the dso.
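The initLibcallLoweringInfo hook earlier in this hunk follows a common table-driven pattern: the Op-to-Impl mapping is kept declarative in a local array and applied with a single loop, which keeps the VFP-specific overrides easy to audit. A stripped-down model of that pattern (Libcall, LibcallImpl and LoweringInfo here are stand-ins, not LLVM's types):

#include <cstdio>
#include <map>

enum class Libcall { ADD_F32, SUB_F32 };
enum class LibcallImpl { addsf3vfp, subsf3vfp };

struct LoweringInfo {
  std::map<Libcall, LibcallImpl> Impls;
  void setLibcallImpl(Libcall Op, LibcallImpl Impl) { Impls[Op] = Impl; }
};

int main() {
  // Declarative table, mirroring the shape of the LibraryCalls array above.
  static const struct {
    Libcall Op;
    LibcallImpl Impl;
  } LibraryCalls[] = {
      {Libcall::ADD_F32, LibcallImpl::addsf3vfp},
      {Libcall::SUB_F32, LibcallImpl::subsf3vfp},
  };

  LoweringInfo Info;
  for (const auto &LC : LibraryCalls) // one loop applies the whole table
    Info.setLibcallImpl(LC.Op, LC.Impl);
  std::printf("registered %zu libcall overrides\n", Info.Impls.size());
  return 0;
}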
+ if (getTargetTriple().isOSBinFormatMachO() && isPositionIndependent() && + (GV->isDeclarationForLinker() || GV->hasCommonLinkage())) + return true; + + return false; + } + yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override; yaml::MachineFunctionInfo * convertFuncInfoToYAML(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 24f58a6..88a7fb1 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -66,6 +66,11 @@ extern cl::opt<bool> EnableMaskedGatherScatters; extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor; +static cl::opt<int> ArmForceUnrollThreshold( + "arm-force-unroll-threshold", cl::init(12), cl::Hidden, + cl::desc( + "Threshold for forced unrolling of small loops in Arm architecture")); + /// Convert a vector load intrinsic into a simple llvm load instruction. /// This is beneficial when the underlying object being addressed comes /// from a constant, since we get constant-folding for free. @@ -1125,7 +1130,8 @@ bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const { } bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment, - unsigned /*AddressSpace*/) const { + unsigned /*AddressSpace*/, + TTI::MaskKind /*MaskKind*/) const { if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps()) return false; @@ -1631,20 +1637,36 @@ InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, +ARMTTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const { + switch (MICA.getID()) { + case Intrinsic::masked_scatter: + case Intrinsic::masked_gather: + return getGatherScatterOpCost(MICA, CostKind); + case Intrinsic::masked_load: + case Intrinsic::masked_store: + return getMaskedMemoryOpCost(MICA, CostKind); + } + return BaseT::getMemIntrinsicInstrCost(MICA, CostKind); +} + +InstructionCost +ARMTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned IID = MICA.getID(); + Type *Src = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); if (ST->hasMVEIntegerOps()) { - if (Opcode == Instruction::Load && + if (IID == Intrinsic::masked_load && isLegalMaskedLoad(Src, Alignment, AddressSpace)) return ST->getMVEVectorCostFactor(CostKind); - if (Opcode == Instruction::Store && + if (IID == Intrinsic::masked_store && isLegalMaskedStore(Src, Alignment, AddressSpace)) return ST->getMVEVectorCostFactor(CostKind); } if (!isa<FixedVectorType>(Src)) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMemIntrinsicInstrCost(MICA, CostKind); // Scalar cost, which is currently very high due to the efficiency of the // generated code. 
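The new getMemIntrinsicInstrCost entry point above is a thin dispatcher: known memory intrinsics are routed to the specialized helpers, and everything else falls through to the base implementation. A compact model of that dispatch shape (the IID enum and the cost numbers are invented):

#include <cstdio>

enum class IID { masked_load, masked_store, masked_gather, masked_scatter, other };

static int maskedMemoryOpCost() { return 2; }  // cheap when the target handles it
static int gatherScatterOpCost() { return 8; } // modelled as lane-by-lane work
static int baseCost() { return 1; }

static int memIntrinsicCost(IID ID) {
  switch (ID) {
  case IID::masked_gather:
  case IID::masked_scatter:
    return gatherScatterOpCost();
  case IID::masked_load:
  case IID::masked_store:
    return maskedMemoryOpCost();
  default:
    break; // unknown IDs defer to the base implementation
  }
  return baseCost();
}

int main() {
  std::printf("%d\n", memIntrinsicCost(IID::masked_gather)); // prints 8
  return 0;
}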
return cast<FixedVectorType>(Src)->getNumElements() * 8; @@ -1691,13 +1713,19 @@ InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost( UseMaskForCond, UseMaskForGaps); } -InstructionCost ARMTTIImpl::getGatherScatterOpCost( - unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, - Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const { +InstructionCost +ARMTTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const { + + Type *DataTy = MICA.getDataType(); + const Value *Ptr = MICA.getPointer(); + bool VariableMask = MICA.getVariableMask(); + Align Alignment = MICA.getAlignment(); + const Instruction *I = MICA.getInst(); + using namespace PatternMatch; if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters) - return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment, CostKind, I); + return BaseT::getMemIntrinsicInstrCost(MICA, CostKind); assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!"); auto *VTy = cast<FixedVectorType>(DataTy); @@ -2728,7 +2756,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, // Force unrolling small loops can be very useful because of the branch // taken cost of the backedge. - if (Cost < 12) + if (Cost < ArmForceUnrollThreshold) UP.Force = true; } diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 0810c55..a232563 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -186,12 +186,16 @@ public: bool isProfitableLSRChainElement(Instruction *I) const override; - bool isLegalMaskedLoad(Type *DataTy, Align Alignment, - unsigned AddressSpace) const override; - - bool isLegalMaskedStore(Type *DataTy, Align Alignment, - unsigned AddressSpace) const override { - return isLegalMaskedLoad(DataTy, Alignment, AddressSpace); + bool + isLegalMaskedLoad(Type *DataTy, Align Alignment, unsigned AddressSpace, + TTI::MaskKind MaskKind = + TTI::MaskKind::VariableOrConstantMask) const override; + + bool + isLegalMaskedStore(Type *DataTy, Align Alignment, unsigned AddressSpace, + TTI::MaskKind MaskKind = + TTI::MaskKind::VariableOrConstantMask) const override { + return isLegalMaskedLoad(DataTy, Alignment, AddressSpace, MaskKind); } bool forceScalarizeMaskedGather(VectorType *VTy, @@ -275,20 +279,19 @@ public: const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, - TTI::TargetCostKind CostKind) const override; + getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const override; + + InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const; InstructionCost getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond = false, bool UseMaskForGaps = false) const override; - InstructionCost - getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, - bool VariableMask, Align Alignment, - TTI::TargetCostKind CostKind, - const Instruction *I = nullptr) const override; + InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const; InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, diff --git 
a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
index fa778ca..d99368e 100644
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -6,8 +6,7 @@ tablegen(LLVM ARMGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM ARMGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM ARMGenCallingConv.inc -gen-callingconv)
 tablegen(LLVM ARMGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM ARMGenDisassemblerTables.inc -gen-disassembler
- -ignore-non-decodable-operands)
+tablegen(LLVM ARMGenDisassemblerTables.inc -gen-disassembler)
 tablegen(LLVM ARMGenFastISel.inc -gen-fast-isel)
 tablegen(LLVM ARMGenGlobalISel.inc -gen-global-isel)
 tablegen(LLVM ARMGenInstrInfo.inc -gen-instr-info)
@@ -15,6 +14,7 @@ tablegen(LLVM ARMGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM ARMGenMCPseudoLowering.inc -gen-pseudo-lowering)
 tablegen(LLVM ARMGenRegisterBank.inc -gen-register-bank)
 tablegen(LLVM ARMGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM ARMGenSDNodeInfo.inc -gen-sd-node-info)
 tablegen(LLVM ARMGenSubtargetInfo.inc -gen-subtarget)
 tablegen(LLVM ARMGenSystemRegister.inc -gen-searchable-tables)
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index b119146..44f50dd 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -149,7 +149,7 @@ private:
 raw_ostream &CStream) const;
 bool isVectorPredicable(const MCInst &MI) const;
- DecodeStatus AddThumbPredicate(MCInst&) const;
+ DecodeStatus checkThumbPredicate(MCInst &) const;
 void UpdateThumbPredicate(DecodeStatus &S, MCInst &MI) const;
 llvm::endianness InstructionEndianness;
@@ -618,6 +618,23 @@ static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
 return S;
 }
+// This overload is used to decode a `pred` operand that is not encoded in the
+// instruction. This is the case for almost all predicable Thumb instructions
+// (exceptions are tBcc and t2Bcc). Some predicable Thumb instructions have ARM
+// equivalents that are not predicable (always executed). This function is
+// used to decode the `pred` operand of those ARM instructions, too.
+static DecodeStatus DecodePredicateOperand(MCInst &Inst,
+ const MCDisassembler *Decoder) {
+ const auto *D = static_cast<const ARMDisassembler *>(Decoder);
+ unsigned CC = ARMCC::AL;
+ if (D->getSubtargetInfo().hasFeature(ARM::ModeThumb))
+ CC = D->ITBlock.getITCC();
+ MCRegister CondReg = CC == ARMCC::AL ?
ARM::NoRegister : ARM::CPSR; + Inst.addOperand(MCOperand::createImm(CC)); + Inst.addOperand(MCOperand::createReg(CondReg)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { @@ -1050,6 +1067,40 @@ static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn, if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) return MCDisassembler::Fail; break; + case ARM::t2LDC2L_OFFSET: + case ARM::t2LDC2L_OPTION: + case ARM::t2LDC2L_POST: + case ARM::t2LDC2L_PRE: + case ARM::t2LDC2_OFFSET: + case ARM::t2LDC2_OPTION: + case ARM::t2LDC2_POST: + case ARM::t2LDC2_PRE: + case ARM::t2LDCL_OFFSET: + case ARM::t2LDCL_OPTION: + case ARM::t2LDCL_POST: + case ARM::t2LDCL_PRE: + case ARM::t2LDC_OFFSET: + case ARM::t2LDC_OPTION: + case ARM::t2LDC_POST: + case ARM::t2LDC_PRE: + case ARM::t2STC2L_OFFSET: + case ARM::t2STC2L_OPTION: + case ARM::t2STC2L_POST: + case ARM::t2STC2L_PRE: + case ARM::t2STC2_OFFSET: + case ARM::t2STC2_OPTION: + case ARM::t2STC2_POST: + case ARM::t2STC2_PRE: + case ARM::t2STCL_OFFSET: + case ARM::t2STCL_OPTION: + case ARM::t2STCL_POST: + case ARM::t2STCL_PRE: + case ARM::t2STC_OFFSET: + case ARM::t2STC_OPTION: + case ARM::t2STC_POST: + case ARM::t2STC_PRE: + DecodePredicateOperand(Inst, Decoder); + break; default: break; } @@ -1217,6 +1268,8 @@ static DecodeStatus DecodeTSBInstruction(MCInst &Inst, unsigned Insn, // the only available operand), but LLVM expects the instruction to have one // operand, so we need to add the csync when decoding. Inst.addOperand(MCOperand::createImm(ARM_TSB::CSYNC)); + if (Inst.getOpcode() == ARM::t2TSB) + DecodePredicateOperand(Inst, Decoder); return MCDisassembler::Success; } @@ -1650,6 +1703,7 @@ static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, if(imm > 4) return MCDisassembler::Fail; Inst.setOpcode(ARM::t2HINT); Inst.addOperand(MCOperand::createImm(imm)); + DecodePredicateOperand(Inst, Decoder); } return S; @@ -1675,6 +1729,7 @@ DecodeT2HintSpaceInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, Inst.setOpcode(Opcode); if (Opcode == ARM::t2HINT) { Inst.addOperand(MCOperand::createImm(imm)); + DecodePredicateOperand(Inst, Decoder); } return MCDisassembler::Success; @@ -1702,6 +1757,7 @@ static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn, if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder)) Inst.addOperand(MCOperand::createImm(imm)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -1906,6 +1962,7 @@ static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn, true, 4, Inst, Decoder)) Inst.addOperand(MCOperand::createImm(imm32)); + DecodePredicateOperand(Inst, Decoder); return Status; } @@ -2231,6 +2288,7 @@ static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn, break; } + DecodePredicateOperand(Inst, Decoder); return S; } @@ -2502,6 +2560,7 @@ static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn, break; } + DecodePredicateOperand(Inst, Decoder); return S; } @@ -2605,6 +2664,7 @@ static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn, !Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); return S; } @@ -2654,6 +2714,7 @@ static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Insn, return MCDisassembler::Fail; } + DecodePredicateOperand(Inst, Decoder); return S; } @@ -2690,6 +2751,7 @@ static DecodeStatus 
DecodeVLD3DupInstruction(MCInst &Inst, unsigned Insn, return MCDisassembler::Fail; } + DecodePredicateOperand(Inst, Decoder); return S; } @@ -2743,6 +2805,7 @@ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn, return MCDisassembler::Fail; } + DecodePredicateOperand(Inst, Decoder); return S; } @@ -2789,6 +2852,7 @@ static DecodeStatus DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Insn, break; } + DecodePredicateOperand(Inst, Decoder); return S; } @@ -2861,6 +2925,7 @@ static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Insn, return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(8 << size)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -2926,6 +2991,7 @@ static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn, if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); return S; } @@ -2951,6 +3017,7 @@ static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn, } Inst.addOperand(MCOperand::createImm(imm)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -3113,6 +3180,7 @@ static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn, } Inst.addOperand(MCOperand::createImm(imm)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -3197,6 +3265,7 @@ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn, if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); return S; } @@ -3341,6 +3410,7 @@ static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn, if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); return S; } @@ -3449,6 +3519,7 @@ static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn, if (!Check(S, DecodeT2AddrModeImm12(Inst, imm, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); return S; } @@ -3488,6 +3559,7 @@ static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); return S; } @@ -3678,6 +3750,7 @@ static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn, if (!Check(S, DecodeT2AddrModeImm8(Inst, addr, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); return S; } @@ -3690,6 +3763,7 @@ static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Insn, Inst.addOperand(MCOperand::createReg(ARM::SP)); Inst.addOperand(MCOperand::createImm(imm)); + DecodePredicateOperand(Inst, Decoder); return MCDisassembler::Success; } @@ -3716,6 +3790,7 @@ static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn, return MCDisassembler::Fail; } + DecodePredicateOperand(Inst, Decoder); return S; } @@ -3840,6 +3915,7 @@ static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Insn, return MCDisassembler::Fail; if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); return S; } @@ -4305,6 +4381,7 @@ static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(index)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -4370,6 +4447,7 @@ static DecodeStatus 
DecodeVST1LN(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(index)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -4437,6 +4515,7 @@ static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(index)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -4500,6 +4579,7 @@ static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(index)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -4570,6 +4650,7 @@ static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(index)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -4633,6 +4714,7 @@ static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(index)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -4714,6 +4796,7 @@ static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(index)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -4786,6 +4869,7 @@ static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(index)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -4904,6 +4988,7 @@ static DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn, if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); return S; } @@ -4939,6 +5024,7 @@ static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn, if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); return S; } @@ -4965,6 +5051,7 @@ static DecodeStatus DecodeT2Adr(MCInst &Inst, uint32_t Insn, uint64_t Address, Val = -Val; } Inst.addOperand(MCOperand::createImm(Val)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -5062,6 +5149,7 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(64 - imm)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -5121,6 +5209,7 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(64 - imm)); + DecodePredicateOperand(Inst, Decoder); return S; } @@ -5326,8 +5415,10 @@ static DecodeStatus DecodeLOLoop(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; - if (Inst.getOpcode() == ARM::MVE_LCTP) + if (Inst.getOpcode() == ARM::MVE_LCTP) { + DecodePredicateOperand(Inst, Decoder); return S; + } unsigned Imm = fieldFromInstruction(Insn, 11, 1) | fieldFromInstruction(Insn, 1, 10) << 1; @@ -5372,6 +5463,7 @@ static DecodeStatus DecodeLOLoop(MCInst &Inst, unsigned Insn, uint64_t Address, Check(S, MCDisassembler::SoftFail); // an SBZ bit is wrong: soft fail Inst.setOpcode(ARM::MVE_LCTP); + DecodePredicateOperand(Inst, Decoder); } else { Inst.addOperand(MCOperand::createReg(ARM::LR)); if (!Check(S, DecoderGPRRegisterClass(Inst, @@ -5762,6 +5854,7 @@ static DecodeStatus 
DecodeMVEVMOVQtoDReg(MCInst &Inst, unsigned Insn, if (!Check(S, DecodeMVEPairVectorIndexOperand<0>(Inst, index, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); return S; } @@ -5788,6 +5881,7 @@ static DecodeStatus DecodeMVEVMOVDRegtoQ(MCInst &Inst, unsigned Insn, if (!Check(S, DecodeMVEPairVectorIndexOperand<0>(Inst, index, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); return S; } @@ -5833,6 +5927,8 @@ DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address, if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); + if (fieldFromInstruction (Insn, 6, 3) != 4) return MCDisassembler::SoftFail; @@ -5868,6 +5964,7 @@ DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address, Inst.addOperand(MCOperand::createImm(Saturate)); } + DecodePredicateOperand(Inst, Decoder); return S; } @@ -5971,10 +6068,12 @@ static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn, if (TypeT3) { Inst.setOpcode(sign1 ? ARM::t2SUBspImm12 : ARM::t2ADDspImm12); Inst.addOperand(MCOperand::createImm(Imm12)); // zext imm12 + DecodePredicateOperand(Inst, Decoder); } else { Inst.setOpcode(sign1 ? ARM::t2SUBspImm : ARM::t2ADDspImm); if (!Check(DS, DecodeT2SOImm(Inst, Imm12, Address, Decoder))) // imm12 return MCDisassembler::Fail; + DecodePredicateOperand(Inst, Decoder); if (!Check(DS, DecodeCCOutOperand(Inst, S, Address, Decoder))) // cc_out return MCDisassembler::Fail; } @@ -5994,7 +6093,7 @@ static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn, if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) return MCDisassembler::Fail; // An optional predicate, '$p' in the assembly. - DecodePredicateOperand(Inst, ARMCC::AL, Address, Decoder); + DecodePredicateOperand(Inst, Decoder); // An immediate that represents a floating point registers list. '$regs' in // the assembly. Inst.addOperand(MCOperand::createImm(0)); // Arbitrary value, has no effect. @@ -6115,28 +6214,17 @@ DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size, return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result); } - struct DecodeTable { - const uint8_t *P; - bool DecodePred; - }; - - const DecodeTable Tables[] = { - {DecoderTableVFP32, false}, {DecoderTableVFPV832, false}, - {DecoderTableNEONData32, true}, {DecoderTableNEONLoadStore32, true}, - {DecoderTableNEONDup32, false}, {DecoderTablev8NEON32, false}, - {DecoderTablev8Crypto32, false}, + const uint8_t *Tables[] = { + DecoderTableVFP32, DecoderTableVFPV832, + DecoderTableNEONData32, DecoderTableNEONLoadStore32, + DecoderTableNEONDup32, DecoderTablev8NEON32, + DecoderTablev8Crypto32, }; - for (auto Table : Tables) { - Result = decodeInstruction(Table.P, MI, Insn, Address, this, STI); + for (const uint8_t *Table : Tables) { + Result = decodeInstruction(Table, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; - // Add a fake predicate operand, because we share these instruction - // definitions with Thumb2 where these instructions are predicable. 
- if (Table.DecodePred && MCII->get(MI.getOpcode()).isPredicable()) { - MI.addOperand(MCOperand::createImm(ARMCC::AL)); - MI.addOperand(MCOperand::createReg(ARM::NoRegister)); - } return Result; } } @@ -6161,18 +6249,16 @@ bool ARMDisassembler::isVectorPredicable(const MCInst &MI) const { return false; } -// Most Thumb instructions don't have explicit predicates in the -// encoding, but rather get their predicates from IT context. We need -// to fix up the predicate operands using this context information as a -// post-pass. +// Most Thumb instructions don't have explicit predicates in the encoding, +// but rather get their predicates from IT context. Here, we check that the +// decoded instruction is allowed to have the decoded predicate and advance +// IT/VPT block states. MCDisassembler::DecodeStatus -ARMDisassembler::AddThumbPredicate(MCInst &MI) const { +ARMDisassembler::checkThumbPredicate(MCInst &MI) const { MCDisassembler::DecodeStatus S = Success; const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits(); - // A few instructions actually have predicates encoded in them. Don't - // try to overwrite it if we're seeing one of those. switch (MI.getOpcode()) { case ARM::tBcc: case ARM::t2Bcc: @@ -6218,34 +6304,10 @@ ARMDisassembler::AddThumbPredicate(MCInst &MI) const { (isVectorPredicable(MI) && ITBlock.instrInITBlock())) S = SoftFail; - // If we're in an IT block, base the predicate on that. Otherwise, - // assume a predicate of AL. - unsigned CC = ARMCC::AL; - if (ITBlock.instrInITBlock()) { - CC = ITBlock.getITCC(); + if (ITBlock.instrInITBlock()) ITBlock.advanceITState(); - } else if (VPTBlock.instrInVPTBlock()) { + else if (VPTBlock.instrInVPTBlock()) VPTBlock.advanceVPTState(); - } - - const MCInstrDesc &MCID = MCII->get(MI.getOpcode()); - - MCInst::iterator CCI = MI.begin(); - for (unsigned i = 0; i < MCID.NumOperands; ++i, ++CCI) { - if (MCID.operands()[i].isPredicate() || CCI == MI.end()) - break; - } - - if (MCID.isPredicable()) { - CCI = MI.insert(CCI, MCOperand::createImm(CC)); - ++CCI; - if (CC == ARMCC::AL) - MI.insert(CCI, MCOperand::createReg(ARM::NoRegister)); - else - MI.insert(CCI, MCOperand::createReg(ARM::CPSR)); - } else if (CC != ARMCC::AL) { - Check(S, SoftFail); - } return S; } @@ -6307,7 +6369,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, decodeInstruction(DecoderTableThumb16, MI, Insn16, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 2; - Check(Result, AddThumbPredicate(MI)); + Check(Result, checkThumbPredicate(MI)); return Result; } @@ -6315,7 +6377,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, STI); if (Result) { Size = 2; - Check(Result, AddThumbPredicate(MI)); + Check(Result, checkThumbPredicate(MI)); return Result; } @@ -6329,7 +6391,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock()) Result = MCDisassembler::SoftFail; - Check(Result, AddThumbPredicate(MI)); + Check(Result, checkThumbPredicate(MI)); // If we find an IT instruction, we need to parse its condition // code and mask operands so that we can apply them correctly @@ -6367,7 +6429,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, if (isVPTOpcode(MI.getOpcode()) && VPTBlock.instrInVPTBlock()) Result = MCDisassembler::SoftFail; - Check(Result, AddThumbPredicate(MI)); + Check(Result, checkThumbPredicate(MI)); if (isVPTOpcode(MI.getOpcode())) { unsigned Mask = 
MI.getOperand(0).getImm(); @@ -6381,7 +6443,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; - Check(Result, AddThumbPredicate(MI)); + Check(Result, checkThumbPredicate(MI)); return Result; } @@ -6389,7 +6451,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; - Check(Result, AddThumbPredicate(MI)); + Check(Result, checkThumbPredicate(MI)); return checkDecodedInstruction(MI, Size, Address, CS, Insn32, Result); } @@ -6428,7 +6490,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; - Check(Result, AddThumbPredicate(MI)); + Check(Result, checkThumbPredicate(MI)); return Result; } } @@ -6442,7 +6504,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; - Check(Result, AddThumbPredicate(MI)); + Check(Result, checkThumbPredicate(MI)); return Result; } @@ -6475,7 +6537,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, decodeInstruction(DecoderTable, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; - Check(Result, AddThumbPredicate(MI)); + Check(Result, checkThumbPredicate(MI)); return Result; } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 01fe13b..cc21844 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -428,7 +428,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, // signed 16bit range. if ((Kind == ARM::fixup_arm_movw_lo16 || Kind == ARM::fixup_arm_movt_hi16 || Kind == ARM::fixup_t2_movw_lo16 || Kind == ARM::fixup_t2_movt_hi16) && - (Addend < minIntN(16) || Addend > maxIntN(16))) { + !IsResolved && (Addend < minIntN(16) || Addend > maxIntN(16))) { Ctx.reportError(Fixup.getLoc(), "Relocation Not In Range"); return 0; } @@ -1238,7 +1238,7 @@ uint64_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( // Verify standard frame (lr/r7) was used. 
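The repeated Check(Result, checkThumbPredicate(MI)) calls above fold a secondary status into the primary one without losing a soft failure: the status values are chosen so that a bitwise AND always keeps the weaker of the two. A small model of that accumulation (the values mirror the usual Fail < SoftFail < Success encoding; this is an illustration, not the LLVM helper itself):

#include <cstdio>

enum DecodeStatus { Fail = 0, SoftFail = 1, Success = 3 };

// Keep the weaker of the two statuses; report whether decoding may continue.
static bool check(DecodeStatus &Out, DecodeStatus In) {
  Out = static_cast<DecodeStatus>(Out & In);
  return Out != Fail;
}

int main() {
  DecodeStatus Result = Success;
  check(Result, SoftFail);                       // Success & SoftFail -> SoftFail
  std::printf("%d\n", static_cast<int>(Result)); // prints 1
  return 0;
}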
if (CFARegister != ARM::R7) { DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "frame register is " - << CFARegister + << CFARegister.id() << " instead of r7\n"); return CU::UNWIND_ARM_MODE_DWARF; } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index ca366ed..060d1f8 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -54,7 +54,7 @@ void ARMWinCOFFStreamer::emitWindowsUnwindTables() { } void ARMWinCOFFStreamer::finishImpl() { - emitFrames(nullptr); + emitFrames(); emitWindowsUnwindTables(); MCWinCOFFStreamer::finishImpl(); diff --git a/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/llvm/lib/Target/ARM/MLxExpansionPass.cpp index 8e1bf1d..eb237b4 100644 --- a/llvm/lib/Target/ARM/MLxExpansionPass.cpp +++ b/llvm/lib/Target/ARM/MLxExpansionPass.cpp @@ -283,7 +283,7 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, const MCInstrDesc &MCID1 = TII->get(MulOpc); const MCInstrDesc &MCID2 = TII->get(AddSubOpc); - Register TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI)); + Register TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0)); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) .addReg(Src1Reg, getKillRegState(Src1Kill)) diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 4b8c2fd..01f588f 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) {} + : ARMBaseInstrInfo(STI, RI), RI(STI) {} /// Return the noop instruction to use for a noop. MCInst Thumb1InstrInfo::getNop() const { @@ -116,7 +116,6 @@ void Thumb1InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { assert((RC == &ARM::tGPRRegClass || @@ -142,10 +141,12 @@ void Thumb1InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } } -void Thumb1InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void Thumb1InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) || (DestReg.isPhysical() && isARMLowRegister(DestReg))) && "Unknown regclass!"); diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/llvm/lib/Target/ARM/Thumb1InstrInfo.h index 68b326c..289a30a 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -35,7 +35,7 @@ public: /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). 
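The constructor change above (ARMBaseInstrInfo(STI, RI)) hands the base class a reference to a member that has not been constructed yet, because base subobjects are initialized before members. That is well-defined only as long as the base merely binds the reference and does not read through it during construction. A hypothetical mini version of the idiom (stub types, not the real classes):

#include <cstdio>

struct ThumbRegisterInfoStub {
  int NumRegs = 16;
};

struct BaseInstrInfo {
  const ThumbRegisterInfoStub &BaseRI;
  // Binding only: dereferencing RI here would read an unconstructed object.
  explicit BaseInstrInfo(const ThumbRegisterInfoStub &RI) : BaseRI(RI) {}
};

struct Thumb1InstrInfoStub : BaseInstrInfo {
  ThumbRegisterInfoStub RI;
  Thumb1InstrInfoStub() : BaseInstrInfo(RI) {} // pass the member to the base
};

int main() {
  Thumb1InstrInfoStub TII;
  std::printf("%d\n", TII.BaseRI.NumRegs); // safe once construction is done
  return 0;
}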
/// - const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const { return RI; } void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, @@ -43,14 +43,13 @@ public: bool RenamableSrc = false) const override; void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool canCopyGluedNodeDuringSchedule(SDNode *N) const override; diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index f5653d4..efb92c9 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -46,7 +46,7 @@ PreferNoCSEL("prefer-no-csel", cl::Hidden, cl::init(false)); Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) {} + : ARMBaseInstrInfo(STI, RI), RI(STI) {} /// Return the noop instruction to use for a noop. MCInst Thumb2InstrInfo::getNop() const { @@ -165,7 +165,6 @@ void Thumb2InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -197,20 +196,22 @@ void Thumb2InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8)); - AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); - AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill)); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0); MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL)); return; } - ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI, + ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, Register()); } -void Thumb2InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void Thumb2InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineMemOperand *MMO = MF.getMachineMemOperand( @@ -238,8 +239,8 @@ void Thumb2InstrInfo::loadRegFromStackSlot( } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8)); - AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); - AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead); + AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead); MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL)); if 
(DestReg.isPhysical())
@@ -247,8 +248,7 @@
 return;
 }
- ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI,
- Register());
+ ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, Register());
 }
 void Thumb2InstrInfo::expandLoadStackGuard(
@@ -564,7 +564,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
 bool isSub = false;
 MachineFunction &MF = *MI.getParent()->getParent();
- const TargetRegisterClass *RegClass = TII.getRegClass(Desc, FrameRegIdx, TRI);
+ const TargetRegisterClass *RegClass = TII.getRegClass(Desc, FrameRegIdx);
 // Memory operands in inline assembly always use AddrModeT2_i12.
 if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR)
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index 1b0bf2d..1e11cb3 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -44,21 +44,20 @@ public:
 void storeRegToStackSlot(
 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
- bool isKill, int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+ bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
 MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 void loadRegFromStackSlot(
 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
 int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+ Register VReg,
 MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
 /// such, whenever a client has an instance of instruction info, it should
 /// always be able to get register info as well (through this method).
 ///
- const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
+ const ThumbRegisterInfo &getRegisterInfo() const { return RI; }
 MachineInstr *optimizeSelect(MachineInstr &MI,
 SmallPtrSetImpl<MachineInstr *> &SeenMIs,
diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
index 12875c2..85e705d 100644
--- a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -66,8 +66,8 @@ static void emitThumb1LoadConstPool(MachineBasicBlock &MBB,
 const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
 const TargetInstrInfo &TII = *STI.getInstrInfo();
 MachineConstantPool *ConstantPool = MF.getConstantPool();
- const Constant *C = ConstantInt::get(
- Type::getInt32Ty(MBB.getParent()->getFunction().getContext()), Val);
+ const Constant *C = ConstantInt::getSigned(
+ Type::getInt32Ty(MBB.getParent()->getFunction().getContext()), Val);
 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4));
 BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci))
@@ -85,8 +85,8 @@ static void emitThumb2LoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
 MachineFunction &MF = *MBB.getParent();
 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 MachineConstantPool *ConstantPool = MF.getConstantPool();
- const Constant *C = ConstantInt::get(
- Type::getInt32Ty(MBB.getParent()->getFunction().getContext()), Val);
+ const Constant *C = ConstantInt::getSigned(
+ Type::getInt32Ty(MBB.getParent()->getFunction().getContext()), Val);
 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4));
 BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci))
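The final hunks switch emitThumb1LoadConstPool and emitThumb2LoadConstPool from ConstantInt::get to ConstantInt::getSigned. The distinction matters when Val is negative: the constant must be built by sign-extending the value into i32, rather than by treating its 64-bit image as an out-of-range unsigned number. A plain-C++ illustration of the underlying fits-check (an interpretation of the change, not LLVM code):

#include <cstdint>
#include <cstdio>

// -4 is representable in a signed 32-bit constant, but its uint64_t image
// (0xFFFFFFFFFFFFFFFC) does not fit 32 unsigned bits, so the signed entry
// point is the correct way to build the constant.
static bool fitsUnsigned32(uint64_t V) { return V <= UINT32_MAX; }
static bool fitsSigned32(int64_t V) { return V >= INT32_MIN && V <= INT32_MAX; }

int main() {
  int64_t Val = -4;
  std::printf("unsigned fit: %d, signed fit: %d\n",
              (int)fitsUnsigned32(static_cast<uint64_t>(Val)),
              (int)fitsSigned32(Val));
  return 0;
}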
