26 files changed, 285 insertions, 94 deletions
diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h
index 83350e6..9e81a4b 100644
--- a/llvm/include/llvm/ADT/BitVector.h
+++ b/llvm/include/llvm/ADT/BitVector.h
@@ -570,10 +570,7 @@ public:
   template <class F, class... ArgTys>
   static BitVector &apply(F &&f, BitVector &Out, BitVector const &Arg,
                           ArgTys const &...Args) {
-    assert(llvm::all_of(
-               std::initializer_list<unsigned>{Args.size()...},
-               [&Arg](auto const &BV) { return Arg.size() == BV; }) &&
-           "consistent sizes");
+    assert(((Arg.size() == Args.size()) && ...) && "consistent sizes");
     Out.resize(Arg.size());
     for (size_type I = 0, E = Arg.Bits.size(); I != E; ++I)
       Out.Bits[I] = f(Arg.Bits[I], Args.Bits[I]...);
diff --git a/llvm/include/llvm/ADT/ConcurrentHashtable.h b/llvm/include/llvm/ADT/ConcurrentHashtable.h
index 6de194d..6a943c5 100644
--- a/llvm/include/llvm/ADT/ConcurrentHashtable.h
+++ b/llvm/include/llvm/ADT/ConcurrentHashtable.h
@@ -253,9 +253,8 @@ public:
 
     OS << "\nOverall number of entries = " << OverallNumberOfEntries;
     OS << "\nOverall number of non empty buckets = " << NumberOfNonEmptyBuckets;
-    for (auto &BucketSize : BucketSizesMap)
-      OS << "\n Number of buckets with size " << BucketSize.first << ": "
-         << BucketSize.second;
+    for (auto [Size, Count] : BucketSizesMap)
+      OS << "\n Number of buckets with size " << Size << ": " << Count;
 
     std::stringstream stream;
     stream << std::fixed << std::setprecision(2)
diff --git a/llvm/include/llvm/ADT/DirectedGraph.h b/llvm/include/llvm/ADT/DirectedGraph.h
index 83c0bea..fb6b180 100644
--- a/llvm/include/llvm/ADT/DirectedGraph.h
+++ b/llvm/include/llvm/ADT/DirectedGraph.h
@@ -181,16 +181,6 @@ public:
 
   DirectedGraph() = default;
   explicit DirectedGraph(NodeType &N) : Nodes() { addNode(N); }
-  DirectedGraph(const DGraphType &G) : Nodes(G.Nodes) {}
-  DirectedGraph(DGraphType &&RHS) : Nodes(std::move(RHS.Nodes)) {}
-  DGraphType &operator=(const DGraphType &G) {
-    Nodes = G.Nodes;
-    return *this;
-  }
-  DGraphType &operator=(const DGraphType &&G) {
-    Nodes = std::move(G.Nodes);
-    return *this;
-  }
 
   const_iterator begin() const { return Nodes.begin(); }
   const_iterator end() const { return Nodes.end(); }
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index b7c3015..ed43f19 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -210,6 +210,13 @@ public:
   const_iterator end() const {
     return const_iterator(this, getNumSections(), 0);
   }
+
+  using VocabMap = std::map<std::string, Embedding>;
+  /// Parse a vocabulary section from JSON and populate the target vocabulary
+  /// map.
+  static Error parseVocabSection(StringRef Key,
+                                 const json::Value &ParsedVocabValue,
+                                 VocabMap &TargetVocab, unsigned &Dim);
 };
 
 /// Class for storing and accessing the IR2Vec vocabulary.
@@ -600,8 +607,6 @@ class IR2VecVocabAnalysis : public AnalysisInfoMixin<IR2VecVocabAnalysis> {
 
   Error readVocabulary(VocabMap &OpcVocab, VocabMap &TypeVocab,
                        VocabMap &ArgVocab);
-  Error parseVocabSection(StringRef Key, const json::Value &ParsedVocabValue,
-                          VocabMap &TargetVocab, unsigned &Dim);
   void generateVocabStorage(VocabMap &OpcVocab, VocabMap &TypeVocab,
                             VocabMap &ArgVocab);
   void emitError(Error Err, LLVMContext &Ctx);
diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
index be690a4..571caf9 100644
--- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h
+++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
@@ -59,14 +59,6 @@ LLVM_ABI std::string getAllocTypeAttributeString(AllocationType Type);
 /// True if the AllocTypes bitmask contains just a single type.
 LLVM_ABI bool hasSingleAllocType(uint8_t AllocTypes);
 
-/// Removes any existing "ambiguous" memprof attribute. Called before we apply a
-/// specific allocation type such as "cold", "notcold", or "hot".
-LLVM_ABI void removeAnyExistingAmbiguousAttribute(CallBase *CB);
-
-/// Adds an "ambiguous" memprof attribute to call with a matched allocation
-/// profile but that we haven't yet been able to disambiguate.
-LLVM_ABI void addAmbiguousAttribute(CallBase *CB);
-
 /// Class to build a trie of call stack contexts for a particular profiled
 /// allocation call, along with their associated allocation types.
 /// The allocation will be at the root of the trie, which is then used to
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 15ff129..af218ba 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -613,6 +613,12 @@ LLVM_ABI bool isValidAssumeForContext(const Instruction *I,
                                       const DominatorTree *DT = nullptr,
                                       bool AllowEphemerals = false);
 
+/// Returns true, if no instruction between \p Assume and \p CtxI may free
+/// memory and the function is marked as NoSync. The latter ensures the current
+/// function cannot arrange for another thread to free on its behalf.
+LLVM_ABI bool willNotFreeBetween(const Instruction *Assume,
+                                 const Instruction *CtxI);
+
 enum class OverflowResult {
   /// Always overflows in the direction of signed/unsigned min value.
   AlwaysOverflowsLow,
diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index c7304e3..e80c138 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -378,6 +378,8 @@ struct ScalarEnumerationTraits<TargetStackID::Value> {
     IO.enumCase(ID, "default", TargetStackID::Default);
     IO.enumCase(ID, "sgpr-spill", TargetStackID::SGPRSpill);
     IO.enumCase(ID, "scalable-vector", TargetStackID::ScalableVector);
+    IO.enumCase(ID, "scalable-predicate-vector",
+                TargetStackID::ScalablePredicateVector);
     IO.enumCase(ID, "wasm-local", TargetStackID::WasmLocal);
     IO.enumCase(ID, "noalloc", TargetStackID::NoAlloc);
   }
diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index 00c7343..50ce931 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -497,7 +497,18 @@ public:
   /// Should this stack ID be considered in MaxAlignment.
   bool contributesToMaxAlignment(uint8_t StackID) {
     return StackID == TargetStackID::Default ||
-           StackID == TargetStackID::ScalableVector;
+           StackID == TargetStackID::ScalableVector ||
+           StackID == TargetStackID::ScalablePredicateVector;
+  }
+
+  bool hasScalableStackID(int ObjectIdx) const {
+    uint8_t StackID = getStackID(ObjectIdx);
+    return isScalableStackID(StackID);
+  }
+
+  bool isScalableStackID(uint8_t StackID) const {
+    return StackID == TargetStackID::ScalableVector ||
+           StackID == TargetStackID::ScalablePredicateVector;
   }
 
   /// setObjectAlignment - Change the alignment of the specified stack object.
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index d9d6f0b..62c0806 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1959,6 +1959,10 @@ public:
   LLVM_ABI SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
                                                 SDValue NewMemOp);
 
+  /// Get all the nodes in their topological order without modifying any states.
+  LLVM_ABI void getTopologicallyOrderedNodes(
+      SmallVectorImpl<const SDNode *> &SortedNodes) const;
+
   /// Topological-sort the AllNodes list and a
   /// assign a unique node id for each node in the DAG based on their
   /// topological order. Returns the number of nodes.
@@ -2009,7 +2013,9 @@ public:
   /// function mirrors \c llvm::salvageDebugInfo.
   LLVM_ABI void salvageDebugInfo(SDNode &N);
 
-  LLVM_ABI void dump() const;
+  /// Dump the textual format of this DAG. Print nodes in sorted orders if \p
+  /// Sorted is true.
+  LLVM_ABI void dump(bool Sorted = false) const;
 
   /// In most cases this function returns the ABI alignment for a given type,
   /// except for illegal vector types where the alignment exceeds that of the
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index 0e29e45..75696faf 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -32,6 +32,7 @@ enum Value {
   SGPRSpill = 1,
   ScalableVector = 2,
   WasmLocal = 3,
+  ScalablePredicateVector = 4,
   NoAlloc = 255
 };
 }
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 7bbad17..88691b9 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4654,23 +4654,6 @@ public:
     return false;
   }
 
-  /// Allows the target to handle physreg-carried dependency
-  /// in target-specific way. Used from the ScheduleDAGSDNodes to decide whether
-  /// to add the edge to the dependency graph.
-  /// Def - input: Selection DAG node defininfg physical register
-  /// User - input: Selection DAG node using physical register
-  /// Op - input: Number of User operand
-  /// PhysReg - inout: set to the physical register if the edge is
-  /// necessary, unchanged otherwise
-  /// Cost - inout: physical register copy cost.
-  /// Returns 'true' is the edge is necessary, 'false' otherwise
-  virtual bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
-                                         const TargetRegisterInfo *TRI,
-                                         const TargetInstrInfo *TII,
-                                         MCRegister &PhysReg, int &Cost) const {
-    return false;
-  }
-
   /// Target-specific combining of register parts into its original value
   virtual SDValue
   joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL,
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index bf133f0..822245f 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -109,10 +109,15 @@ public:
     return MC->contains(Reg1.asMCReg(), Reg2.asMCReg());
   }
 
-  /// Return the cost of copying a value between two registers in this class.
-  /// A negative number means the register class is very expensive
-  /// to copy e.g. status flag register classes.
-  int getCopyCost() const { return MC->getCopyCost(); }
+  /// Return the cost of copying a value between two registers in this class. If
+  /// this is the maximum value, the register may be impossible to copy.
+  uint8_t getCopyCost() const { return MC->getCopyCost(); }
+
+  /// \return true if register class is very expensive to copy e.g. status flag
+  /// register classes.
+  bool expensiveOrImpossibleToCopy() const {
+    return MC->getCopyCost() == std::numeric_limits<uint8_t>::max();
+  }
 
   /// Return true if this register class may be used to create virtual
   /// registers.
diff --git a/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h b/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h
index 52af205..ffe0b50 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h
@@ -179,6 +179,7 @@ public:
 
 class DWARFDataExtractorSimple
     : public DWARFDataExtractorBase<DWARFDataExtractorSimple> {
+public:
   using DWARFDataExtractorBase::DWARFDataExtractorBase;
 
   LLVM_ABI uint64_t getRelocatedValueImpl(uint32_t Size, uint64_t *Off,
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 38f95a1..bba0d6e 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -1333,6 +1333,9 @@ def OMP_Tile : Directive<[Spelling<"tile">]> {
   let allowedOnceClauses = [
     VersionedClause<OMPC_Sizes, 51>,
   ];
+  let requiredClauses = [
+    VersionedClause<OMPC_Sizes, 51>,
+  ];
   let association = AS_Loop;
   let category = CA_Executable;
 }
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 0a11617..5331cb5 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -4001,15 +4001,17 @@ public:
 
   /// Keeps track of value of iteration variable for input/scan loop to be
   /// used for Scan directive lowering
-  llvm::Value *IV;
+  llvm::Value *IV = nullptr;
 
   /// Stores the span of canonical loop being lowered to be used for temporary
   /// buffer allocation or Finalization.
-  llvm::Value *Span;
+  llvm::Value *Span = nullptr;
 
   ScanInfo() {
     ScanBuffPtrs = new llvm::SmallDenseMap<llvm::Value *, llvm::Value *>();
   }
+  ScanInfo(ScanInfo &) = delete;
+  ScanInfo &operator=(const ScanInfo &) = delete;
 
   ~ScanInfo() { delete (ScanBuffPtrs); }
 };
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index eb0440f..0622bfa 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -810,6 +810,26 @@ public:
   /// Whether the intrinsic is signed or unsigned.
   bool isSigned() const { return isSigned(getIntrinsicID()); };
 
+  /// Whether the intrinsic is a smin or umin.
+  static bool isMin(Intrinsic::ID ID) {
+    switch (ID) {
+    case Intrinsic::umin:
+    case Intrinsic::smin:
+      return true;
+    case Intrinsic::umax:
+    case Intrinsic::smax:
+      return false;
+    default:
+      llvm_unreachable("Invalid intrinsic");
+    }
+  }
+
+  /// Whether the intrinsic is a smin or a umin.
+  bool isMin() const { return isMin(getIntrinsicID()); }
+
+  /// Whether the intrinsic is a smax or a umax.
+  bool isMax() const { return !isMin(getIntrinsicID()); }
+
   /// Min/max intrinsics are monotonic, they operate on a fixed-bitwidth values,
   /// so there is a certain threshold value, upon reaching which,
   /// their value can no longer change. Return said threshold.
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 7c9aef5..b0269ee 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -130,8 +130,6 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
   class AdvSIMD_1VectorArg_Expand_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
-  class AdvSIMD_1VectorArg_Long_Intrinsic
-    : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>], [IntrNoMem]>;
   class AdvSIMD_1IntArg_Narrow_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty], [IntrNoMem]>;
   class AdvSIMD_1VectorArg_Narrow_Intrinsic
@@ -150,9 +148,6 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
   class AdvSIMD_2VectorArg_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
                 [IntrNoMem]>;
-  class AdvSIMD_2VectorArg_Compare_Intrinsic
-    : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>],
-                [IntrNoMem]>;
   class AdvSIMD_2Arg_FloatCompare_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>],
                 [IntrNoMem]>;
@@ -160,10 +155,6 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                 [LLVMTruncatedType<0>, LLVMTruncatedType<0>],
                 [IntrNoMem]>;
-  class AdvSIMD_2VectorArg_Wide_Intrinsic
-    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                [LLVMMatchType<0>, LLVMTruncatedType<0>],
-                [IntrNoMem]>;
   class AdvSIMD_2VectorArg_Narrow_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                 [LLVMExtendedType<0>, LLVMExtendedType<0>],
@@ -171,11 +162,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
   class AdvSIMD_2Arg_Scalar_Narrow_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyint_ty],
                 [LLVMExtendedType<0>, llvm_i32_ty],
-                [IntrNoMem]>;
-  class AdvSIMD_2VectorArg_Scalar_Expand_BySize_Intrinsic
-    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                [llvm_anyvector_ty],
-                [IntrNoMem]>;
+                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   class AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                 [LLVMTruncatedType<0>],
@@ -184,10 +171,6 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                 [LLVMTruncatedType<0>, llvm_i32_ty],
                 [IntrNoMem]>;
-  class AdvSIMD_2VectorArg_Tied_Narrow_Intrinsic
-    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                [LLVMOneNthElementsVectorType<0, 2>, llvm_anyvector_ty],
-                [IntrNoMem]>;
   class AdvSIMD_2VectorArg_Lane_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyint_ty],
                 [LLVMMatchType<0>, llvm_anyint_ty, llvm_i32_ty],
@@ -204,21 +187,13 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
   class AdvSIMD_3VectorArg_Scalar_Intrinsic
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
-               [IntrNoMem]>;
-  class AdvSIMD_3VectorArg_Tied_Narrow_Intrinsic
-      : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-               [LLVMOneNthElementsVectorType<0, 2>, llvm_anyvector_ty,
-                LLVMMatchType<1>], [IntrNoMem]>;
-  class AdvSIMD_3VectorArg_Scalar_Tied_Narrow_Intrinsic
-    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                [LLVMOneNthElementsVectorType<0, 2>, llvm_anyvector_ty, llvm_i32_ty],
-                [IntrNoMem]>;
+               [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   class AdvSIMD_CvtFxToFP_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty],
-                [IntrNoMem]>;
+                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   class AdvSIMD_CvtFPToFx_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty],
-                [IntrNoMem]>;
+                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   class AdvSIMD_1Arg_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem]>;
@@ -238,11 +213,6 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                 [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
                 [IntrNoMem]>;
 
-  class AdvSIMD_FML_Intrinsic
-    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
-                [IntrNoMem]>;
-
   class AdvSIMD_BF16FML_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_v4f32_ty],
                 [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
@@ -251,7 +221,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
 
 // Arithmetic ops
 
-let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
+let TargetPrefix = "aarch64" in {
   // Vector Add Across Lanes
   def int_aarch64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
   def int_aarch64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 6168e24..2e31fe5 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2773,6 +2773,14 @@ m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
   return m_Intrinsic<Intrinsic::masked_load>(Op0, Op1, Op2, Op3);
 }
 
+/// Matches MaskedStore Intrinsic.
+template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty
+m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
+              const Opnd3 &Op3) {
+  return m_Intrinsic<Intrinsic::masked_store>(Op0, Op1, Op2, Op3);
+}
+
 /// Matches MaskedGather Intrinsic.
 template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3>
 inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty
diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h
index aad3792..e6fc707 100644
--- a/llvm/include/llvm/MC/MCRegisterInfo.h
+++ b/llvm/include/llvm/MC/MCRegisterInfo.h
@@ -45,7 +45,7 @@ public:
   const uint16_t RegSetSize;
   const uint16_t ID;
   const uint16_t RegSizeInBits;
-  const int8_t CopyCost;
+  const uint8_t CopyCost;
   const bool Allocatable;
   const bool BaseClass;
 
@@ -94,7 +94,7 @@ public:
   /// getCopyCost - Return the cost of copying a value between two registers in
   /// this class. A negative number means the register class is very expensive
   /// to copy e.g. status flag register classes.
-  int getCopyCost() const { return CopyCost; }
+  uint8_t getCopyCost() const { return CopyCost; }
 
   /// isAllocatable - Return true if this register class may be used to create
   /// virtual registers.
diff --git a/llvm/include/llvm/Support/Jobserver.h b/llvm/include/llvm/Support/Jobserver.h
new file mode 100644
index 0000000..6bee3b5
--- /dev/null
+++ b/llvm/include/llvm/Support/Jobserver.h
@@ -0,0 +1,162 @@
+//===- llvm/Support/Jobserver.h - Jobserver Client --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a client for the GNU Make jobserver protocol. This allows
+// LLVM tools to coordinate parallel execution with a parent `make` process.
+//
+// The jobserver protocol is a mechanism for GNU Make to share its pool of
+// available "job slots" with the subprocesses it invokes. This is particularly
+// useful for tools that can perform parallel operations themselves (e.g., a
+// multi-threaded linker or compiler). By participating in this protocol, a
+// tool can ensure the total number of concurrent jobs does not exceed the
+// limit specified by the user (e.g., `make -j8`).
+//
+// How it works:
+//
+// 1. Establishment:
+//    A child process discovers the jobserver by inspecting the `MAKEFLAGS`
+//    environment variable. If a jobserver is active, this variable will
+//    contain a `--jobserver-auth=<value>` argument. The format of `<value>`
+//    determines how to communicate with the server.
+//
+// 2. The Implicit Slot:
+//    Every command invoked by `make` is granted one "implicit" job slot. This
+//    means a tool can always perform at least one unit of work without needing
+//    to communicate with the jobserver. This implicit slot should NEVER be
+//    released back to the jobserver.
+//
+// 3. Acquiring and Releasing Slots:
+//    On POSIX systems, the jobserver is implemented as a pipe. The
+//    `--jobserver-auth` value specifies either a path to a named pipe
+//    (`fifo:PATH`) or a pair of file descriptors (`R,W`). The pipe is
+//    pre-loaded with single-character tokens, one for each available job slot.
+//
+//    - To acquire an additional slot, a client reads a single-character token
+//      from the pipe.
+//    - To release a slot, the client must write the *exact same* character
+//      token back to the pipe.
+//
+//    It is critical that a client releases all acquired slots before it exits,
+//    even in cases of error, to avoid deadlocking the build.
+//
+// Example:
+//    A multi-threaded linker invoked by `make -j8` wants to use multiple
+//    threads. It first checks for the jobserver. It knows it has one implicit
+//    slot, so it can use one thread. It then tries to acquire 7 more slots by
+//    reading 7 tokens from the jobserver pipe. If it only receives 3 tokens,
+//    it knows it can use a total of 1 (implicit) + 3 (acquired) = 4 threads.
+//    Before exiting, it must write the 3 tokens it read back to the pipe.
+//
+// For more context, see:
+//   - GNU Make manual on job slots:
+//     https://www.gnu.org/software/make/manual/html_node/Job-Slots.html
+//   - LLVM RFC discussion on jobserver support:
+//     https://discourse.llvm.org/t/rfc-adding-gnu-make-jobserver-
+//     support-to-llvm-for-coordinated-parallelism/87034
+//   - Ninja’s jobserver support PR:
+//     https://github.com/ninja-build/ninja/pull/2506
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_JOBSERVER_H
+#define LLVM_SUPPORT_JOBSERVER_H
+
+#include "llvm/ADT/StringRef.h"
+#include <memory>
+#include <string>
+
+namespace llvm {
+
+/// A JobSlot represents a single job slot that can be acquired from or released
+/// to a jobserver pool. This class is move-only.
+class JobSlot {
+public:
+  /// Default constructor creates an invalid instance.
+  JobSlot() = default;
+
+  // Move operations are allowed.
+  JobSlot(JobSlot &&Other) noexcept : Value(Other.Value) {
+    Other.Value = kInvalidValue;
+  }
+  JobSlot &operator=(JobSlot &&Other) noexcept {
+    if (this != &Other) {
+      this->Value = Other.Value;
+      Other.Value = kInvalidValue;
+    }
+    return *this;
+  }
+
+  // Copy operations are disallowed.
+  JobSlot(const JobSlot &) = delete;
+  JobSlot &operator=(const JobSlot &) = delete;
+
+  /// Returns true if this instance is valid (either implicit or explicit).
+  bool isValid() const { return Value >= 0; }
+
+  /// Returns true if this instance represents the implicit job slot.
+  bool isImplicit() const { return Value == kImplicitValue; }
+
+  static JobSlot createExplicit(uint8_t V) {
+    return JobSlot(static_cast<int16_t>(V));
+  }
+
+  static JobSlot createImplicit() { return JobSlot(kImplicitValue); }
+
+  uint8_t getExplicitValue() const;
+  bool isExplicit() const { return isValid() && !isImplicit(); }
+
+private:
+  friend class JobserverClient;
+  friend class JobserverClientImpl;
+
+  JobSlot(int16_t V) : Value(V) {}
+
+  /// The jobserver pipe carries explicit tokens (bytes 0–255). We reserve two
+  /// sentinels in Value for special cases:
+  ///   kInvalidValue  (-1): no slot held
+  ///   kImplicitValue (INT16_MAX): implicit slot granted at startup (no pipe
+  ///   I/O)
+  ///
+  /// We use int16_t so Value can store 0–255 explicit tokens and
+  /// sentinels without overflow, enforces fixed 16-bit width, and avoids
+  /// unsigned/signed mix-ups.
+  static constexpr int16_t kInvalidValue = -1;
+  static constexpr int16_t kImplicitValue = INT16_MAX;
+  int16_t Value = kInvalidValue;
+};
+
+/// The public interface for a jobserver client.
+/// This client is a lazy-initialized singleton that is created on first use.
+class JobserverClient {
+public:
+  virtual ~JobserverClient();
+
+  /// Tries to acquire a job slot from the pool. On failure (e.g., if the pool
+  /// is empty), this returns an invalid JobSlot instance. The first successful
+  /// call will always return the implicit slot.
+  virtual JobSlot tryAcquire() = 0;
+
+  /// Releases a job slot back to the pool.
+  virtual void release(JobSlot Slot) = 0;
+
+  /// Returns the number of job slots available, as determined on first use.
+  /// This value is cached. Returns 0 if no jobserver is active.
+  virtual unsigned getNumJobs() const = 0;
+
+  /// Returns the singleton instance of the JobserverClient.
+  /// The instance is created on the first call to this function.
+  /// Returns a nullptr if no jobserver is configured or an error occurs.
+  static JobserverClient *getInstance();
+
+  /// Resets the singleton instance. For testing purposes only.
+  static void resetForTesting();
+};
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_JOBSERVER_H
diff --git a/llvm/include/llvm/Support/ThreadPool.h b/llvm/include/llvm/Support/ThreadPool.h
index c26681c..c20efc7 100644
--- a/llvm/include/llvm/Support/ThreadPool.h
+++ b/llvm/include/llvm/Support/ThreadPool.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/Jobserver.h"
 #include "llvm/Support/RWMutex.h"
 #include "llvm/Support/Threading.h"
 #include "llvm/Support/thread.h"
@@ -180,6 +181,7 @@ private:
   void grow(int requested);
 
   void processTasks(ThreadPoolTaskGroup *WaitingForGroup);
+  void processTasksWithJobserver();
 
   /// Threads in flight
   std::vector<llvm::thread> Threads;
@@ -208,6 +210,8 @@ private:
 
   /// Maximum number of threads to potentially grow this pool to.
   const unsigned MaxThreadCount;
+
+  JobserverClient *TheJobserver = nullptr;
 };
 #endif // LLVM_ENABLE_THREADS
 
diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h
index d3fe0a5..8884680 100644
--- a/llvm/include/llvm/Support/Threading.h
+++ b/llvm/include/llvm/Support/Threading.h
@@ -142,6 +142,11 @@ constexpr bool llvm_is_multithreaded() { return LLVM_ENABLE_THREADS; }
     /// the thread shall remain on the actual CPU socket.
     LLVM_ABI std::optional<unsigned>
     compute_cpu_socket(unsigned ThreadPoolNum) const;
+
+    /// If true, the thread pool will attempt to coordinate with a GNU Make
+    /// jobserver, acquiring a job slot before processing a task. If no
+    /// jobserver is found in the environment, this is ignored.
+    bool UseJobserver = false;
   };
 
   /// Build a strategy from a number of threads as a string provided in \p Num.
@@ -210,6 +215,19 @@ constexpr bool llvm_is_multithreaded() { return LLVM_ENABLE_THREADS; }
     return S;
   }
 
+  /// Returns a thread strategy that attempts to coordinate with a GNU Make
+  /// jobserver. The number of active threads will be limited by the number of
+  /// available job slots. If no jobserver is detected in the environment, this
+  /// strategy falls back to the default hardware_concurrency() behavior.
+  inline ThreadPoolStrategy jobserver_concurrency() {
+    ThreadPoolStrategy S;
+    S.UseJobserver = true;
+    // We can still request all threads be created, as they will simply
+    // block waiting for a job slot if the jobserver is the limiting factor.
+    S.ThreadsRequested = 0; // 0 means 'use all available'
+    return S;
+  }
+
   /// Return the current thread id, as used in various OS system calls.
   /// Note that not all platforms guarantee that the value returned will be
   /// unique across the entire system, so portable code should not assume
diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
index 1e07fbe..faaff4a 100644
--- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
+++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
@@ -18,8 +18,7 @@
 
 #include "llvm/Support/DataTypes.h"
 
-namespace llvm {
-namespace X86Disassembler {
+namespace llvm::X86Disassembler {
 
 #define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers
 #define CONTEXTS_SYM x86DisassemblerContexts
@@ -541,7 +540,6 @@ static const unsigned X86_MAX_OPERANDS = 6;
 /// respectively.
 enum DisassemblerMode { MODE_16BIT, MODE_32BIT, MODE_64BIT };
 
-} // namespace X86Disassembler
-} // namespace llvm
+} // namespace llvm::X86Disassembler
 
 #endif
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index faf7788..e3f995d 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -126,7 +126,7 @@ def G_FRAME_INDEX : GenericInstruction {
 }
 
 def G_GLOBAL_VALUE : GenericInstruction {
-  let OutOperandList = (outs type0:$dst);
+  let OutOperandList = (outs ptype0:$dst);
   let InOperandList = (ins unknown:$src);
   let hasSideEffects = false;
 }
diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td
index f55bff1..d6a4059 100644
--- a/llvm/include/llvm/Target/TargetSchedule.td
+++ b/llvm/include/llvm/Target/TargetSchedule.td
@@ -377,6 +377,12 @@ class MCSchedPredicate<MCInstPredicate P> : SchedPredicateBase {
   SchedMachineModel SchedModel = ?;
 }
 
+// A scheduling predicate whose logic depends on a SubtargetFeature.
+class FeatureSchedPredicate<SubtargetFeature SF> : SchedPredicateBase {
+  SubtargetFeature Feature = SF;
+  SchedMachineModel SchedModel = ?;
+}
+
 // Define a predicate to determine which SchedVariant applies to a
 // particular MachineInstr. The code snippet is used as an
 // if-statement's expression. Available variables are MI, SchedModel,
diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index 2454149..74a4d6c 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -56,6 +56,7 @@ class OptimizationRemarkEmitter;
 class PHINode;
 class TargetLibraryInfo;
 class Value;
+class IntrinsicInst;
 /// A private "module" namespace for types and utilities used by GVN. These
 /// are implementation details and should not be used by clients.
 namespace LLVM_LIBRARY_VISIBILITY_NAMESPACE gvn {
@@ -349,6 +350,7 @@ private:
 
   // Helper functions of redundant load elimination.
   bool processLoad(LoadInst *L);
+  bool processMaskedLoad(IntrinsicInst *I);
   bool processNonLocalLoad(LoadInst *L);
   bool processAssumeIntrinsic(AssumeInst *II);