Diffstat (limited to 'llvm/lib')
46 files changed, 539 insertions, 237 deletions
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 11d8294..e45d1f7 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -1587,6 +1587,15 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, return nullptr; } +/// Returns \p A * \p B if it guaranteed not to signed wrap. Otherwise returns +/// nullptr. \p A and \p B must have the same integer type. +static const SCEV *mulSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, + ScalarEvolution &SE) { + if (SE.willNotOverflow(Instruction::Mul, /*Signed=*/true, A, B)) + return SE.getMulExpr(A, B); + return nullptr; +} + /// Returns the absolute value of \p A. In the context of dependence analysis, /// we need an absolute value in a mathematical sense. If \p A is the signed /// minimum value, we cannot represent it unless extending the original type. @@ -1686,7 +1695,11 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, assert(0 < Level && Level <= CommonLevels && "level out of range"); Level--; - const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst); + const SCEV *Delta = minusSCEVNoSignedOverflow(SrcConst, DstConst, *SE); + if (!Delta) { + Result.Consistent = false; + return false; + } LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta); LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n"); @@ -1702,7 +1715,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, const SCEV *AbsCoeff = absSCEVNoSignedOverflow(Coeff, *SE); if (!AbsDelta || !AbsCoeff) return false; - const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff); + const SCEV *Product = mulSCEVNoSignedOverflow(UpperBound, AbsCoeff, *SE); + if (!Product) + return false; return isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product); }(); if (IsDeltaLarge) { diff --git a/llvm/lib/Analysis/RegionPrinter.cpp b/llvm/lib/Analysis/RegionPrinter.cpp index a83af4e..33e073b 100644 --- a/llvm/lib/Analysis/RegionPrinter.cpp +++ b/llvm/lib/Analysis/RegionPrinter.cpp @@ -29,10 +29,9 @@ onlySimpleRegions("only-simple-regions", cl::Hidden, cl::init(false)); -namespace llvm { - -std::string DOTGraphTraits<RegionNode *>::getNodeLabel(RegionNode *Node, - RegionNode *Graph) { +std::string +llvm::DOTGraphTraits<RegionNode *>::getNodeLabel(RegionNode *Node, + RegionNode *Graph) { if (!Node->isSubRegion()) { BasicBlock *BB = Node->getNodeAs<BasicBlock>(); @@ -46,7 +45,8 @@ std::string DOTGraphTraits<RegionNode *>::getNodeLabel(RegionNode *Node, } template <> -struct DOTGraphTraits<RegionInfo *> : public DOTGraphTraits<RegionNode *> { +struct llvm::DOTGraphTraits<RegionInfo *> + : public llvm::DOTGraphTraits<RegionNode *> { DOTGraphTraits (bool isSimple = false) : DOTGraphTraits<RegionNode*>(isSimple) {} @@ -125,7 +125,6 @@ struct DOTGraphTraits<RegionInfo *> : public DOTGraphTraits<RegionNode *> { printRegionCluster(*G->getTopLevelRegion(), GW, 4); } }; -} // end namespace llvm namespace { diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 713277d..3aa245b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2087,6 +2087,17 @@ void AsmPrinter::emitFunctionBody() { // This is only used to influence register allocation behavior, no // actual initialization is needed. break; + case TargetOpcode::RELOC_NONE: { + // Generate a temporary label for the current PC. 
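// The sequence below emits, schematically (assembly shape
// illustrative, with <sym> taken from the RELOC_NONE operand):
//   .Lreloc_none0:
//   .reloc .Lreloc_none0, BFD_RELOC_NONE, <sym>
// i.e. a zero-size relocation at the current PC that records a
// reference to <sym> without emitting any instruction bytes.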
+ MCSymbol *Sym = OutContext.createTempSymbol("reloc_none"); + OutStreamer->emitLabel(Sym); + const MCExpr *Dot = MCSymbolRefExpr::create(Sym, OutContext); + const MCExpr *Value = MCSymbolRefExpr::create( + OutContext.getOrCreateSymbol(MI.getOperand(0).getSymbolName()), + OutContext); + OutStreamer->emitRelocDirective(*Dot, "BFD_RELOC_NONE", Value, SMLoc()); + break; + } default: emitInstruction(&MI); diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index b3c3125..7be7468 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -292,7 +292,8 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg, LLVMContext &Ctx = OrigArg.Ty->getContext(); SmallVector<EVT, 4> SplitVTs; - ComputeValueVTs(*TLI, DL, OrigArg.Ty, SplitVTs, Offsets, 0); + ComputeValueVTs(*TLI, DL, OrigArg.Ty, SplitVTs, /*MemVTs=*/nullptr, Offsets, + 0); if (SplitVTs.size() == 0) return; @@ -996,7 +997,7 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy, SmallVector<EVT, 4> SplitVTs; SmallVector<uint64_t, 4> Offsets; - ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, &Offsets, 0); + ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, /*MemVTs=*/nullptr, &Offsets, 0); assert(VRegs.size() == SplitVTs.size()); @@ -1028,7 +1029,7 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy, SmallVector<EVT, 4> SplitVTs; SmallVector<uint64_t, 4> Offsets; - ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, &Offsets, 0); + ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, /*MemVTs=*/nullptr, &Offsets, 0); assert(VRegs.size() == SplitVTs.size()); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index be1b51f..4f6a19f 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2686,6 +2686,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::experimental_convergence_entry: case Intrinsic::experimental_convergence_loop: return translateConvergenceControlIntrinsic(CI, ID, MIRBuilder); + case Intrinsic::reloc_none: { + Metadata *MD = cast<MetadataAsValue>(CI.getArgOperand(0))->getMetadata(); + StringRef SymbolName = cast<MDString>(MD)->getString(); + MIRBuilder.buildInstr(TargetOpcode::RELOC_NONE) + .addExternalSymbol(SymbolName.data()); + return true; + } } return false; } diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 4795d81..434a579 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) { MemOperands.push_back(MemOp); if (Token.isNewlineOrEOF()) break; + if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace)) + break; if (Token.isNot(MIToken::comma)) return error("expected ',' before the next machine memory operand"); lex(); diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 8ad9245..37e5c51 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -1547,10 +1547,14 @@ bool MachineInstr::mayAlias(BatchAAResults *AA, const MachineInstr &Other, // Check each pair of memory operands from both instructions, which can't // alias only if all pairs won't alias. 
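// The early `continue` added in the loop below rests on a standard
// dependence fact: two reads never need ordering, so an alias query is
// only warranted when at least one operand of the pair writes. A
// standalone sketch of that predicate (helper name illustrative):
static bool pairNeedsAliasCheck(const llvm::MachineMemOperand *A,
                                const llvm::MachineMemOperand *B) {
  // Store/store, store/load, and load/store pairs can conflict;
  // load/load pairs cannot.
  return A->isStore() || B->isStore();
}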
- for (auto *MMOa : memoperands()) - for (auto *MMOb : Other.memoperands()) + for (auto *MMOa : memoperands()) { + for (auto *MMOb : Other.memoperands()) { + if (!MMOa->isStore() && !MMOb->isStore()) + continue; if (MemOperandsHaveAlias(MFI, AA, UseTBAA, MMOa, MMOb)) return true; + } + } return false; } diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index f4c1a8b..fa654f2 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -143,6 +143,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSet<Register, 8> KilledUseSet; SmallSet<Register, 8> UndefUseSet; SmallVector<std::pair<Register, Register>> TiedOperands; + SmallVector<MachineInstr *> MemMIs; for (auto MII = FirstMI; MII != LastMI; ++MII) { // Debug instructions have no effects to track. if (MII->isDebugInstr()) @@ -206,6 +207,9 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, MIB.setMIFlag(MachineInstr::FrameSetup); if (MII->getFlag(MachineInstr::FrameDestroy)) MIB.setMIFlag(MachineInstr::FrameDestroy); + + if (MII->mayLoadOrStore()) + MemMIs.push_back(&*MII); } for (Register Reg : LocalDefs) { @@ -231,6 +235,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, assert(UseIdx < ExternUses.size()); MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx); } + + MIB->cloneMergedMemRefs(MF, MemMIs); } /// finalizeBundle - Same functionality as the previous finalizeBundle except diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index e9ffa85..6b747f3 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -196,8 +196,6 @@ public: bool run(); }; -constexpr Align SafeStack::StackAlignment; - uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType()); if (AI->isArrayAllocation()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 9961c98..2f598b2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4758,7 +4758,7 @@ void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) { SmallVector<uint64_t, 4> Offsets; const Value *SrcV = I.getOperand(0); ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), - SrcV->getType(), ValueVTs, &Offsets, 0); + SrcV->getType(), ValueVTs, /*MemVTs=*/nullptr, &Offsets, 0); assert(ValueVTs.size() == 1 && Offsets[0] == 0 && "expect a single EVT for swifterror"); @@ -4794,7 +4794,7 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { SmallVector<EVT, 4> ValueVTs; SmallVector<uint64_t, 4> Offsets; ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Ty, - ValueVTs, &Offsets, 0); + ValueVTs, /*MemVTs=*/nullptr, &Offsets, 0); assert(ValueVTs.size() == 1 && Offsets[0] == 0 && "expect a single EVT for swifterror"); @@ -7811,6 +7811,17 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } + case Intrinsic::reloc_none: { + Metadata *MD = cast<MetadataAsValue>(I.getArgOperand(0))->getMetadata(); + StringRef SymbolName = cast<MDString>(MD)->getString(); + SDValue Ops[2] = { + getRoot(), + DAG.getTargetExternalSymbol( + SymbolName.data(), TLI.getProgramPointerTy(DAG.getDataLayout()))}; + DAG.setRoot(DAG.getNode(ISD::RELOC_NONE, sdl, MVT::Other, Ops)); + return; + } + case Intrinsic::eh_exceptionpointer: case Intrinsic::eh_exceptioncode: { // Get the exception 
pointer vreg, copy from it, and resize it to fit. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 77377d3..d3e1628 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -472,6 +472,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::LIFETIME_END: return "lifetime.end"; case ISD::FAKE_USE: return "fake_use"; + case ISD::RELOC_NONE: + return "reloc_none"; case ISD::PSEUDO_PROBE: return "pseudoprobe"; case ISD::GC_TRANSITION_START: return "gc_transition.start"; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 6c11c5b..8bc5d2f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2550,6 +2550,11 @@ void SelectionDAGISel::Select_FAKE_USE(SDNode *N) { N->getOperand(1), N->getOperand(0)); } +void SelectionDAGISel::Select_RELOC_NONE(SDNode *N) { + CurDAG->SelectNodeTo(N, TargetOpcode::RELOC_NONE, N->getValueType(0), + N->getOperand(1), N->getOperand(0)); +} + void SelectionDAGISel::Select_FREEZE(SDNode *N) { // TODO: We don't have FREEZE pseudo-instruction in MachineInstr-level now. // If FREEZE instruction is added later, the code below must be changed as @@ -3325,6 +3330,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, case ISD::FAKE_USE: Select_FAKE_USE(NodeToMatch); return; + case ISD::RELOC_NONE: + Select_RELOC_NONE(NodeToMatch); + return; case ISD::FREEZE: Select_FREEZE(NodeToMatch); return; diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 414e414..b99e1c7 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1665,6 +1665,17 @@ void TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI, // by SubRegB is compatible with RegA with no subregister. So regardless of // whether the dest oper writes a subreg, the source oper should not. MO.setSubReg(0); + + // Update uses of RegB to uses of RegA inside the bundle. + if (MI->isBundle()) { + for (MachineOperand &MO : mi_bundle_ops(*MI)) { + if (MO.isReg() && MO.getReg() == RegB) { + assert(MO.getSubReg() == 0 && SubRegB == 0 && + "tied subregister uses in bundled instructions not supported"); + MO.setReg(RegA); + } + } + } } if (AllUsesCopied) { diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index fafc325..a98e925 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -962,16 +962,29 @@ DIType *DIDerivedType::getClassType() const { assert(getTag() == dwarf::DW_TAG_ptr_to_member_type); return cast_or_null<DIType>(getExtraData()); } + +// Helper function to extract ConstantAsMetadata from ExtraData, +// handling extra data MDTuple unwrapping if needed. 
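// For example, both encodings below resolve to the same constant once
// the optional one-element tuple is unwrapped (Ctx and CI are assumed
// to be an existing LLVMContext and ConstantInt):
//   Metadata *Direct  = ConstantAsMetadata::get(CI);
//   Metadata *Wrapped = MDTuple::get(Ctx, {ConstantAsMetadata::get(CI)});
// The helper returns the ConstantAsMetadata in both cases and nullptr
// for tuples of any other arity.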
+static ConstantAsMetadata *extractConstantMetadata(Metadata *ExtraData) { + Metadata *ED = ExtraData; + if (auto *Tuple = dyn_cast_or_null<MDTuple>(ED)) { + if (Tuple->getNumOperands() != 1) + return nullptr; + ED = Tuple->getOperand(0); + } + return cast_or_null<ConstantAsMetadata>(ED); +} + uint32_t DIDerivedType::getVBPtrOffset() const { assert(getTag() == dwarf::DW_TAG_inheritance); - if (auto *CM = cast_or_null<ConstantAsMetadata>(getExtraData())) + if (auto *CM = extractConstantMetadata(getExtraData())) if (auto *CI = dyn_cast_or_null<ConstantInt>(CM->getValue())) return static_cast<uint32_t>(CI->getZExtValue()); return 0; } Constant *DIDerivedType::getStorageOffsetInBits() const { assert(getTag() == dwarf::DW_TAG_member && isBitField()); - if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData())) + if (auto *C = extractConstantMetadata(getExtraData())) return C->getValue(); return nullptr; } @@ -980,13 +993,13 @@ Constant *DIDerivedType::getConstant() const { assert((getTag() == dwarf::DW_TAG_member || getTag() == dwarf::DW_TAG_variable) && isStaticMember()); - if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData())) + if (auto *C = extractConstantMetadata(getExtraData())) return C->getValue(); return nullptr; } Constant *DIDerivedType::getDiscriminantValue() const { assert(getTag() == dwarf::DW_TAG_member && !isStaticMember()); - if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData())) + if (auto *C = extractConstantMetadata(getExtraData())) return C->getValue(); return nullptr; } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 24f90bf..f1e473a 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6013,6 +6013,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Check(cast<ConstantInt>(Call.getArgOperand(3))->getZExtValue() < 2, "cache type argument to llvm.prefetch must be 0-1", Call); break; + case Intrinsic::reloc_none: { + Check(isa<MDString>( + cast<MetadataAsValue>(Call.getArgOperand(0))->getMetadata()), + "llvm.reloc.none argument must be a metadata string", &Call); + break; + } case Intrinsic::stackprotector: Check(isa<AllocaInst>(Call.getArgOperand(1)->stripPointerCasts()), "llvm.stackprotector parameter #2 must resolve to an alloca.", Call); diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 0208735..5498787 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1690,7 +1690,7 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) { IndexedInstrProf::ProfVersion::CurrentVersion) return make_error<InstrProfError>(instrprof_error::unsupported_version); - static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version12, + static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version13, "Please update the reader as needed when a new field is added " "or when indexed profile version gets bumped."); @@ -1723,10 +1723,11 @@ size_t Header::size() const { // of the header, and byte offset of existing fields shouldn't change when // indexed profile version gets incremented. 
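// The switch that follows encodes the layout rule stated above: a
// version bump that adds no header fields shares the case of the last
// version whose layout it matches. Reduced sketch of the pattern:
//   switch (Version) {
//   case 13ull: // Version13 added no new header fields
//   case 12ull:
//     return 72;
//   // ... older layouts ...
//   }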
static_assert( - IndexedInstrProf::ProfVersion::CurrentVersion == Version12, + IndexedInstrProf::ProfVersion::CurrentVersion == Version13, "Please update the size computation below if a new field has " "been added to the header; for a version bump without new " "fields, add a case statement to fall through to the latest version."); + case 13ull: case 12ull: return 72; case 11ull: diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index a347351..0f15ca8 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -542,7 +542,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // The WritePrevVersion handling will either need to be removed or updated // if the version is advanced beyond 12. static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == - IndexedInstrProf::ProfVersion::Version12); + IndexedInstrProf::ProfVersion::Version13); if (static_cast<bool>(ProfileKind & InstrProfKind::IRInstrumentation)) Header.Version |= VARIANT_MASK_IR_PROF; if (static_cast<bool>(ProfileKind & InstrProfKind::ContextSensitive)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1b559a6..f5081a9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1248,7 +1248,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( SmallVector<EVT, 16> ValueVTs; SmallVector<uint64_t, 16> Offsets; - ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); + ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr, + &Offsets, ArgOffset); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9460145..6ce18ea 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3917,6 +3917,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, if (isLDSDMA(MIa) || isLDSDMA(MIb)) return false; + if (MIa.isBundle() || MIb.isBundle()) + return false; + // TODO: Should we check the address space from the MachineMemOperand? 
That // would allow us to distinguish objects we know don't alias based on the // underlying address space, even if it was lowered to a different one, diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index 8c7bc2f..81303fa 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -97,7 +97,6 @@ #define DEBUG_TYPE "bpf-abstract-member-access" namespace llvm { -constexpr StringRef BPFCoreSharedInfo::AmaAttr; uint32_t BPFCoreSharedInfo::SeqNum; Instruction *BPFCoreSharedInfo::insertPassThrough(Module *M, BasicBlock *BB, diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp index 6e5520c..3c61216 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -803,26 +803,6 @@ SDValue BPFTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { return getAddr(N, DAG); } -const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((BPFISD::NodeType)Opcode) { - case BPFISD::FIRST_NUMBER: - break; - case BPFISD::RET_GLUE: - return "BPFISD::RET_GLUE"; - case BPFISD::CALL: - return "BPFISD::CALL"; - case BPFISD::SELECT_CC: - return "BPFISD::SELECT_CC"; - case BPFISD::BR_CC: - return "BPFISD::BR_CC"; - case BPFISD::Wrapper: - return "BPFISD::Wrapper"; - case BPFISD::MEMCPY: - return "BPFISD::MEMCPY"; - } - return nullptr; -} - static SDValue getTargetNode(ConstantPoolSDNode *N, const SDLoc &DL, EVT Ty, SelectionDAG &DAG, unsigned Flags) { return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h index 5243d49..3d6e7c7 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.h +++ b/llvm/lib/Target/BPF/BPFISelLowering.h @@ -20,17 +20,6 @@ namespace llvm { class BPFSubtarget; -namespace BPFISD { -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - RET_GLUE, - CALL, - SELECT_CC, - BR_CC, - Wrapper, - MEMCPY -}; -} class BPFTargetLowering : public TargetLowering { public: @@ -39,9 +28,6 @@ public: // Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - // This method returns the name of a target specific DAG node. - const char *getTargetNodeName(unsigned Opcode) const override; - // This method decides whether folding a constant offset // with the given GlobalAddress is legal. 
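// (The BPFISD enum and getTargetNodeName removed above are not lost:
// they are now generated by the -gen-sd-node-info TableGen backend
// into BPFGenSDNodeInfo.inc, which SelectionDAGGenTargetInfo consumes
// to report node names; see the BPFSelectionDAGInfo changes below.)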
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 51c32b2..bdacf9c 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -41,14 +41,12 @@ def BPFcallseq_start: SDNode<"ISD::CALLSEQ_START", SDT_BPFCallSeqStart, [SDNPHasChain, SDNPOutGlue]>; def BPFcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_BPFCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue]>; +def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC, [SDNPHasChain]>; def BPFselectcc : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC>; def BPFWrapper : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>; def BPFmemcpy : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY, - [SDNPHasChain, SDNPInGlue, SDNPOutGlue, - SDNPMayStore, SDNPMayLoad]>; + [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def BPFIsLittleEndian : Predicate<"Subtarget->isLittleEndian()">; def BPFIsBigEndian : Predicate<"!Subtarget->isLittleEndian()">; def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">; diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp index d3b0c02..6a11ea6 100644 --- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp +++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp @@ -27,10 +27,6 @@ #define DEBUG_TYPE "bpf-preserve-di-type" -namespace llvm { -constexpr StringRef BPFCoreSharedInfo::TypeIdAttr; -} // namespace llvm - using namespace llvm; namespace { diff --git a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp index 3e29e6c..0e6d35d 100644 --- a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp +++ b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp @@ -10,12 +10,20 @@ // //===----------------------------------------------------------------------===// +#include "BPFSelectionDAGInfo.h" #include "BPFTargetMachine.h" #include "llvm/CodeGen/SelectionDAG.h" + +#define GET_SDNODE_DESC +#include "BPFGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "bpf-selectiondag-info" +BPFSelectionDAGInfo::BPFSelectionDAGInfo() + : SelectionDAGGenTargetInfo(BPFGenSDNodeInfo) {} + SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, @@ -31,11 +39,7 @@ SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy( if (StoresNumEstimate > getCommonMaxStoresPerMemFunc()) return SDValue(); - SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - - Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src, - DAG.getConstant(CopyLen, dl, MVT::i64), - DAG.getConstant(Alignment.value(), dl, MVT::i64)); - - return Dst.getValue(0); + return DAG.getNode(BPFISD::MEMCPY, dl, MVT::Other, Chain, Dst, Src, + DAG.getConstant(CopyLen, dl, MVT::i64), + DAG.getConstant(Alignment.value(), dl, MVT::i64)); } diff --git a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h index 79f05e5..7345d2d 100644 --- a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h +++ b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h @@ -15,10 +15,15 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "BPFGenSDNodeInfo.inc" + namespace llvm { -class BPFSelectionDAGInfo : public SelectionDAGTargetInfo { +class BPFSelectionDAGInfo : public SelectionDAGGenTargetInfo { public: + BPFSelectionDAGInfo(); + SDValue 
EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, @@ -27,9 +32,8 @@ public: MachinePointerInfo SrcPtrInfo) const override; unsigned getCommonMaxStoresPerMemFunc() const { return 128; } - }; -} +} // namespace llvm #endif diff --git a/llvm/lib/Target/BPF/CMakeLists.txt b/llvm/lib/Target/BPF/CMakeLists.txt index 3678f13..fa539a0 100644 --- a/llvm/lib/Target/BPF/CMakeLists.txt +++ b/llvm/lib/Target/BPF/CMakeLists.txt @@ -10,6 +10,7 @@ tablegen(LLVM BPFGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM BPFGenInstrInfo.inc -gen-instr-info) tablegen(LLVM BPFGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM BPFGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM BPFGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM BPFGenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM BPFGenGlobalISel.inc -gen-global-isel) tablegen(LLVM BPFGenRegisterBank.inc -gen-register-bank) diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp index d507d71..9f1616f 100644 --- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp +++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp @@ -304,40 +304,76 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { GEPOperator *GOp = cast<GEPOperator>(&GEPI); Value *PtrOperand = GOp->getPointerOperand(); Type *NewGEPType = GOp->getSourceElementType(); - bool NeedsTransform = false; // Unwrap GEP ConstantExprs to find the base operand and element type - while (auto *CE = dyn_cast<ConstantExpr>(PtrOperand)) { - if (auto *GEPCE = dyn_cast<GEPOperator>(CE)) { - GOp = GEPCE; - PtrOperand = GEPCE->getPointerOperand(); - NewGEPType = GEPCE->getSourceElementType(); - } else - break; + while (auto *GEPCE = dyn_cast_or_null<GEPOperator>( + dyn_cast<ConstantExpr>(PtrOperand))) { + GOp = GEPCE; + PtrOperand = GEPCE->getPointerOperand(); + NewGEPType = GEPCE->getSourceElementType(); } + Type *const OrigGEPType = NewGEPType; + Value *const OrigOperand = PtrOperand; + if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) { NewGEPType = NewGlobal->getValueType(); PtrOperand = NewGlobal; - NeedsTransform = true; } else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrOperand)) { Type *AllocatedType = Alloca->getAllocatedType(); if (isa<ArrayType>(AllocatedType) && - AllocatedType != GOp->getResultElementType()) { + AllocatedType != GOp->getResultElementType()) NewGEPType = AllocatedType; - NeedsTransform = true; + } else + return false; // Only GEPs into an alloca or global variable are considered + + // Defer changing i8 GEP types until dxil-flatten-arrays + if (OrigGEPType->isIntegerTy(8)) + NewGEPType = OrigGEPType; + + // If the original type is a "sub-type" of the new type, then ensure the gep + // correctly zero-indexes the extra dimensions to keep the offset calculation + // correct. + // Eg: + // i32, [4 x i32] and [8 x [4 x i32]] are sub-types of [8 x [4 x i32]], etc. + // + // So then: + // gep [4 x i32] %idx + // -> gep [8 x [4 x i32]], i32 0, i32 %idx + // gep i32 %idx + // -> gep [8 x [4 x i32]], i32 0, i32 0, i32 %idx + uint32_t MissingDims = 0; + Type *SubType = NewGEPType; + + // The new type will be in its array version; so match accordingly. 
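// Worked instance of the dimension walk below (types illustrative):
// with OrigGEPType = [4 x i32] and a replacement global of type
// [8 x [4 x i32]], one peel of the outer array reaches the match, so
// MissingDims == 1 and one leading zero index is prepended:
//   gep [4 x i32], ptr %g, i32 %i
//     -> gep [8 x [4 x i32]], ptr %g.scalarized, i32 0, i32 %i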
+ Type *const GEPArrType = equivalentArrayTypeFromVector(OrigGEPType); + + while (SubType != GEPArrType) { + MissingDims++; + + ArrayType *ArrType = dyn_cast<ArrayType>(SubType); + if (!ArrType) { + assert(SubType == GEPArrType && + "GEP uses an DXIL invalid sub-type of alloca/global variable"); + break; } + + SubType = ArrType->getElementType(); } + bool NeedsTransform = OrigOperand != PtrOperand || + OrigGEPType != NewGEPType || MissingDims != 0; + if (!NeedsTransform) return false; - // Keep scalar GEPs scalar; dxil-flatten-arrays will do flattening later - if (!isa<ArrayType>(GOp->getSourceElementType())) - NewGEPType = GOp->getSourceElementType(); - IRBuilder<> Builder(&GEPI); - SmallVector<Value *, MaxVecSize> Indices(GOp->indices()); + SmallVector<Value *, MaxVecSize> Indices; + + for (uint32_t I = 0; I < MissingDims; I++) + Indices.push_back(Builder.getInt32(0)); + llvm::append_range(Indices, GOp->indices()); + Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices, GOp->getName(), GOp->getNoWrapFlags()); diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index ebb7c26..e0d2dbd 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -197,6 +197,7 @@ static Value *expand16BitIsNormal(CallInst *Orig) { static bool isIntrinsicExpansion(Function &F) { switch (F.getIntrinsicID()) { + case Intrinsic::assume: case Intrinsic::abs: case Intrinsic::atan2: case Intrinsic::exp: @@ -988,6 +989,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { case Intrinsic::abs: Result = expandAbs(Orig); break; + case Intrinsic::assume: + Orig->eraseFromParent(); + return true; case Intrinsic::atan2: Result = expandAtan2Intrinsic(Orig); break; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 8720460..e46a393 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -904,8 +904,6 @@ public: case Intrinsic::dx_resource_casthandle: // NOTE: llvm.dbg.value is supported as is in DXIL. case Intrinsic::dbg_value: - // NOTE: llvm.assume is supported as is in DXIL. - case Intrinsic::assume: case Intrinsic::not_intrinsic: if (F.use_empty()) F.eraseFromParent(); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 47726d6..55bafde 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -4753,6 +4753,19 @@ bool HexagonInstrInfo::getBundleNoShuf(const MachineInstr &MIB) const { return (Operand.isImm() && (Operand.getImm() & memShufDisabledMask) != 0); } +bool HexagonInstrInfo::isQFPMul(const MachineInstr *MI) const { + return (MI->getOpcode() == Hexagon::V6_vmpy_qf16_hf || + MI->getOpcode() == Hexagon::V6_vmpy_qf16_mix_hf || + MI->getOpcode() == Hexagon::V6_vmpy_qf32_hf || + MI->getOpcode() == Hexagon::V6_vmpy_qf32_mix_hf || + MI->getOpcode() == Hexagon::V6_vmpy_qf32_sf || + MI->getOpcode() == Hexagon::V6_vmpy_qf16_mix_hf || + MI->getOpcode() == Hexagon::V6_vmpy_qf16 || + MI->getOpcode() == Hexagon::V6_vmpy_qf32_mix_hf || + MI->getOpcode() == Hexagon::V6_vmpy_qf32_qf16 || + MI->getOpcode() == Hexagon::V6_vmpy_qf32); +} + // Addressing mode relations. short HexagonInstrInfo::changeAddrMode_abs_io(short Opc) const { return Opc >= 0 ? 
Hexagon::changeAddrMode_abs_io(Opc) : Opc; diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index c17e527..48adf82 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -532,6 +532,7 @@ public: } MCInst getNop() const override; + bool isQFPMul(const MachineInstr *MF) const; }; /// \brief Create RegSubRegPair from a register MachineOperand diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp index f29a739..8801f69 100644 --- a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp @@ -58,7 +58,7 @@ // are PHI inst. // //===----------------------------------------------------------------------===// -#include <unordered_set> + #define HEXAGON_QFP_OPTIMIZER "QFP optimizer pass" #include "Hexagon.h" @@ -86,6 +86,9 @@ using namespace llvm; cl::opt<bool> DisableQFOptimizer("disable-qfp-opt", cl::init(false), cl::desc("Disable optimization of Qfloat operations.")); +cl::opt<bool> DisableQFOptForMul( + "disable-qfp-opt-mul", cl::init(true), + cl::desc("Disable optimization of Qfloat operations for multiply.")); namespace { const std::map<unsigned short, unsigned short> QFPInstMap{ @@ -101,11 +104,21 @@ const std::map<unsigned short, unsigned short> QFPInstMap{ {Hexagon::V6_vmpy_qf16_mix_hf, Hexagon::V6_vmpy_qf16}, {Hexagon::V6_vmpy_qf32_hf, Hexagon::V6_vmpy_qf32_mix_hf}, {Hexagon::V6_vmpy_qf32_mix_hf, Hexagon::V6_vmpy_qf32_qf16}, - {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}}; + {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}, + {Hexagon::V6_vilog2_sf, Hexagon::V6_vilog2_qf32}, + {Hexagon::V6_vilog2_hf, Hexagon::V6_vilog2_qf16}, + {Hexagon::V6_vabs_qf32_sf, Hexagon::V6_vabs_qf32_qf32}, + {Hexagon::V6_vabs_qf16_hf, Hexagon::V6_vabs_qf16_qf16}, + {Hexagon::V6_vneg_qf32_sf, Hexagon::V6_vneg_qf32_qf32}, + {Hexagon::V6_vneg_qf16_hf, Hexagon::V6_vneg_qf16_qf16}}; } // namespace -namespace { +namespace llvm { +FunctionPass *createHexagonQFPOptimizer(); +void initializeHexagonQFPOptimizerPass(PassRegistry &); +} // namespace llvm +namespace { struct HexagonQFPOptimizer : public MachineFunctionPass { public: static char ID; @@ -116,6 +129,10 @@ public: bool optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB); + bool optimizeQfpTwoOp(MachineInstr *MI, MachineBasicBlock *MBB); + + bool optimizeQfpOneOp(MachineInstr *MI, MachineBasicBlock *MBB); + StringRef getPassName() const override { return HEXAGON_QFP_OPTIMIZER; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -142,19 +159,69 @@ FunctionPass *llvm::createHexagonQFPOptimizer() { bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB) { - // Early exit: - // - if instruction is invalid or has too few operands (QFP ops need 2 sources - // + 1 dest), - // - or does not have a transformation mapping. 
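// Shape of the rewrite this pass performs, schematically (opcode names
// abbreviated; the exact pairs come from QFPInstMap):
//   %t = V6_vconv_sf_qf32 %a        ; qf32 -> sf conversion
//   %r = <sf arithmetic> %t, %b
// becomes
//   %r = <qf32 mix arithmetic> %a, %b
// letting the intermediate conversion be erased once it has no other
// users.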
- if (MI->getNumOperands() < 3) + if (MI->getNumOperands() == 2) + return optimizeQfpOneOp(MI, MBB); + else if (MI->getNumOperands() == 3) + return optimizeQfpTwoOp(MI, MBB); + else return false; +} + +bool HexagonQFPOptimizer::optimizeQfpOneOp(MachineInstr *MI, + MachineBasicBlock *MBB) { + + unsigned Op0F = 0; auto It = QFPInstMap.find(MI->getOpcode()); if (It == QFPInstMap.end()) return false; + unsigned short InstTy = It->second; + // Get the reachind defs of MI + MachineInstr *DefMI = MRI->getVRegDef(MI->getOperand(1).getReg()); + MachineOperand &Res = MI->getOperand(0); + if (!Res.isReg()) + return false; + + LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI->dump()); + MachineInstr *ReachDefDef = nullptr; + + // Get the reaching def of the reaching def to check for W reg def + if (DefMI->getNumOperands() > 1 && DefMI->getOperand(1).isReg() && + DefMI->getOperand(1).getReg().isVirtual()) + ReachDefDef = MRI->getVRegDef(DefMI->getOperand(1).getReg()); + unsigned ReachDefOp = DefMI->getOpcode(); + MachineInstrBuilder MIB; + + // Check if the reaching def is a conversion + if (ReachDefOp == Hexagon::V6_vconv_sf_qf32 || + ReachDefOp == Hexagon::V6_vconv_hf_qf16) { + + // Return if the reaching def of reaching def is W type + if (ReachDefDef && MRI->getRegClass(ReachDefDef->getOperand(0).getReg()) == + &Hexagon::HvxWRRegClass) + return false; + + // Analyze the use operands of the conversion to get their KILL status + MachineOperand &SrcOp = DefMI->getOperand(1); + Op0F = getKillRegState(SrcOp.isKill()); + SrcOp.setIsKill(false); + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(SrcOp.getReg(), Op0F, SrcOp.getSubReg()); + LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); + return true; + } + return false; +} + +bool HexagonQFPOptimizer::optimizeQfpTwoOp(MachineInstr *MI, + MachineBasicBlock *MBB) { unsigned Op0F = 0; unsigned Op1F = 0; + auto It = QFPInstMap.find(MI->getOpcode()); + if (It == QFPInstMap.end()) + return false; + unsigned short InstTy = It->second; // Get the reaching defs of MI, DefMI1 and DefMI2 MachineInstr *DefMI1 = nullptr; MachineInstr *DefMI2 = nullptr; @@ -167,6 +234,9 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, return false; MachineOperand &Res = MI->getOperand(0); + if (!Res.isReg()) + return false; + MachineInstr *Inst1 = nullptr; MachineInstr *Inst2 = nullptr; LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI1->dump(); @@ -185,7 +255,8 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, unsigned Def2OP = DefMI2->getOpcode(); MachineInstrBuilder MIB; - // Case 1: Both reaching defs of MI are qf to sf/hf conversions + + // Check if the both the reaching defs of MI are qf to sf/hf conversions if ((Def1OP == Hexagon::V6_vconv_sf_qf32 && Def2OP == Hexagon::V6_vconv_sf_qf32) || (Def1OP == Hexagon::V6_vconv_hf_qf16 && @@ -226,7 +297,7 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); return true; - // Case 2: Left operand is conversion to sf/hf + // Check if left operand's reaching def is a conversion to sf/hf } else if (((Def1OP == Hexagon::V6_vconv_sf_qf32 && Def2OP != Hexagon::V6_vconv_sf_qf32) || (Def1OP == Hexagon::V6_vconv_hf_qf16 && @@ -250,7 +321,7 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); return true; - // Case 2: Left operand is conversion to sf/hf + // Check if right operand's reaching def is a conversion to 
sf/hf } else if (((Def1OP != Hexagon::V6_vconv_sf_qf32 && Def2OP == Hexagon::V6_vconv_sf_qf32) || (Def1OP != Hexagon::V6_vconv_hf_qf16 && @@ -258,13 +329,6 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, !DefMI1->isPHI() && (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf)) { // The second operand of original instruction is converted. - // In "mix" instructions, "qf" operand is always the first operand. - - // Caveat: vsub is not commutative w.r.t operands. - if (InstTy == Hexagon::V6_vsub_qf16_mix || - InstTy == Hexagon::V6_vsub_qf32_mix) - return false; - if (Inst2 && MRI->getRegClass(Inst2->getOperand(0).getReg()) == &Hexagon::HvxWRRegClass) return false; @@ -275,10 +339,26 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, Op1F = getKillRegState(Src2.isKill()); Src2.setIsKill(false); Op0F = getKillRegState(Src1.isKill()); - MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) - .addReg(Src2.getReg(), Op1F, - Src2.getSubReg()) // Notice the operands are flipped. - .addReg(Src1.getReg(), Op0F, Src1.getSubReg()); + if (InstTy == Hexagon::V6_vsub_qf16_mix || + InstTy == Hexagon::V6_vsub_qf32_mix) { + if (!HST->useHVXV81Ops()) + // vsub_(hf|sf)_mix insts are only avlbl on hvx81+ + return false; + // vsub is not commutative w.r.t. operands -> treat it as a special case + // to choose the correct mix instruction. + if (Def2OP == Hexagon::V6_vconv_sf_qf32) + InstTy = Hexagon::V6_vsub_sf_mix; + else if (Def2OP == Hexagon::V6_vconv_hf_qf16) + InstTy = Hexagon::V6_vsub_hf_mix; + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(Src1.getReg(), Op0F, Src1.getSubReg()) + .addReg(Src2.getReg(), Op1F, Src2.getSubReg()); + } else { + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(Src2.getReg(), Op1F, + Src2.getSubReg()) // Notice the operands are flipped. + .addReg(Src1.getReg(), Op0F, Src1.getSubReg()); + } LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); return true; } @@ -309,15 +389,18 @@ bool HexagonQFPOptimizer::runOnMachineFunction(MachineFunction &MF) { while (MII != MBBI->instr_end()) { MachineInstr *MI = &*MII; ++MII; // As MI might be removed. 
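// The pre-increment above is the usual erase-safe iteration pattern:
//   for (auto It = MBB.instr_begin(); It != MBB.instr_end();) {
//     MachineInstr &MI = *It++;   // advance first...
//     if (rewriteSucceeded(MI))   // (predicate illustrative)
//       MI.eraseFromParent();     // ...so erasing MI leaves It valid
//   }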
- - if (QFPInstMap.count(MI->getOpcode()) && - MI->getOpcode() != Hexagon::V6_vconv_sf_qf32 && - MI->getOpcode() != Hexagon::V6_vconv_hf_qf16) { - LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump()); - if (optimizeQfp(MI, MBB)) { - MI->eraseFromParent(); - LLVM_DEBUG(dbgs() << "\t....Removing...."); - Changed = true; + if (QFPInstMap.count(MI->getOpcode())) { + auto OpC = MI->getOpcode(); + if (DisableQFOptForMul && HII->isQFPMul(MI)) + continue; + if (OpC != Hexagon::V6_vconv_sf_qf32 && + OpC != Hexagon::V6_vconv_hf_qf16) { + LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump()); + if (optimizeQfp(MI, MBB)) { + MI->eraseFromParent(); + LLVM_DEBUG(dbgs() << "\t....Removing...."); + Changed = true; + } } } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 2f1a7ad..a3deb36 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -305,7 +305,8 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, uint64_t StartingOffset = 0) { SmallVector<EVT, 16> TempVTs; SmallVector<uint64_t, 16> TempOffsets; - ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); + ComputeValueVTs(TLI, DL, Ty, TempVTs, /*MemVTs=*/nullptr, &TempOffsets, + StartingOffset); for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) { MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT); diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 780e124..122738c 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2750,6 +2750,10 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { if (isSpecialLLVMGlobalArrayToSkip(GV) || isSpecialLLVMGlobalArrayForStaticInit(GV)) return; + // Ignore non-emitted data. + if (GV->getSection() == "llvm.metadata") + return; + // If the Global Variable has the toc-data attribute, it needs to be emitted // when we emit the .toc section. if (GV->hasAttribute("toc-data")) { diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index b37b740..f881c4c 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -789,6 +789,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, // Unroll the probe loop depending on the number of iterations. 
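// Worked example for the CFA fix below (numbers assumed): with
// RealStackSize = 5120, a probed Offset = 4096, and ProbeSize = 1024,
// CFAAdjust = 5120 - 4096 = 1024 covers the stack already allocated
// before the probe loop. After probe k the CFA sits k * 1024 + 1024
// bytes above SP, so the directive must record
// CurrentOffset + CFAAdjust (2048 after the first probe) rather than
// the bare CurrentOffset, and the residual step lands on RealStackSize.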
if (Offset < ProbeSize * 5) { + uint64_t CFAAdjust = RealStackSize - Offset; + uint64_t CurrentOffset = 0; while (CurrentOffset + ProbeSize <= Offset) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, @@ -802,7 +804,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, CurrentOffset += ProbeSize; if (EmitCFI) - CFIBuilder.buildDefCFAOffset(CurrentOffset); + CFIBuilder.buildDefCFAOffset(CurrentOffset + CFAAdjust); } uint64_t Residual = Offset - CurrentOffset; @@ -810,7 +812,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual), Flag, getStackAlign()); if (EmitCFI) - CFIBuilder.buildDefCFAOffset(Offset); + CFIBuilder.buildDefCFAOffset(RealStackSize); if (DynAllocation) { // s[d|w] zero, 0(sp) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 995ae75..3b69eda 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -17867,6 +17867,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, SmallVector<SDNode *> Worklist; SmallPtrSet<SDNode *, 8> Inserted; + SmallPtrSet<SDNode *, 8> ExtensionsToRemove; Worklist.push_back(N); Inserted.insert(N); SmallVector<CombineResult> CombinesToApply; @@ -17876,22 +17877,25 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, NodeExtensionHelper LHS(Root, 0, DAG, Subtarget); NodeExtensionHelper RHS(Root, 1, DAG, Subtarget); - auto AppendUsersIfNeeded = [&Worklist, &Subtarget, - &Inserted](const NodeExtensionHelper &Op) { - if (Op.needToPromoteOtherUsers()) { - for (SDUse &Use : Op.OrigOperand->uses()) { - SDNode *TheUser = Use.getUser(); - if (!NodeExtensionHelper::isSupportedRoot(TheUser, Subtarget)) - return false; - // We only support the first 2 operands of FMA. - if (Use.getOperandNo() >= 2) - return false; - if (Inserted.insert(TheUser).second) - Worklist.push_back(TheUser); - } - } - return true; - }; + auto AppendUsersIfNeeded = + [&Worklist, &Subtarget, &Inserted, + &ExtensionsToRemove](const NodeExtensionHelper &Op) { + if (Op.needToPromoteOtherUsers()) { + // Remember that we're supposed to remove this extension. + ExtensionsToRemove.insert(Op.OrigOperand.getNode()); + for (SDUse &Use : Op.OrigOperand->uses()) { + SDNode *TheUser = Use.getUser(); + if (!NodeExtensionHelper::isSupportedRoot(TheUser, Subtarget)) + return false; + // We only support the first 2 operands of FMA. + if (Use.getOperandNo() >= 2) + return false; + if (Inserted.insert(TheUser).second) + Worklist.push_back(TheUser); + } + } + return true; + }; // Control the compile time by limiting the number of node we look at in // total. @@ -17912,6 +17916,15 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, std::optional<CombineResult> Res = FoldingStrategy(Root, LHS, RHS, DAG, Subtarget); if (Res) { + // If this strategy wouldn't remove an extension we're supposed to + // remove, reject it. 
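// Concrete shape of the rejected case (node names illustrative):
//   ext = vsext_vl x, ...    ; needToPromoteOtherUsers() held, so ext
//                            ; was promised to disappear
//   u1  = vadd_vl ext, y     ; folds to vwadd_vl x, y  (ext consumed)
//   u2  = vmul_vl ext, z     ; a CombineResult here that leaves ext
//                            ; in place is now rejected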
+ if (!Res->LHSExt.has_value() && + ExtensionsToRemove.contains(LHS.OrigOperand.getNode())) + continue; + if (!Res->RHSExt.has_value() && + ExtensionsToRemove.contains(RHS.OrigOperand.getNode())) + continue; + Matched = true; CombinesToApply.push_back(*Res); // All the inputs that are extended need to be folded, otherwise diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 24ebbc3..41071b2 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -654,8 +654,17 @@ foreach mx = SchedMxList in { foreach sew = SchedSEWSet<mx>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + defvar VIRedLat = GetLMULValue<[5, 5, 5, 7, 11, 19, 35], mx>.c; + defvar VIRedOcc = GetLMULValue<[1, 1, 2, 2, 4, 10, 35], mx>.c; + let Latency = VIRedLat, ReleaseAtCycles = [VIRedOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + + // Pattern for vredsum: 5/5/5/7/11/19/35 + // Pattern for vredand, vredor, vredxor: 4/4/4/6/10/18/34 + // They are grouped together, so we use the worst-case vredsum latency. + // TODO: split vredand, vredor, vredxor into separate scheduling classe. + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } } } @@ -663,7 +672,27 @@ foreach mx = SchedMxListWRed in { foreach sew = SchedSEWSet<mx, 0, 1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + defvar VIRedLat = GetLMULValue<[5, 5, 5, 7, 11, 19, 35], mx>.c; + defvar VIRedOcc = GetLMULValue<[1, 1, 2, 2, 4, 10, 35], mx>.c; + let Latency = VIRedLat, ReleaseAtCycles = [VIRedOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; + + // Latency for vfredmax.vs, vfredmin.vs: 12/12/15/21/33/57 + // Latency for vfredusum.vs is slightly lower for e16/e32 + // We use the worst-case + defvar VFRedLat = GetLMULValue<[12, 12, 12, 15, 21, 33, 57], mx>.c; + defvar VFRedOcc = GetLMULValue<[8, 8, 8, 8, 14, 20, 57], mx>.c; + let Latency = VFRedLat, ReleaseAtCycles = [VFRedOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } } } @@ -671,9 +700,20 @@ foreach mx = SchedMxListF in { foreach sew = SchedSEWSet<mx, 1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + // Compute latency based on SEW + defvar VFRedOV_FromLat = !cond( + !eq(sew, 16) : ConstValueUntilLMULThenDouble<"MF4", 12, mx>.c, + !eq(sew, 32) : ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c, + !eq(sew, 64) : ConstValueUntilLMULThenDouble<"M1", 12, mx>.c + ); + defvar VFRedOV_FromOcc = !cond( + !eq(sew, 16) : 
GetLMULValue<[8, 8, 20, 24, 48, 96, 384], mx>.c, + !eq(sew, 32) : GetLMULValue<[8, 8, 8, 12, 24, 48, 192], mx>.c, + !eq(sew, 64) : GetLMULValue<[6, 6, 6, 6, 12, 24, 96], mx>.c + ); + let Latency = VFRedOV_FromLat, ReleaseAtCycles = [VFRedOV_FromOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } } } @@ -681,8 +721,18 @@ foreach mx = SchedMxListFWRed in { foreach sew = SchedSEWSet<mx, 1, 1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defvar VFRedOVLat = !cond( + !eq(sew, 16) : ConstValueUntilLMULThenDouble<"MF4", 16, mx>.c, + !eq(sew, 32) : ConstValueUntilLMULThenDouble<"MF2", 16, mx>.c, + ); + defvar VFRedOVOcc = !cond( + !eq(sew, 16) : GetLMULValue<[11, 11, 27, 32, 64, 128, 512], mx>.c, + !eq(sew, 32) : GetLMULValue<[11, 11, 11, 16, 32, 64, 256], mx>.c, + ); + let Latency = VFRedOVLat, ReleaseAtCycles = [VFRedOVOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 168e041..d103953 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53354,6 +53354,7 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, // i32 sub value. static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { using namespace SDPatternMatch; SDValue StoredVal = St->getValue(); @@ -53451,6 +53452,8 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, if (!StoredVal.hasOneUse()) { SDValue NewLoad = DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand()); + for (SDNode *User : StoredVal->users()) + DCI.AddToWorklist(User); DAG.ReplaceAllUsesWith(StoredVal, NewLoad); } return NewStore; @@ -53682,7 +53685,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } - if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) + if (SDValue R = narrowBitOpRMW(St, dl, DAG, DCI, Subtarget)) return R; // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 975a271..96bef0e 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -174,8 +174,8 @@ constexpr GPUInfo AMDGCNGPUs[] = { {{"gfx1153"}, {"gfx1153"}, GK_GFX1153, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, {{"gfx1200"}, {"gfx1200"}, GK_GFX1200, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, {{"gfx1201"}, {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, - {{"gfx1250"}, {"gfx1250"}, GK_GFX1250, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, - {{"gfx1251"}, {"gfx1251"}, GK_GFX1251, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, + {{"gfx1250"}, {"gfx1250"}, GK_GFX1250, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS}, + {{"gfx1251"}, {"gfx1251"}, GK_GFX1251, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS}, 
{{"gfx9-generic"}, {"gfx9-generic"}, GK_GFX9_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK}, {{"gfx10-1-generic"}, {"gfx10-1-generic"}, GK_GFX10_1_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP}, diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index af53fa0..02f06be 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -734,7 +734,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC(); // Reserve bit 60-63 for other information purpose. - FunctionHash &= 0x0FFFFFFFFFFFFFFF; + FunctionHash &= NamedInstrProfRecord::FUNC_HASH_MASK; if (IsCS) NamedInstrProfRecord::setCSFlagInHash(FunctionHash); LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n" diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 42b1fdf..8aa8aa2 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -39,36 +39,36 @@ using namespace llvm; STATISTIC(NumBroken, "Number of blocks inserted"); namespace { - struct BreakCriticalEdges : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - BreakCriticalEdges() : FunctionPass(ID) { - initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry()); - } +struct BreakCriticalEdges : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + BreakCriticalEdges() : FunctionPass(ID) { + initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override { - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + bool runOnFunction(Function &F) override { + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>(); - auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr; + auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>(); + auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr; - auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); - auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - unsigned N = - SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT)); - NumBroken += N; - return N > 0; - } + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + unsigned N = SplitAllCriticalEdges( + F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT)); + NumBroken += N; + return N > 0; + } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); - // No loop canonicalization guarantees are broken by this pass. - AU.addPreservedID(LoopSimplifyID); - } - }; -} + // No loop canonicalization guarantees are broken by this pass. 
+ AU.addPreservedID(LoopSimplifyID); + } +}; +} // namespace char BreakCriticalEdges::ID = 0; INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges", @@ -76,6 +76,7 @@ INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges", // Publicly exposed interface to pass... char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID; + FunctionPass *llvm::createBreakCriticalEdgesPass() { return new BreakCriticalEdges(); } diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index 7343c79..9f6d89e 100644 --- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -40,22 +40,22 @@ using namespace llvm; namespace { - struct QuotRemPair { - Value *Quotient; - Value *Remainder; - - QuotRemPair(Value *InQuotient, Value *InRemainder) - : Quotient(InQuotient), Remainder(InRemainder) {} - }; - - /// A quotient and remainder, plus a BB from which they logically "originate". - /// If you use Quotient or Remainder in a Phi node, you should use BB as its - /// corresponding predecessor. - struct QuotRemWithBB { - BasicBlock *BB = nullptr; - Value *Quotient = nullptr; - Value *Remainder = nullptr; - }; +struct QuotRemPair { + Value *Quotient; + Value *Remainder; + + QuotRemPair(Value *InQuotient, Value *InRemainder) + : Quotient(InQuotient), Remainder(InRemainder) {} +}; + +/// A quotient and remainder, plus a BB from which they logically "originate". +/// If you use Quotient or Remainder in a Phi node, you should use BB as its +/// corresponding predecessor. +struct QuotRemWithBB { + BasicBlock *BB = nullptr; + Value *Quotient = nullptr; + Value *Remainder = nullptr; +}; using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>; using BypassWidthsTy = DenseMap<unsigned, unsigned>; diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 61ffb49..8da6a980 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -378,7 +378,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, if (P != Preheader) BackedgeBlocks.push_back(P); } - // Create and insert the new backedge block... + // Create and insert the new backedge block. BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(), Header->getName() + ".backedge", F); BranchInst *BETerminator = BranchInst::Create(Header, BEBlock); @@ -737,39 +737,39 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, } namespace { - struct LoopSimplify : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - LoopSimplify() : FunctionPass(ID) { - initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); - } +struct LoopSimplify : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + LoopSimplify() : FunctionPass(ID) { + initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override; + bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); - // We need loop information to identify the loops... - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); + // We need loop information to identify the loops. 
+ AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<SCEVAAWrapperPass>(); - AU.addPreservedID(LCSSAID); - AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. - AU.addPreserved<BranchProbabilityInfoWrapperPass>(); - AU.addPreserved<MemorySSAWrapperPass>(); - } + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addPreservedID(LCSSAID); + AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. + AU.addPreserved<BranchProbabilityInfoWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); + } - /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. - void verifyAnalysis() const override; - }; -} + /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. + void verifyAnalysis() const override; +}; +} // namespace char LoopSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", @@ -780,12 +780,12 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", false, false) -// Publicly exposed interface to pass... +// Publicly exposed interface to pass. char &llvm::LoopSimplifyID = LoopSimplify::ID; Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } /// runOnFunction - Run down all loops in the CFG (recursively, but we could do -/// it in any convenient order) inserting preheaders... +/// it in any convenient order) inserting preheaders. /// bool LoopSimplify::runOnFunction(Function &F) { bool Changed = false; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 906fa2f..b7224a3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7933,6 +7933,26 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))) ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second); } + + // Check that all partial reductions in a chain are only used by other + // partial reductions with the same scale factor. Otherwise we end up creating + // users of scaled reductions where the types of the other operands don't + // match. 
+ for (const auto &[Chain, Scale] : PartialReductionChains) {
+ auto AllUsersPartialRdx = [ScaleVal = Scale, this](const User *U) {
+ auto *UI = cast<Instruction>(U);
+ if (isa<PHINode>(UI) && UI->getParent() == OrigLoop->getHeader()) {
+ return all_of(UI->users(), [ScaleVal, this](const User *U) {
+ auto *UI = cast<Instruction>(U);
+ return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal;
+ });
+ }
+ return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal ||
+ !OrigLoop->contains(UI->getParent());
+ };
+ if (!all_of(Chain.Reduction->users(), AllUsersPartialRdx))
+ ScaledReductionMap.erase(Chain.Reduction);
+ }
 }
 bool VPRecipeBuilder::getScaledReductions(
@@ -8116,11 +8136,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
 return tryToWidenMemory(Instr, Operands, Range);
- if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
- if (auto PartialRed =
- tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
- return PartialRed;
- }
+ if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
+ return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
 if (!shouldWiden(Instr, Range))
 return nullptr;
@@ -8154,9 +8171,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
 isa<VPPartialReductionRecipe>(BinOpRecipe))
 std::swap(BinOp, Accumulator);
- if (ScaleFactor !=
- vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
- return nullptr;
+ assert(ScaleFactor ==
+ vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) &&
+ "all accumulators in chain must have same scale factor");
 unsigned ReductionOpcode = Reduction->getOpcode();
 if (ReductionOpcode == Instruction::Sub) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bf3f52c..df835a0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -20996,6 +20996,15 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
 return false;
 }))
 return std::nullopt;
+ if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
+ EI.UserTE->hasCopyableElements() &&
+ EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
+ all_of(VL, [&](Value *V) {
+ if (S.isCopyableElement(V))
+ return true;
+ return isUsedOutsideBlock(V);
+ }))
+ return std::nullopt;
 bool HasCopyables = S.areInstructionsWithCopyableElements();
 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
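
The LoopVectorize hunks above encode an invariant rather than a late bail-out: after collectScaledReductions() has run, every in-loop user of a partial reduction in a chain carries the same VF scale factor, which is what lets tryToCreatePartialReduction() replace its early return with an assert. Below is a rough standalone sketch of that filtering step, assuming plain STL containers in place of the VPlan machinery; Node, Users, and pruneInconsistentChains are invented names, and the header-PHI and out-of-loop special cases of the real code are dropped.

#include <algorithm>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for an instruction in a partial-reduction chain;
// Users plays the role of Instruction::users().
struct Node {
  std::vector<Node *> Users;
};

// Drop every chain head whose users do not all carry the head's scale
// factor, mirroring the filtering loop added above: keeping such a chain
// would later build a partial reduction whose other operands have
// mismatched types.
void pruneInconsistentChains(std::unordered_map<Node *, unsigned> &ScaleMap) {
  std::vector<Node *> ToErase;
  for (const auto &Entry : ScaleMap) {
    Node *Head = Entry.first;
    unsigned Scale = Entry.second;
    bool AllMatch = std::all_of(
        Head->Users.begin(), Head->Users.end(), [&ScaleMap, Scale](Node *U) {
          auto It = ScaleMap.find(U);
          // Users absent from the map count as scale 0 upstream, so they
          // fail the check here as well.
          return It != ScaleMap.end() && It->second == Scale;
        });
    if (!AllMatch)
      ToErase.push_back(Head);
  }
  // Erase only after the scan so the map is not mutated while iterated.
  for (Node *N : ToErase)
    ScaleMap.erase(N);
}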
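
In the same spirit, the PGOInstrumentation hunk earlier in this diff swaps the literal 0x0FFFFFFFFFFFFFFF for the named NamedInstrProfRecord::FUNC_HASH_MASK. The idiom being named is reserving the top four bits of the 64-bit CFG hash for flags such as the context-sensitivity bit that setCSFlagInHash() sets. A minimal sketch of that scheme follows, with CSBit placed at bit 60 purely for illustration; the in-tree flag layout may differ.

#include <cstdint>

// Illustrative constants for the masking idiom in computeCFGHash(): the low
// 60 bits hold the real CFG hash and bits 60-63 are reserved for metadata
// flags. FuncHashMask matches the literal the patch replaces; CSBit is a
// hypothetical stand-in for the context-sensitive (CS) flag.
constexpr uint64_t FuncHashMask = 0x0FFFFFFFFFFFFFFFULL;
constexpr uint64_t CSBit = 1ULL << 60;

// Clear the reserved nibble, then tag context-sensitive profiles in-band.
uint64_t finalizeFunctionHash(uint64_t RawHash, bool IsCS) {
  uint64_t Hash = RawHash & FuncHashMask;
  if (IsCS)
    Hash |= CSBit;
  return Hash;
}

// The flag and the hash proper can be recovered independently later on.
bool isCSHash(uint64_t Hash) { return (Hash & CSBit) != 0; }
uint64_t hashWithoutFlags(uint64_t Hash) { return Hash & FuncHashMask; }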
