-rw-r--r--  clang/docs/ReleaseNotes.rst | 3
-rw-r--r--  clang/lib/AST/DeclPrinter.cpp | 4
-rw-r--r--  clang/lib/Sema/SemaTemplate.cpp | 13
-rw-r--r--  clang/lib/Sema/SemaTemplateInstantiate.cpp | 6
-rw-r--r--  clang/lib/Sema/SemaTemplateInstantiateDecl.cpp | 18
-rw-r--r--  clang/test/AST/ast-print-record-decl.c | 13
-rw-r--r--  clang/test/SemaTemplate/alias-template-deprecated.cpp | 17
-rw-r--r--  clang/test/SemaTemplate/alias-templates.cpp | 8
-rw-r--r--  compiler-rt/test/asan/TestCases/wcscat.cpp | 10
-rw-r--r--  compiler-rt/test/asan/TestCases/wcscpy.cpp | 10
-rw-r--r--  compiler-rt/test/asan/TestCases/wcsncat.cpp | 10
-rw-r--r--  compiler-rt/test/asan/TestCases/wcsncpy.cpp | 11
-rw-r--r--  flang/include/flang/Optimizer/Builder/FIRBuilder.h | 2
-rw-r--r--  flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp | 6
-rw-r--r--  flang/test/Fir/CUDA/cuda-shared-offset.mlir | 4
-rw-r--r--  libclc/CMakeLists.txt | 2
-rw-r--r--  llvm/include/llvm/Analysis/MemoryProfileInfo.h | 8
-rw-r--r--  llvm/lib/Analysis/MemoryProfileInfo.cpp | 28
-rw-r--r--  llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 9
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 16
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp | 9
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 16
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 122
-rw-r--r--  llvm/test/ThinLTO/X86/memprof-basic.ll | 5
-rw-r--r--  llvm/test/Transforms/PGOProfile/memprof.ll | 19
-rw-r--r--  llvm/tools/llvm-c-test/debuginfo.c | 2
-rw-r--r--  llvm/unittests/Analysis/MemoryProfileInfoTest.cpp | 21
-rw-r--r--  mlir/test/Dialect/Vector/canonicalize.mlir | 9
28 files changed, 200 insertions(+), 201 deletions(-)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ab536ad..05379f4 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -123,6 +123,9 @@ AST Dumping Potentially Breaking Changes
``__atomic_test_and_set(p, 0)``
+- Pretty-printing of templates with inherited (i.e. specified in a previous
+ redeclaration) default arguments has been fixed.
+
Clang Frontend Potentially Breaking Changes
-------------------------------------------
- Members of anonymous unions/structs are now injected as ``IndirectFieldDecl``
diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp
index 196057f..7001ade 100644
--- a/clang/lib/AST/DeclPrinter.cpp
+++ b/clang/lib/AST/DeclPrinter.cpp
@@ -1894,7 +1894,7 @@ void DeclPrinter::VisitTemplateTypeParmDecl(const TemplateTypeParmDecl *TTP) {
Out << TTP->getDeclName();
}
- if (TTP->hasDefaultArgument()) {
+ if (TTP->hasDefaultArgument() && !TTP->defaultArgumentWasInherited()) {
Out << " = ";
TTP->getDefaultArgument().getArgument().print(Policy, Out,
/*IncludeType=*/false);
@@ -1909,7 +1909,7 @@ void DeclPrinter::VisitNonTypeTemplateParmDecl(
Policy.CleanUglifiedParameters ? II->deuglifiedName() : II->getName();
printDeclType(NTTP->getType(), Name, NTTP->isParameterPack());
- if (NTTP->hasDefaultArgument()) {
+ if (NTTP->hasDefaultArgument() && !NTTP->defaultArgumentWasInherited()) {
Out << " = ";
NTTP->getDefaultArgument().getArgument().print(Policy, Out,
/*IncludeType=*/false);
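For context on the DeclPrinter change above: a default template argument spelled on one redeclaration is inherited by later redeclarations of the same template, but it should only be printed where it was actually written. A minimal sketch (illustrative only; it mirrors the SmearedTypeDefArgs cases added to ast-print-record-decl.c further down):

  // Each redeclaration supplies a default for a different parameter; the
  // second one inherits "= int" for its last parameter from the first.
  template <typename, typename = int> struct S;
  template <typename = int, typename> struct S;
  // With the fix, Clang's AST printer (-ast-print) reproduces each line as
  // written; previously the inherited default was printed again, yielding
  // "template <typename = int, typename = int> struct S;" for the second line.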
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index dcf2876..419f3e1 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -3822,14 +3822,19 @@ QualType Sema::CheckTemplateIdType(ElaboratedTypeKeyword Keyword,
AliasTemplate->getTemplateParameters()->getDepth());
LocalInstantiationScope Scope(*this);
- InstantiatingTemplate Inst(
- *this, /*PointOfInstantiation=*/TemplateLoc,
- /*Entity=*/AliasTemplate,
- /*TemplateArgs=*/TemplateArgLists.getInnermost());
// Diagnose uses of this alias.
(void)DiagnoseUseOfDecl(AliasTemplate, TemplateLoc);
+ // FIXME: The TemplateArgs passed here are not used for the context note,
+ // nor should they be, because this note will be pointing to the
+ // specialization anyway. These arguments are needed for a hack for
+ // instantiating lambdas in the pattern of the alias. In
+ // getTemplateInstantiationArgs, these arguments will be used for collating
+ // the template arguments needed to instantiate the lambda.
+ InstantiatingTemplate Inst(*this, /*PointOfInstantiation=*/TemplateLoc,
+ /*Entity=*/AliasTemplate,
+ /*TemplateArgs=*/CTAI.SugaredConverted);
if (Inst.isInvalid())
return QualType();
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 1f762ca..7b05e4c 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -1271,6 +1271,12 @@ void Sema::PrintInstantiationStack(InstantiationContextDiagFuncRef DiagFunc) {
PDiag(diag::note_building_deduction_guide_here));
break;
case CodeSynthesisContext::TypeAliasTemplateInstantiation:
+ // Workaround for a workaround: don't produce a note if we are merely
+ // instantiating some other template which contains this alias template.
+ // This would be redundant either with the error itself, or some other
+ // context note attached to it.
+ if (Active->NumTemplateArgs == 0)
+ break;
DiagFunc(Active->PointOfInstantiation,
PDiag(diag::note_template_type_alias_instantiation_here)
<< cast<TypeAliasTemplateDecl>(Active->Entity)
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index e2dc703..3819f77 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -1580,17 +1580,19 @@ Decl *TemplateDeclInstantiator::InstantiateTypeAliasTemplateDecl(
if (!InstParams)
return nullptr;
- TypeAliasDecl *Pattern = D->getTemplatedDecl();
- Sema::InstantiatingTemplate InstTemplate(
- SemaRef, D->getBeginLoc(), D,
- D->getTemplateDepth() >= TemplateArgs.getNumLevels()
- ? ArrayRef<TemplateArgument>()
- : (TemplateArgs.begin() + TemplateArgs.getNumLevels() - 1 -
- D->getTemplateDepth())
- ->Args);
+ // FIXME: This is a hack for instantiating lambdas in the pattern of the
+ // alias. We are not really instantiating the alias at its own template
+ // level; that only happens in CheckTemplateId. This path is only for outer
+ // templates which contain it. In getTemplateInstantiationArgs, the template
+ // arguments used here would be used for collating the template arguments
+ // needed to instantiate the lambda. Pass an empty argument list, so this
+ // workaround doesn't get confused if there is an outer alias being instantiated.
+ Sema::InstantiatingTemplate InstTemplate(SemaRef, D->getBeginLoc(), D,
+ ArrayRef<TemplateArgument>());
if (InstTemplate.isInvalid())
return nullptr;
+ TypeAliasDecl *Pattern = D->getTemplatedDecl();
TypeAliasTemplateDecl *PrevAliasTemplate = nullptr;
if (getPreviousDeclForInstantiation<TypedefNameDecl>(Pattern)) {
DeclContext::lookup_result Found = Owner->lookup(Pattern->getDeclName());
diff --git a/clang/test/AST/ast-print-record-decl.c b/clang/test/AST/ast-print-record-decl.c
index d3717a4..fd81588 100644
--- a/clang/test/AST/ast-print-record-decl.c
+++ b/clang/test/AST/ast-print-record-decl.c
@@ -290,9 +290,9 @@ KW DeclGroupInMemberList {
// A tag decl group in the tag decl's own member list is exercised in
// defSelfRef above.
+#ifdef __cplusplus
// Check out-of-line record definition
-#ifdef __cplusplus
// PRINT-CXX-NEXT: [[KW]] OutOfLineRecord {
KW OutOfLineRecord {
// PRINT-CXX-NEXT: [[KW]] Inner
@@ -304,4 +304,15 @@ KW OutOfLineRecord {
KW OutOfLineRecord::Inner {
// PRINT-CXX-NEXT: };
};
+
+// PRINT-CXX-NEXT: template <typename, typename = int> [[KW]] SmearedTypeDefArgs;
+template <typename, typename = int> KW SmearedTypeDefArgs;
+// PRINT-CXX-NEXT: template <typename = int, typename> [[KW]] SmearedTypeDefArgs;
+template <typename = int, typename> KW SmearedTypeDefArgs;
+
+// PRINT-CXX-NEXT: template <int, int = 0> [[KW]] SmearedNTTPDefArgs;
+template <int, int = 0> KW SmearedNTTPDefArgs;
+// PRINT-CXX-NEXT: template <int = 0, int> [[KW]] SmearedNTTPDefArgs;
+template <int = 0, int> KW SmearedNTTPDefArgs;
+
#endif
diff --git a/clang/test/SemaTemplate/alias-template-deprecated.cpp b/clang/test/SemaTemplate/alias-template-deprecated.cpp
index 7418e222..6dffd37 100644
--- a/clang/test/SemaTemplate/alias-template-deprecated.cpp
+++ b/clang/test/SemaTemplate/alias-template-deprecated.cpp
@@ -46,23 +46,19 @@ using UsingInstWithCPPAttr [[deprecated("Do not use this")]] = NoAttr<int>;
void bar() {
NoAttr<int> obj; // Okay
- // expected-warning@+2 {{'UsingWithAttr' is deprecated}}
- // expected-note@+1 {{in instantiation of template type alias 'UsingWithAttr' requested here}}
+ // expected-warning@+1 {{'UsingWithAttr' is deprecated}}
UsingWithAttr<int> objUsingWA;
- // expected-warning@+2 {{'UsingWithAttr' is deprecated}}
- // expected-note@+1 {{in instantiation of template type alias 'UsingWithAttr' requested here}}
+ // expected-warning@+1 {{'UsingWithAttr' is deprecated}}
NoAttr<UsingWithAttr<int>> s;
// expected-note@+1 {{'DepInt' has been explicitly marked deprecated here}}
using DepInt [[deprecated]] = int;
- // expected-warning@+3 {{'UsingWithAttr' is deprecated}}
- // expected-warning@+2 {{'DepInt' is deprecated}}
- // expected-note@+1 {{in instantiation of template type alias 'UsingWithAttr' requested here}}
+ // expected-warning@+2 {{'UsingWithAttr' is deprecated}}
+ // expected-warning@+1 {{'DepInt' is deprecated}}
using X = UsingWithAttr<DepInt>;
- // expected-warning@+2 {{'UsingWithAttr' is deprecated}}
- // expected-note@+1 {{in instantiation of template type alias 'UsingWithAttr' requested here}}
+ // expected-warning@+1 {{'UsingWithAttr' is deprecated}}
UsingWithAttr<int>().foo();
// expected-warning@+1 {{'UsingInstWithAttr' is deprecated}}
@@ -74,8 +70,7 @@ void bar() {
// expected-warning@+1 {{'UsingTDWithAttr' is deprecated}}
UsingTDWithAttr objUTDWA;
- // expected-warning@+2 {{'UsingWithCPPAttr' is deprecated}}
- // expected-note@+1 {{in instantiation of template type alias 'UsingWithCPPAttr' requested here}}
+ // expected-warning@+1 {{'UsingWithCPPAttr' is deprecated}}
UsingWithCPPAttr<int> objUsingWCPPA;
// expected-warning@+1 {{'UsingInstWithCPPAttr' is deprecated: Do not use this}}
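To make the diagnostic change concrete, here is a small sketch based on the declarations in alias-template-deprecated.cpp above (NoAttr and UsingWithAttr are the test's own names): because the deprecated-use diagnostic is now emitted before the alias-instantiation context is pushed, the warning still appears at the point of use, but the redundant instantiation note is gone.

  template <typename T> struct NoAttr {};
  template <typename T>
  using UsingWithAttr [[deprecated]] = NoAttr<T>;

  void use() {
    UsingWithAttr<int> obj; // warning: 'UsingWithAttr' is deprecated
                            // (no longer followed by "in instantiation of
                            //  template type alias ... requested here")
  }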
diff --git a/clang/test/SemaTemplate/alias-templates.cpp b/clang/test/SemaTemplate/alias-templates.cpp
index ab5cad7..09fe72e 100644
--- a/clang/test/SemaTemplate/alias-templates.cpp
+++ b/clang/test/SemaTemplate/alias-templates.cpp
@@ -312,3 +312,11 @@ namespace resolved_nttp {
using TC2 = decltype(C<bool, 2, 3>::p); // expected-note {{instantiation of}}
}
+
+namespace OuterSubstFailure {
+ template <class T> struct A {
+ template <class> using B = T&;
+ // expected-error@-1 {{cannot form a reference to 'void'}}
+ };
+ template struct A<void>; // expected-note {{requested here}}
+} // namespace OuterSubstFailure
diff --git a/compiler-rt/test/asan/TestCases/wcscat.cpp b/compiler-rt/test/asan/TestCases/wcscat.cpp
index f0a8ec1..fd0b5a4 100644
--- a/compiler-rt/test/asan/TestCases/wcscat.cpp
+++ b/compiler-rt/test/asan/TestCases/wcscat.cpp
@@ -16,11 +16,11 @@ int main() {
wchar_t badDst[9];
wcscpy(badDst, start);
fprintf(stderr, "Good so far.\n");
- // CHECK: Good so far.
+ // CHECK-DAG: Good so far.
fflush(stderr);
wcscat(badDst, append); // Boom!
- // CHECK: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}}
- // CHECK: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0
- // CHECK: #0 {{0x[0-9a-f]+}} in wcscat
+ // CHECK-DAG: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}}
+ // CHECK-DAG: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0
+ // CHECK-DAG: #0 {{0x[0-9a-f]+}} in wcscat
printf("Should have failed with ASAN error.\n");
-}
\ No newline at end of file
+}
diff --git a/compiler-rt/test/asan/TestCases/wcscpy.cpp b/compiler-rt/test/asan/TestCases/wcscpy.cpp
index a280d29..8133a58 100644
--- a/compiler-rt/test/asan/TestCases/wcscpy.cpp
+++ b/compiler-rt/test/asan/TestCases/wcscpy.cpp
@@ -13,11 +13,11 @@ int main() {
wchar_t badDst[7];
fprintf(stderr, "Good so far.\n");
- // CHECK: Good so far.
+ // CHECK-DAG: Good so far.
fflush(stderr);
wcscpy(badDst, src); // Boom!
- // CHECK: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}}
- // CHECK: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0
- // CHECK: #0 {{0x[0-9a-f]+}} in wcscpy
+ // CHECK-DAG: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}}
+ // CHECK-DAG: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0
+ // CHECK-DAG: #0 {{0x[0-9a-f]+}} in wcscpy
printf("Should have failed with ASAN error.\n");
-}
\ No newline at end of file
+}
diff --git a/compiler-rt/test/asan/TestCases/wcsncat.cpp b/compiler-rt/test/asan/TestCases/wcsncat.cpp
index eb7d095..365e732 100644
--- a/compiler-rt/test/asan/TestCases/wcsncat.cpp
+++ b/compiler-rt/test/asan/TestCases/wcsncat.cpp
@@ -17,11 +17,11 @@ int main() {
wcscpy(badDst, start);
wcsncat(badDst, append, 1);
fprintf(stderr, "Good so far.\n");
- // CHECK: Good so far.
+ // CHECK-DAG: Good so far.
fflush(stderr);
wcsncat(badDst, append, 3); // Boom!
- // CHECK: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}}
- // CHECK: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0
- // CHECK: #0 {{0x[0-9a-f]+}} in wcsncat
+ // CHECK-DAG: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}}
+ // CHECK-DAG: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0
+ // CHECK-DAG: #0 {{0x[0-9a-f]+}} in wcsncat
printf("Should have failed with ASAN error.\n");
-}
\ No newline at end of file
+}
diff --git a/compiler-rt/test/asan/TestCases/wcsncpy.cpp b/compiler-rt/test/asan/TestCases/wcsncpy.cpp
index 1106bf5..485ddc4 100644
--- a/compiler-rt/test/asan/TestCases/wcsncpy.cpp
+++ b/compiler-rt/test/asan/TestCases/wcsncpy.cpp
@@ -14,12 +14,11 @@ int main() {
wchar_t badDst[7];
wcsncpy(badDst, src, 7); // This should still work.
fprintf(stderr, "Good so far.\n");
- // CHECK: Good so far.
+ // CHECK-DAG: Good so far.
fflush(stderr);
-
wcsncpy(badDst, src, 15); // Boom!
- // CHECK: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}}
- // CHECK: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0
- // CHECK: #0 {{0x[0-9a-f]+}} in wcsncpy
+ // CHECK-DAG: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}}
+ // CHECK-DAG: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0
+ // CHECK-DAG: #0 {{0x[0-9a-f]+}} in wcsncpy
printf("Should have failed with ASAN error.\n");
-}
\ No newline at end of file
+}
diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index d3af3ba..2ce0d86 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -372,6 +372,8 @@ public:
return createCommonLinkage(getContext());
}
+ mlir::StringAttr createExternalLinkage() { return getStringAttr("external"); }
+
mlir::StringAttr createInternalLinkage() { return getStringAttr("internal"); }
mlir::StringAttr createLinkOnceLinkage() { return getStringAttr("linkonce"); }
diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
index 6e04c71..09126e0 100644
--- a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
@@ -143,7 +143,11 @@ struct CUFComputeSharedMemoryOffsetsAndSize
auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
std::string sharedMemGlobalName =
(funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
- mlir::StringAttr linkage = builder.createInternalLinkage();
+ // Dynamic shared memory needs external linkage, while static shared
+ // memory needs internal linkage.
+ mlir::StringAttr linkage = nbDynamicSharedVariables > 0
+ ? builder.createExternalLinkage()
+ : builder.createInternalLinkage();
builder.setInsertionPointToEnd(gpuMod.getBody());
llvm::SmallVector<mlir::NamedAttribute> attrs;
auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
diff --git a/flang/test/Fir/CUDA/cuda-shared-offset.mlir b/flang/test/Fir/CUDA/cuda-shared-offset.mlir
index 29316c9..9c057d0 100644
--- a/flang/test/Fir/CUDA/cuda-shared-offset.mlir
+++ b/flang/test/Fir/CUDA/cuda-shared-offset.mlir
@@ -17,7 +17,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
// CHECK: gpu.return
// CHECK: }
-// CHECK: fir.global internal @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
+// CHECK: fir.global external @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
// -----
@@ -158,3 +158,5 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK-LABEL: gpu.func @_QMmtestsPtestany
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf64>, %c-1{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>>
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>
+
+// CHECK: fir.global external @_QMmtestsPtestany__shared_mem {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 7960f34..ba0fc4b 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -225,7 +225,7 @@ set( tahiti_aliases pitcairn verde oland hainan bonaire kabini kaveri hawaii
gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036
gfx1100 gfx1101 gfx1102 gfx1103
gfx1150 gfx1151 gfx1152 gfx1153
- gfx1200 gfx1201
+ gfx1200 gfx1201 gfx1250 gfx1251
)
# pkg-config file
diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
index 571caf9..be690a4 100644
--- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h
+++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
@@ -59,6 +59,14 @@ LLVM_ABI std::string getAllocTypeAttributeString(AllocationType Type);
/// True if the AllocTypes bitmask contains just a single type.
LLVM_ABI bool hasSingleAllocType(uint8_t AllocTypes);
+/// Removes any existing "ambiguous" memprof attribute. Called before we apply a
+/// specific allocation type such as "cold", "notcold", or "hot".
+LLVM_ABI void removeAnyExistingAmbiguousAttribute(CallBase *CB);
+
+/// Adds an "ambiguous" memprof attribute to call with a matched allocation
+/// profile but that we haven't yet been able to disambiguate.
+LLVM_ABI void addAmbiguousAttribute(CallBase *CB);
+
/// Class to build a trie of call stack contexts for a particular profiled
/// allocation call, along with their associated allocation types.
/// The allocation will be at the root of the trie, which is then used to
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index 0c1f8db..92a5b6f 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -54,6 +54,10 @@ cl::opt<unsigned> MinPercentMaxColdSize(
"memprof-min-percent-max-cold-size", cl::init(100), cl::Hidden,
cl::desc("Min percent of max cold bytes for critical cold context"));
+LLVM_ABI cl::opt<bool> MemProfUseAmbiguousAttributes(
+ "memprof-ambiguous-attributes", cl::init(true), cl::Hidden,
+ cl::desc("Apply ambiguous memprof attribute to ambiguous allocations"));
+
} // end namespace llvm
bool llvm::memprof::metadataIncludesAllContextSizeInfo() {
@@ -125,6 +129,26 @@ bool llvm::memprof::hasSingleAllocType(uint8_t AllocTypes) {
return NumAllocTypes == 1;
}
+void llvm::memprof::removeAnyExistingAmbiguousAttribute(CallBase *CB) {
+ if (!CB->hasFnAttr("memprof"))
+ return;
+ assert(CB->getFnAttr("memprof").getValueAsString() == "ambiguous");
+ CB->removeFnAttr("memprof");
+}
+
+void llvm::memprof::addAmbiguousAttribute(CallBase *CB) {
+ if (!MemProfUseAmbiguousAttributes)
+ return;
+ // We may have an existing ambiguous attribute if we are reanalyzing
+ // after inlining.
+ if (CB->hasFnAttr("memprof")) {
+ assert(CB->getFnAttr("memprof").getValueAsString() == "ambiguous");
+ } else {
+ auto A = llvm::Attribute::get(CB->getContext(), "memprof", "ambiguous");
+ CB->addFnAttr(A);
+ }
+}
+
void CallStackTrie::addCallStack(
AllocationType AllocType, ArrayRef<uint64_t> StackIds,
std::vector<ContextTotalSize> ContextSizeInfo) {
@@ -470,6 +494,9 @@ void CallStackTrie::addSingleAllocTypeAttribute(CallBase *CI, AllocationType AT,
StringRef Descriptor) {
auto AllocTypeString = getAllocTypeAttributeString(AT);
auto A = llvm::Attribute::get(CI->getContext(), "memprof", AllocTypeString);
+ // After inlining we may be able to convert an existing ambiguous allocation
+ // to an unambiguous one.
+ removeAnyExistingAmbiguousAttribute(CI);
CI->addFnAttr(A);
if (MemProfReportHintedSizes) {
std::vector<ContextTotalSize> ContextSizeInfo;
@@ -529,6 +556,7 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
assert(MIBCallStack.size() == 1 &&
"Should only be left with Alloc's location in stack");
CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes));
+ addAmbiguousAttribute(CI);
return true;
}
// If there exists corner case that CallStackTrie has one chain to leaf
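A minimal usage sketch of the two helpers added to MemoryProfileInfo.h above; the wrapper function tagAllocation and its parameters are hypothetical and only show the intended call pattern (attach the placeholder while an allocation's contexts remain ambiguous, drop it before applying a concrete allocation type):

  // Hypothetical wrapper, not part of the patch; CB is a matched allocation call.
  #include "llvm/Analysis/MemoryProfileInfo.h"
  #include "llvm/IR/InstrTypes.h"

  using namespace llvm;

  static void tagAllocation(CallBase *CB, bool StillAmbiguous, AllocationType AT) {
    if (StillAmbiguous) {
      // Matched an allocation profile but no single type yet; this is a no-op
      // when -memprof-ambiguous-attributes is disabled.
      memprof::addAmbiguousAttribute(CB);
      return;
    }
    // A single type was resolved (e.g. after cloning or inlining): remove the
    // placeholder before attaching the concrete "cold"/"notcold"/"hot" value.
    memprof::removeAnyExistingAmbiguousAttribute(CB);
    CB->addFnAttr(Attribute::get(CB->getContext(), "memprof",
                                 memprof::getAllocTypeAttributeString(AT)));
  }

In IR the placeholder shows up as the "memprof"="ambiguous" string attribute, as exercised in the memprof-basic.ll and PGOProfile/memprof.ll test updates below.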
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index faeab95..cfdfd94 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -3986,6 +3986,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
void ModuleCallsiteContextGraph::updateAllocationCall(
CallInfo &Call, AllocationType AllocType) {
std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
+ removeAnyExistingAmbiguousAttribute(cast<CallBase>(Call.call()));
auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
"memprof", AllocTypeString);
cast<CallBase>(Call.call())->addFnAttr(A);
@@ -5661,9 +5662,10 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
// Include allocs that were already assigned a memprof function
- // attribute in the statistics.
- if (CB->getAttributes().hasFnAttr("memprof")) {
- assert(!MemProfMD);
+ // attribute in the statistics. Only do this for those that do not have
+ // memprof metadata, since we add an "ambiguous" memprof attribute by
+ // default.
+ if (CB->getAttributes().hasFnAttr("memprof") && !MemProfMD) {
CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
? AllocTypeColdThinBackend++
: AllocTypeNotColdThinBackend++;
@@ -5740,6 +5742,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// clone J-1 (J==0 is the original clone and does not have a VMaps
// entry).
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
+ removeAnyExistingAmbiguousAttribute(CBClone);
CBClone->addFnAttr(A);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
<< ore::NV("AllocationCall", CBClone) << " in clone "
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb6bfb2..56a3d6d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3903,8 +3903,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
if (VF.isScalar())
continue;
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
- *CM.PSE.getSE());
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4161,8 +4160,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
// Add on other costs that are modelled in VPlan, but not in the legacy
// cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
- *CM.PSE.getSE());
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
assert(VectorRegion && "Expected to have a vector region!");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6854,7 +6852,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
@@ -7087,8 +7085,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
- *CM.PSE.getSE());
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
@@ -8624,8 +8621,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
- *CM.PSE.getSE());
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}
@@ -10079,7 +10075,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
- CM.CostKind, *CM.PSE.getSE());
+ CM.CostKind);
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
LVP.getPlanFor(VF.Width), SEL,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 2555ebe..07b191a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1772,8 +1772,7 @@ VPCostContext::getOperandInfo(VPValue *V) const {
}
InstructionCost VPCostContext::getScalarizationOverhead(
- Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
- bool AlwaysIncludeReplicatingR) {
+ Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
if (VF.isScalar())
return 0;
@@ -1793,11 +1792,7 @@ InstructionCost VPCostContext::getScalarizationOverhead(
SmallPtrSet<const VPValue *, 4> UniqueOperands;
SmallVector<Type *> Tys;
for (auto *Op : Operands) {
- if (Op->isLiveIn() ||
- (!AlwaysIncludeReplicatingR &&
- isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
- (isa<VPReplicateRecipe>(Op) &&
- cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
+ if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
!UniqueOperands.insert(Op).second)
continue;
Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 1580a3b..fc1a09e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -349,14 +349,12 @@ struct VPCostContext {
LoopVectorizationCostModel &CM;
SmallPtrSet<Instruction *, 8> SkipCostComputation;
TargetTransformInfo::TargetCostKind CostKind;
- ScalarEvolution &SE;
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
const VPlan &Plan, LoopVectorizationCostModel &CM,
- TargetTransformInfo::TargetCostKind CostKind,
- ScalarEvolution &SE)
+ TargetTransformInfo::TargetCostKind CostKind)
: TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
- CostKind(CostKind), SE(SE) {}
+ CostKind(CostKind) {}
/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
@@ -376,12 +374,10 @@ struct VPCostContext {
/// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
/// and \p Operands with \p VF. This is a convenience wrapper for the
- /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
- /// is true, always compute the cost of scalarizing replicating operands.
- InstructionCost
- getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
- ElementCount VF,
- bool AlwaysIncludeReplicatingR = false);
+ /// type-based getScalarizationOverhead API.
+ InstructionCost getScalarizationOverhead(Type *ResultTy,
+ ArrayRef<const VPValue *> Operands,
+ ElementCount VF);
};
/// This class can be used to assign names to VPValues. For VPValues without
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 43d61f2..67b9244 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -40,7 +40,6 @@
#include <cassert>
using namespace llvm;
-using namespace llvm::VPlanPatternMatch;
using VectorParts = SmallVector<Value *, 2>;
@@ -304,6 +303,7 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
VPRecipeBase *OpR = Op->getDefiningRecipe();
// If the partial reduction is predicated, a select will be operand 0
+ using namespace llvm::VPlanPatternMatch;
if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
OpR = Op->getDefiningRecipe();
}
@@ -1963,6 +1963,7 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
VPValue *Op0, *Op1;
+ using namespace llvm::VPlanPatternMatch;
if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
(match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
@@ -3110,62 +3111,6 @@ bool VPReplicateRecipe::shouldPack() const {
});
}
-/// Returns true if \p Ptr is a pointer computation for which the legacy cost
-/// model computes a SCEV expression when computing the address cost.
-static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
- auto *PtrR = Ptr->getDefiningRecipe();
- if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
- cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
- Instruction::GetElementPtr) ||
- isa<VPWidenGEPRecipe>(PtrR) ||
- match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
- return false;
-
- // We are looking for a GEP where all indices are either loop invariant or
- // inductions.
- for (VPValue *Opd : drop_begin(PtrR->operands())) {
- if (!Opd->isDefinedOutsideLoopRegions() &&
- !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
- return false;
- }
-
- return true;
-}
-
-/// Returns true if \p V is used as part of the address of another load or
-/// store.
-static bool isUsedByLoadStoreAddress(const VPUser *V) {
- SmallPtrSet<const VPUser *, 4> Seen;
- SmallVector<const VPUser *> WorkList = {V};
-
- while (!WorkList.empty()) {
- auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
- if (!Cur || !Seen.insert(Cur).second)
- continue;
-
- for (VPUser *U : Cur->users()) {
- if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
- if (InterleaveR->getAddr() == Cur)
- return true;
- if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
- if (RepR->getOpcode() == Instruction::Load &&
- RepR->getOperand(0) == Cur)
- return true;
- if (RepR->getOpcode() == Instruction::Store &&
- RepR->getOperand(1) == Cur)
- return true;
- }
- if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
- if (MemR->getAddr() == Cur && MemR->isConsecutive())
- return true;
- }
- }
-
- append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
- }
- return false;
-}
-
InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3273,58 +3218,21 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
}
case Instruction::Load:
case Instruction::Store: {
- if (VF.isScalable() && !isSingleScalar())
- return InstructionCost::getInvalid();
-
+ if (isSingleScalar()) {
+ bool IsLoad = UI->getOpcode() == Instruction::Load;
+ Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+ Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
+ const Align Alignment = getLoadStoreAlignment(UI);
+ unsigned AS = getLoadStoreAddressSpace(UI);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+ InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+ UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
+ return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+ ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
+ }
// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
- const VPRegionBlock *ParentRegion = getParent()->getParent();
- if (ParentRegion && ParentRegion->isReplicator())
- break;
-
- bool IsLoad = UI->getOpcode() == Instruction::Load;
- const VPValue *PtrOp = getOperand(!IsLoad);
- // TODO: Handle cases where we need to pass a SCEV to
- // getAddressComputationCost.
- if (shouldUseAddressAccessSCEV(PtrOp))
- break;
-
- Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
- Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
- const Align Alignment = getLoadStoreAlignment(UI);
- unsigned AS = getLoadStoreAddressSpace(UI);
- TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
- InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
- UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
-
- Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
-
- InstructionCost ScalarCost =
- ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
- PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
- if (isSingleScalar())
- return ScalarCost;
-
- SmallVector<const VPValue *> OpsToScalarize;
- Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
- // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
- // don't assign scalarization overhead in general, if the target prefers
- // vectorized addressing or the loaded value is used as part of an address
- // of another load or store.
- bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
- if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
- bool EfficientVectorLoadStore =
- Ctx.TTI.supportsEfficientVectorElementLoadStore();
- if (!(IsLoad && !PreferVectorizedAddressing) &&
- !(!IsLoad && EfficientVectorLoadStore))
- append_range(OpsToScalarize, operands());
-
- if (!EfficientVectorLoadStore)
- ResultTy = Ctx.Types.inferScalarType(this);
- }
-
- return (ScalarCost * VF.getFixedValue()) +
- Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
+ break;
}
}
diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
index 0ff0ce0..537e1b8 100644
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -103,7 +103,9 @@ declare i32 @sleep()
define internal ptr @_Z3barv() #0 !dbg !15 {
entry:
- %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7
+ ;; Use an ambiguous attribute for this allocation, which is now added to such
+ ;; allocations during matching. It should not affect cloning.
+ %call = call ptr @_Znam(i64 0) #1, !memprof !2, !callsite !7
ret ptr null
}
@@ -125,6 +127,7 @@ entry:
uselistorder ptr @_Z3foov, { 1, 0 }
attributes #0 = { noinline optnone }
+attributes #1 = { "memprof"="ambiguous" }
!llvm.dbg.cu = !{!13}
!llvm.module.flags = !{!20, !21}
diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll
index c69d031..f6a89a8 100644
--- a/llvm/test/Transforms/PGOProfile/memprof.ll
+++ b/llvm/test/Transforms/PGOProfile/memprof.ll
@@ -38,7 +38,7 @@
; ALL-NOT: no profile data available for function
;; Using a memprof-only profile for memprof-use should only give memprof metadata
-; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-print-match-info -stats 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL,MEMPROFONLY,MEMPROFMATCHINFO,MEMPROFSTATS
+; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-print-match-info -stats 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL,MEMPROFONLY,MEMPROFMATCHINFO,MEMPROFSTATS,AMBIG
; There should not be any PGO metadata
; MEMPROFONLY-NOT: !prof
@@ -51,10 +51,10 @@
;; Test the same thing but by passing the memory profile through to a default
;; pipeline via -memory-profile-file=, which should cause the necessary field
;; of the PGOOptions structure to be populated with the profile filename.
-; RUN: opt < %s -passes='default<O2>' -memory-profile-file=%t.memprofdata -pgo-warn-missing-function -S 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL,MEMPROFONLY
+; RUN: opt < %s -passes='default<O2>' -memory-profile-file=%t.memprofdata -pgo-warn-missing-function -S 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL,MEMPROFONLY,AMBIG
;; Using a pgo+memprof profile for memprof-use should only give memprof metadata
-; RUN: opt < %s -passes='memprof-use<profile-filename=%t.pgomemprofdata>' -pgo-warn-missing-function -S 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL,MEMPROFONLY
+; RUN: opt < %s -passes='memprof-use<profile-filename=%t.pgomemprofdata>' -pgo-warn-missing-function -S 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL,MEMPROFONLY,AMBIG
;; Using a pgo-only profile for memprof-use should give an error
; RUN: not opt < %s -passes='memprof-use<profile-filename=%t.pgoprofdata>' -S 2>&1 | FileCheck %s --check-prefixes=MEMPROFWITHPGOONLY
@@ -72,7 +72,7 @@
;; Using a pgo+memprof profile for both memprof-use and pgo-instr-use should
;; give both memprof and pgo metadata.
-; RUN: opt < %s -passes='pgo-instr-use,memprof-use<profile-filename=%t.pgomemprofdata>' -pgo-test-profile-file=%t.pgomemprofdata -pgo-warn-missing-function -S 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL,PGO
+; RUN: opt < %s -passes='pgo-instr-use,memprof-use<profile-filename=%t.pgomemprofdata>' -pgo-test-profile-file=%t.pgomemprofdata -pgo-warn-missing-function -S 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL,PGO,AMBIG
;; Check that the total sizes are reported if requested. A message should be
;; emitted for the pruned context. Also check that remarks are emitted for the
@@ -108,7 +108,11 @@
;; However, with the same threshold, but hot hints not enabled, it should be
;; notcold again.
-; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-min-ave-lifetime-access-density-hot-threshold=0 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL
+; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-min-ave-lifetime-access-density-hot-threshold=0 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL,AMBIG
+
+;; Test that we don't get an ambiguous memprof attribute when
+;; -memprof-ambiguous-attributes is disabled.
+; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-ambiguous-attributes=false 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL,NOAMBIG
; MEMPROFMATCHINFO: MemProf notcold context with id 1093248920606587996 has total profiled size 10 is matched with 1 frames
; MEMPROFMATCHINFO: MemProf notcold context with id 5725971306423925017 has total profiled size 10 is matched with 1 frames
@@ -140,7 +144,7 @@ target triple = "x86_64-unknown-linux-gnu"
; PGO: !prof
define dso_local noundef ptr @_Z3foov() #0 !dbg !10 {
entry:
- ; MEMPROF: call {{.*}} @_Znam{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]]
+ ; MEMPROF: call {{.*}} @_Znam{{.*}} #[[A0:[0-9]+]]{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]]
; MEMPROFNOCOLINFO: call {{.*}} @_Znam{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]]
%call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !dbg !13
ret ptr %call, !dbg !14
@@ -364,6 +368,9 @@ for.end: ; preds = %for.cond
ret i32 0, !dbg !103
}
+;; We optionally apply an ambiguous memprof attribute to ambiguous allocations
+; AMBIG: #[[A0]] = { builtin allocsize(0) "memprof"="ambiguous" }
+; NOAMBIG: #[[A0]] = { builtin allocsize(0) }
; MEMPROF: #[[A1]] = { builtin allocsize(0) "memprof"="notcold" }
; MEMPROF: #[[A2]] = { builtin allocsize(0) "memprof"="cold" }
; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]], ![[MIB3:[0-9]+]], ![[MIB4:[0-9]+]]}
diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c
index e376d82..a2f4b3e 100644
--- a/llvm/tools/llvm-c-test/debuginfo.c
+++ b/llvm/tools/llvm-c-test/debuginfo.c
@@ -328,8 +328,10 @@ int llvm_test_dibuilder(void) {
// Test that LLVMGetFirstDbgRecord and LLVMGetLastDbgRecord return NULL for
// instructions without debug info.
LLVMDbgRecordRef Phi1FirstDbgRecord = LLVMGetFirstDbgRecord(Phi1);
+ (void)Phi1FirstDbgRecord;
assert(Phi1FirstDbgRecord == NULL);
LLVMDbgRecordRef Phi1LastDbgRecord = LLVMGetLastDbgRecord(Phi1);
+ (void)Phi1LastDbgRecord;
assert(Phi1LastDbgRecord == NULL);
// Insert a non-phi before the `ret` but not before the debug records to
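The (void) casts added above keep the two variables referenced when asserts are compiled out (NDEBUG builds), avoiding unused-variable warnings; the same pattern in isolation:

  #include <cassert>

  int main() {
    int *p = nullptr;  // stands in for LLVMGetFirstDbgRecord(Phi1)
    (void)p;           // still "used" when NDEBUG removes the assert below
    assert(p == nullptr);
    return 0;
  }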
diff --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
index d8457a3..d1c0f64 100644
--- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
+++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
@@ -230,7 +230,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
CallBase *Call = findCall(*Func, "call");
Trie.buildAndAttachMIBMetadata(Call);
- EXPECT_FALSE(Call->hasFnAttr("memprof"));
+ EXPECT_TRUE(Call->hasFnAttr("memprof"));
+ EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
ASSERT_EQ(MemProfMD->getNumOperands(), 2u);
@@ -279,7 +280,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
CallBase *Call = findCall(*Func, "call");
Trie.buildAndAttachMIBMetadata(Call);
- EXPECT_FALSE(Call->hasFnAttr("memprof"));
+ EXPECT_TRUE(Call->hasFnAttr("memprof"));
+ EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
ASSERT_EQ(MemProfMD->getNumOperands(), 2u);
@@ -333,7 +335,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
CallBase *Call = findCall(*Func, "call");
Trie.buildAndAttachMIBMetadata(Call);
- EXPECT_FALSE(Call->hasFnAttr("memprof"));
+ EXPECT_TRUE(Call->hasFnAttr("memprof"));
+ EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
ASSERT_EQ(MemProfMD->getNumOperands(), 2u);
@@ -392,7 +395,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
CallBase *Call = findCall(*Func, "call");
Trie.buildAndAttachMIBMetadata(Call);
- EXPECT_FALSE(Call->hasFnAttr("memprof"));
+ EXPECT_TRUE(Call->hasFnAttr("memprof"));
+ EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
ASSERT_EQ(MemProfMD->getNumOperands(), 2u);
@@ -463,7 +467,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
ASSERT_NE(Call, nullptr);
Trie.buildAndAttachMIBMetadata(Call);
- EXPECT_FALSE(Call->hasFnAttr("memprof"));
+ EXPECT_TRUE(Call->hasFnAttr("memprof"));
+ EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
EXPECT_THAT(MemProfMD, MemprofMetadataEquals(ExpectedVals));
@@ -536,7 +541,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
// Restore original option value.
MemProfKeepAllNotColdContexts = OrigMemProfKeepAllNotColdContexts;
- EXPECT_FALSE(Call->hasFnAttr("memprof"));
+ EXPECT_TRUE(Call->hasFnAttr("memprof"));
+ EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
EXPECT_THAT(MemProfMD, MemprofMetadataEquals(ExpectedVals));
@@ -664,7 +670,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
// The hot allocations will be converted to NotCold and pruned as they
// are unnecessary to determine how to clone the cold allocation.
- EXPECT_FALSE(Call->hasFnAttr("memprof"));
+ EXPECT_TRUE(Call->hasFnAttr("memprof"));
+ EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
ASSERT_EQ(MemProfMD->getNumOperands(), 2u);
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index 5448976f..bccf5d5b 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -3377,6 +3377,9 @@ func.func @negative_from_elements_to_constant() -> vector<1x!llvm.ptr> {
// -----
+// `foldFromElementsToConstant` does not support `ub.poison`, so it bails out.
+// Instead, other folders apply here (e.g. `rewriteFromElementsAsBroadcast`).
+
// CHECK-LABEL: @negative_from_elements_poison
// CHECK: %[[VAL:.*]] = ub.poison : vector<2xf32>
// CHECK: return %[[VAL]] : vector<2xf32>
@@ -3388,6 +3391,9 @@ func.func @negative_from_elements_poison_f32() -> vector<2xf32> {
// -----
+// `foldFromElementsToConstant` does not support `ub.poison`, so it bails out.
+// Instead, other folders apply here (e.g. `rewriteFromElementsAsBroadcast`).
+
// CHECK-LABEL: @negative_from_elements_poison_i32
// CHECK: %[[VAL:.*]] = ub.poison : vector<2xi32>
// CHECK: return %[[VAL]] : vector<2xi32>
@@ -3399,6 +3405,9 @@ func.func @negative_from_elements_poison_i32() -> vector<2xi32> {
// -----
+// `foldFromElementsToConstant` does not support `ub.poison`, so it bails out.
+// Instead, other folders apply here (e.g. `rewriteFromElementsAsBroadcast`).
+
// CHECK-LABEL: @negative_from_elements_poison_constant_mix
// CHECK: %[[POISON:.*]] = ub.poison : f32
// CHECK: %[[CONST:.*]] = arith.constant 1.000000e+00 : f32