34 files changed, 335 insertions, 117 deletions
diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp
index 6fba211..b17cc80 100644
--- a/clang-tools-extra/clang-doc/JSONGenerator.cpp
+++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp
@@ -584,12 +584,20 @@ static SmallString<16> determineFileName(Info *I, SmallString<128> &Path) {
     FileName = RecordSymbolInfo->MangledName;
   } else if (I->USR == GlobalNamespaceID)
     FileName = "index";
-  else
+  else if (I->IT == InfoType::IT_namespace) {
+    for (const auto &NS : I->Namespace) {
+      FileName += NS.Name;
+      FileName += "_";
+    }
+    FileName += I->Name;
+  } else
     FileName = I->Name;
   sys::path::append(Path, FileName + ".json");
   return FileName;
 }
 
+// FIXME: Revert back to creating nested directories for namespaces instead of
+// putting everything in a flat directory structure.
 Error JSONGenerator::generateDocs(
     StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
     const ClangDocContext &CDCtx) {
diff --git a/clang-tools-extra/test/clang-doc/json/multiple-namespaces.cpp b/clang-tools-extra/test/clang-doc/json/multiple-namespaces.cpp
new file mode 100644
index 0000000..04fcfc1
--- /dev/null
+++ b/clang-tools-extra/test/clang-doc/json/multiple-namespaces.cpp
@@ -0,0 +1,20 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: clang-doc --output=%t --format=json --executor=standalone %s
+// RUN: FileCheck %s < %t/json/foo_tools.json --check-prefix=CHECK-FOO
+// RUN: FileCheck %s < %t/json/bar_tools.json --check-prefix=CHECK-BAR
+
+namespace foo {
+  namespace tools {
+    class FooTools {};
+  } // namespace tools
+} // namespace foo
+
+namespace bar {
+  namespace tools {
+    class BarTools {};
+  } // namespace tools
+} // namespace bar
+
+// CHECK-FOO: "Name": "tools"
+
+// CHECK-BAR: "Name": "tools"
diff --git a/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp b/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp
index b19afc1..cf19e1e 100644
--- a/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp
+++ b/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp
@@ -1,7 +1,7 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: clang-doc --output=%t --format=json --executor=standalone %s
 // RUN: FileCheck %s < %t/json/nested.json --check-prefix=NESTED
-// RUN: FileCheck %s < %t/json/inner.json --check-prefix=INNER
+// RUN: FileCheck %s < %t/json/nested_inner.json --check-prefix=INNER
 
 namespace nested {
   int Global;
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index d924cb4..90e1f8d 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -72,6 +72,8 @@ CODEGENOPT(EnableNoundefAttrs, 1, 0, Benign) ///< Enable emitting `noundef` attr
 CODEGENOPT(DebugPassManager, 1, 0, Benign) ///< Prints debug information for the new
                                            ///< pass manager.
 CODEGENOPT(DisableRedZone    , 1, 0, Benign) ///< Set when -mno-red-zone is enabled.
+CODEGENOPT(CallGraphSection, 1, 0, Benign) ///< Emit a call graph section into the
+                                           ///< object file.
 CODEGENOPT(EmitCallSiteInfo, 1, 0, Benign) ///< Emit call site info only in the case of
                                            ///< '-g' + 'O>0' level.
 CODEGENOPT(IndirectTlsSegRefs, 1, 0, Benign) ///< Set when -mno-tls-direct-seg-refs
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index c2f2ac5..a55a523 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4534,6 +4534,12 @@ defm data_sections : BoolFOption<"data-sections",
   PosFlag<SetTrue, [], [ClangOption, CC1Option],
           "Place each data in its own section">,
   NegFlag<SetFalse>>;
+defm experimental_call_graph_section
+    : BoolFOption<"experimental-call-graph-section",
+                  CodeGenOpts<"CallGraphSection">, DefaultFalse,
+                  PosFlag<SetTrue, [], [ClangOption, CC1Option],
+                          "Emit a call graph section">,
+                  NegFlag<SetFalse>>;
 defm stack_size_section : BoolFOption<"stack-size-section",
   CodeGenOpts<"StackSizeSection">, DefaultFalse,
   PosFlag<SetTrue, [], [ClangOption, CC1Option],
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 2d95982..f8e8086 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -473,6 +473,7 @@ static bool initTargetOptions(const CompilerInstance &CI,
   Options.StackUsageOutput = CodeGenOpts.StackUsageOutput;
   Options.EmitAddrsig = CodeGenOpts.Addrsig;
   Options.ForceDwarfFrameSection = CodeGenOpts.ForceDwarfFrameSection;
+  Options.EmitCallGraphSection = CodeGenOpts.CallGraphSection;
   Options.EmitCallSiteInfo = CodeGenOpts.EmitCallSiteInfo;
   Options.EnableAIXExtendedAltivecABI = LangOpts.EnableAIXExtendedAltivecABI;
   Options.XRayFunctionIndex = CodeGenOpts.XRayFunctionIndex;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index d326a81..bf75573 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6442,6 +6442,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(A->getValue());
   }
 
+  if (Args.hasFlag(options::OPT_fexperimental_call_graph_section,
+                   options::OPT_fno_experimental_call_graph_section, false))
+    CmdArgs.push_back("-fexperimental-call-graph-section");
+
   Args.addOptInFlag(CmdArgs, options::OPT_fstack_size_section,
                     options::OPT_fno_stack_size_section);
 
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 16cc1db..99400ac 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1272,6 +1272,11 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
     CmdArgs.push_back(
         Args.MakeArgString(Twine(PluginOptPrefix) + "-stack-size-section"));
 
+  if (Args.hasFlag(options::OPT_fexperimental_call_graph_section,
+                   options::OPT_fno_experimental_call_graph_section, false))
+    CmdArgs.push_back(
+        Args.MakeArgString(Twine(PluginOptPrefix) + "-call-graph-section"));
+
   // Setup statistics file output.
   SmallString<128> StatsFile = getStatsFileName(Args, Output, *Input, D);
   if (!StatsFile.empty())
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 86a5185..ab32938 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -93,12 +93,6 @@ ArrayRef<FormatToken *> FormatTokenLexer::lex() {
     auto &Tok = *Tokens.back();
     const auto NewlinesBefore = Tok.NewlinesBefore;
     switch (FormatOff) {
-    case FO_CurrentLine:
-      if (NewlinesBefore == 0)
-        Tok.Finalized = true;
-      else
-        FormatOff = FO_None;
-      break;
     case FO_NextLine:
       if (NewlinesBefore > 1) {
         FormatOff = FO_None;
@@ -107,6 +101,13 @@ ArrayRef<FormatToken *> FormatTokenLexer::lex() {
         FormatOff = FO_CurrentLine;
       }
       break;
+    case FO_CurrentLine:
+      if (NewlinesBefore == 0) {
+        Tok.Finalized = true;
+        break;
+      }
+      FormatOff = FO_None;
+      [[fallthrough]];
     default:
       if (!FormattingDisabled && FormatOffRegex.match(Tok.TokenText)) {
         if (Tok.is(tok::comment) &&
diff --git a/clang/test/Driver/call-graph-section.c b/clang/test/Driver/call-graph-section.c
new file mode 100644
index 0000000..00fa896
--- /dev/null
+++ b/clang/test/Driver/call-graph-section.c
@@ -0,0 +1,5 @@
+// RUN: %clang -### -fexperimental-call-graph-section %s 2>&1 | FileCheck --check-prefix=CALL-GRAPH-SECTION %s
+// RUN: %clang -### -fexperimental-call-graph-section -fno-experimental-call-graph-section %s 2>&1 | FileCheck --check-prefix=NO-CALL-GRAPH-SECTION %s
+
+// CALL-GRAPH-SECTION: "-fexperimental-call-graph-section"
+// NO-CALL-GRAPH-SECTION-NOT: "-fexperimental-call-graph-section"
diff --git a/clang/unittests/Analysis/CMakeLists.txt b/clang/unittests/Analysis/CMakeLists.txt
index 52e7d28..e0acf43 100644
--- a/clang/unittests/Analysis/CMakeLists.txt
+++ b/clang/unittests/Analysis/CMakeLists.txt
@@ -11,6 +11,7 @@ add_clang_unittest(ClangAnalysisTests
   clangAST
   clangASTMatchers
   clangAnalysis
+  clangAnalysisLifetimeSafety
   clangBasic
   clangFrontend
   clangLex
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index fef7036..450c34f 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -24843,6 +24843,11 @@ TEST_F(FormatTest, OneLineFormatOffRegex) {
                " } while (0 )",
                Style);
 
+  Style.OneLineFormatOffRegex = "MACRO_TEST";
+  verifyNoChange(" MACRO_TEST1 ( ) ;\n"
+                 "   MACRO_TEST2( );",
+                 Style);
+
   Style.ColumnLimit = 50;
   Style.OneLineFormatOffRegex = "^LogErrorPrint$";
   verifyFormat(" myproject::LogErrorPrint(logger, \"Don't split me!\");\n"
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 1f7da10..0e3c9aa2 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -274,6 +274,7 @@ struct IntrinsicLibrary {
                                       llvm::ArrayRef<fir::ExtendedValue>);
   template <Extremum, ExtremumBehavior>
   mlir::Value genExtremum(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genFloor(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genFraction(mlir::Type resultType,
                           mlir::ArrayRef<mlir::Value> args);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index b3a2d49..7c5c5fb 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -502,6 +502,10 @@ static constexpr IntrinsicHandler handlers[]{
      &I::genExtendsTypeOf,
      {{{"a", asBox}, {"mold", asBox}}},
      /*isElemental=*/false},
+    {"fence_proxy_async",
+     &I::genFenceProxyAsync,
+     {},
+     /*isElemental=*/false},
     {"findloc",
      &I::genFindloc,
      {{{"array", asBox},
@@ -4367,6 +4371,17 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType,
                                      fir::getBase(args[1])));
 }
 
+// FENCE_PROXY_ASYNC (CUDA)
+void IntrinsicLibrary::genFenceProxyAsync(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 0);
+  auto kind = mlir::NVVM::ProxyKindAttr::get(
+      builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
+  auto space = mlir::NVVM::SharedSpaceAttr::get(
+      builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
+  mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
+}
+
 // FINDLOC
 fir::ExtendedValue
 IntrinsicLibrary::genFindloc(mlir::Type resultType,
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index afb39eb..106f3e2 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -2009,6 +2009,11 @@ implicit none
   end interface
 
   interface
+    attributes(device) subroutine fence_proxy_async()
+    end subroutine
+  end interface
+
+  interface
     attributes(device) subroutine tma_bulk_commit_group()
     end subroutine
   end interface
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index eef4225..697b17b 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -421,6 +421,14 @@ end subroutine
 ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
 ! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32
 
+
+attributes(global) subroutine test_fence()
+  call fence_proxy_async()
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_fence()
+! CHECK: nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>}
+
 attributes(global) subroutine test_tma()
   call tma_bulk_commit_group()
   call tma_bulk_wait_group()
diff --git a/libc/shared/math.h b/libc/shared/math.h
index 82b9250..e3f7965 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -49,6 +49,7 @@
 #include "math/exp10m1f16.h"
 #include "math/exp2.h"
 #include "math/exp2f.h"
+#include "math/exp2f16.h"
 #include "math/expf.h"
 #include "math/expf16.h"
 #include "math/frexpf.h"
diff --git a/libc/shared/math/exp2f16.h b/libc/shared/math/exp2f16.h
new file mode 100644
index 0000000..f799511
--- /dev/null
+++ b/libc/shared/math/exp2f16.h
@@ -0,0 +1,29 @@
+//===-- Shared exp2f16 function ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_EXP2F16_H
+#define LLVM_LIBC_SHARED_MATH_EXP2F16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+#include "shared/libc_common.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/math/exp2f16.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::exp2f16;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_EXP2F16_H
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index 61253de..9685496 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -738,6 +738,20 @@ add_header_library(
 )
 
 add_header_library(
+  exp2f16
+  HDRS
+    exp2f16.h
+  DEPENDS
+    .expxf16_utils
+    libc.src.__support.FPUtil.cast
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.macros.optimization
+)
+
+add_header_library(
   exp10
   HDRS
     exp10.h
diff --git a/libc/src/__support/math/exp2f16.h b/libc/src/__support/math/exp2f16.h
new file mode 100644
index 0000000..599ba0f
--- /dev/null
+++ b/libc/src/__support/math/exp2f16.h
@@ -0,0 +1,111 @@
+//===-- Implementation header for exp2f16 -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP2F16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP2F16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "expxf16_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE static constexpr float16 exp2f16(float16 x) {
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  constexpr fputil::ExceptValues<float16, 3> EXP2F16_EXCEPTS = {{
+      // (input, RZ output, RU offset, RD offset, RN offset)
+      // x = 0x1.714p-11, exp2f16(x) = 0x1p+0 (RZ)
+      {0x11c5U, 0x3c00U, 1U, 0U, 1U},
+      // x = -0x1.558p-4, exp2f16(x) = 0x1.e34p-1 (RZ)
+      {0xad56U, 0x3b8dU, 1U, 0U, 0U},
+      // x = -0x1.d5cp-4, exp2f16(x) = 0x1.d8cp-1 (RZ)
+      {0xaf57U, 0x3b63U, 1U, 0U, 0U},
+  }};
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  using namespace math::expxf16_internal;
+  using FPBits = fputil::FPBits<float16>;
+  FPBits x_bits(x);
+
+  uint16_t x_u = x_bits.uintval();
+  uint16_t x_abs = x_u & 0x7fffU;
+
+  // When |x| >= 16, or x is NaN.
+  if (LIBC_UNLIKELY(x_abs >= 0x4c00U)) {
+    // exp2(NaN) = NaN
+    if (x_bits.is_nan()) {
+      if (x_bits.is_signaling_nan()) {
+        fputil::raise_except_if_required(FE_INVALID);
+        return FPBits::quiet_nan().get_val();
+      }
+
+      return x;
+    }
+
+    // When x >= 16.
+    if (x_bits.is_pos()) {
+      // exp2(+inf) = +inf
+      if (x_bits.is_inf())
+        return FPBits::inf().get_val();
+
+      switch (fputil::quick_get_round()) {
+      case FE_TONEAREST:
+      case FE_UPWARD:
+        fputil::set_errno_if_required(ERANGE);
+        fputil::raise_except_if_required(FE_OVERFLOW);
+        return FPBits::inf().get_val();
+      default:
+        return FPBits::max_normal().get_val();
+      }
+    }
+
+    // When x <= -25.
+    if (x_u >= 0xce40U) {
+      // exp2(-inf) = +0
+      if (x_bits.is_inf())
+        return FPBits::zero().get_val();
+
+      fputil::set_errno_if_required(ERANGE);
+      fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT);
+
+      if (fputil::fenv_is_round_up())
+        return FPBits::min_subnormal().get_val();
+      return FPBits::zero().get_val();
+    }
+  }
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  if (auto r = EXP2F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  // exp2(x) = exp2(hi + mid) * exp2(lo)
+  auto [exp2_hi_mid, exp2_lo] = exp2_range_reduction(x);
+  return fputil::cast<float16>(exp2_hi_mid * exp2_lo);
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP2F16_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 55f4aaf..0754b5e 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1478,15 +1478,7 @@ add_entrypoint_object(
   HDRS
     ../exp2f16.h
   DEPENDS
-    libc.hdr.errno_macros
-    libc.hdr.fenv_macros
-    libc.src.__support.FPUtil.cast
-    libc.src.__support.FPUtil.except_value_utils
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.rounding_mode
-    libc.src.__support.macros.optimization
-    libc.src.__support.math.expxf16_utils
+    libc.src.__support.math.exp2f16
 )
 
 add_entrypoint_object(
diff --git a/libc/src/math/generic/exp2f16.cpp b/libc/src/math/generic/exp2f16.cpp
index 5db0c3a..80799d4 100644
--- a/libc/src/math/generic/exp2f16.cpp
+++ b/libc/src/math/generic/exp2f16.cpp
@@ -7,92 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/exp2f16.h"
-#include "hdr/errno_macros.h"
-#include "hdr/fenv_macros.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/cast.h"
-#include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h"
-#include "src/__support/math/expxf16_utils.h"
+#include "src/__support/math/exp2f16.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-static constexpr fputil::ExceptValues<float16, 3> EXP2F16_EXCEPTS = {{
-    // (input, RZ output, RU offset, RD offset, RN offset)
-    // x = 0x1.714p-11, exp2f16(x) = 0x1p+0 (RZ)
-    {0x11c5U, 0x3c00U, 1U, 0U, 1U},
-    // x = -0x1.558p-4, exp2f16(x) = 0x1.e34p-1 (RZ)
-    {0xad56U, 0x3b8dU, 1U, 0U, 0U},
-    // x = -0x1.d5cp-4, exp2f16(x) = 0x1.d8cp-1 (RZ)
-    {0xaf57U, 0x3b63U, 1U, 0U, 0U},
-}};
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-LLVM_LIBC_FUNCTION(float16, exp2f16, (float16 x)) {
-  using namespace math::expxf16_internal;
-  using FPBits = fputil::FPBits<float16>;
-  FPBits x_bits(x);
-
-  uint16_t x_u = x_bits.uintval();
-  uint16_t x_abs = x_u & 0x7fffU;
-
-  // When |x| >= 16, or x is NaN.
-  if (LIBC_UNLIKELY(x_abs >= 0x4c00U)) {
-    // exp2(NaN) = NaN
-    if (x_bits.is_nan()) {
-      if (x_bits.is_signaling_nan()) {
-        fputil::raise_except_if_required(FE_INVALID);
-        return FPBits::quiet_nan().get_val();
-      }
-
-      return x;
-    }
-
-    // When x >= 16.
-    if (x_bits.is_pos()) {
-      // exp2(+inf) = +inf
-      if (x_bits.is_inf())
-        return FPBits::inf().get_val();
-
-      switch (fputil::quick_get_round()) {
-      case FE_TONEAREST:
-      case FE_UPWARD:
-        fputil::set_errno_if_required(ERANGE);
-        fputil::raise_except_if_required(FE_OVERFLOW);
-        return FPBits::inf().get_val();
-      default:
-        return FPBits::max_normal().get_val();
-      }
-    }
-
-    // When x <= -25.
-    if (x_u >= 0xce40U) {
-      // exp2(-inf) = +0
-      if (x_bits.is_inf())
-        return FPBits::zero().get_val();
-
-      fputil::set_errno_if_required(ERANGE);
-      fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT);
-
-      if (fputil::fenv_is_round_up())
-        return FPBits::min_subnormal().get_val();
-      return FPBits::zero().get_val();
-    }
-  }
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  if (auto r = EXP2F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
-    return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-  // exp2(x) = exp2(hi + mid) * exp2(lo)
-  auto [exp2_hi_mid, exp2_lo] = exp2_range_reduction(x);
-  return fputil::cast<float16>(exp2_hi_mid * exp2_lo);
-}
+LLVM_LIBC_FUNCTION(float16, exp2f16, (float16 x)) { return math::exp2f16(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt
index f341d3f..8d81199 100644
--- a/libc/test/shared/CMakeLists.txt
+++ b/libc/test/shared/CMakeLists.txt
@@ -42,6 +42,7 @@ add_fp_unittest(
     libc.src.__support.math.exp
     libc.src.__support.math.exp2
     libc.src.__support.math.exp2f
+    libc.src.__support.math.exp2f16
     libc.src.__support.math.exp10
     libc.src.__support.math.exp10f
     libc.src.__support.math.exp10f16
diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp
index 477b7ec..84787d5 100644
--- a/libc/test/shared/shared_math_test.cpp
+++ b/libc/test/shared/shared_math_test.cpp
@@ -28,7 +28,7 @@ TEST(LlvmLibcSharedMathTest, AllFloat16) {
   EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::cospif16(0.0f16));
   EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::exp10f16(0.0f16));
   EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::exp10m1f16(0.0f16));
-
+  EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::exp2f16(0.0f16));
   EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::expf16(0.0f16));
 
   ASSERT_FP_EQ(float16(8 << 5), LIBC_NAMESPACE::shared::ldexpf16(8.0f16, 5));
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e62d57e..50136a8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9348,13 +9348,12 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
   Header->setName("vec.epilog.vector.body");
 
-  // Ensure that the start values for all header phi recipes are updated before
-  // vectorizing the epilogue loop.
   VPCanonicalIVPHIRecipe *IV = Plan.getCanonicalIV();
-  // When vectorizing the epilogue loop, the canonical induction start
-  // value needs to be changed from zero to the value after the main
-  // vector loop. Find the resume value created during execution of the main
-  // VPlan. It must be the first phi in the loop preheader.
+  // When vectorizing the epilogue loop, the canonical induction needs to be
+  // adjusted by the value after the main vector loop. Find the resume value
+  // created during execution of the main VPlan. It must be the first phi in the
+  // loop preheader. Use the value to increment the canonical IV, and update all
+  // users in the loop region to use the adjusted value.
   // FIXME: Improve modeling for canonical IV start values in the epilogue
   // loop.
   using namespace llvm::PatternMatch;
@@ -9389,10 +9388,16 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
                 }) &&
          "the canonical IV should only be used by its increment or "
          "ScalarIVSteps when resetting the start value");
-  IV->setOperand(0, VPV);
+  VPBuilder Builder(Header, Header->getFirstNonPhi());
+  VPInstruction *Add = Builder.createNaryOp(Instruction::Add, {IV, VPV});
+  IV->replaceAllUsesWith(Add);
+  Add->setOperand(0, IV);
 
   DenseMap<Value *, Value *> ToFrozen;
   SmallVector<Instruction *> InstsToMove;
+  // Ensure that the start values for all header phi recipes are updated before
+  // vectorizing the epilogue loop. Skip the canonical IV, which has been
+  // handled above.
   for (VPRecipeBase &R : drop_begin(Header->phis())) {
     Value *ResumeV = nullptr;
     // TODO: Move setting of resume values to prepareToExecute.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c8a2d84..7563cd7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1234,6 +1234,18 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
   if (!Plan->isUnrolled())
     return;
 
+  // Hoist an invariant increment Y of a phi X, by having X start at Y.
+  if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
+      isa<VPPhi>(X)) {
+    auto *Phi = cast<VPPhi>(X);
+    if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
+        Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) {
+      Phi->setOperand(0, Y);
+      Def->replaceAllUsesWith(Phi);
+      return;
+    }
+  }
+
   // VPVectorPointer for part 0 can be replaced by their start pointer.
   if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
     if (VecPtr->isFirstPart()) {
diff --git a/llvm/test/tools/llvm-remarkutil/filter.test b/llvm/test/tools/llvm-remarkutil/filter.test
index 8304b9f..9fd2e94 100644
--- a/llvm/test/tools/llvm-remarkutil/filter.test
+++ b/llvm/test/tools/llvm-remarkutil/filter.test
@@ -18,9 +18,19 @@ RUN: llvm-remarkutil filter --remark-type=analysis %p/Inputs/filter.yaml | FileC
 
 RUN: llvm-remarkutil yaml2bitstream -o %t.opt.bitstream %p/Inputs/filter.yaml
 RUN: llvm-remarkutil filter --function=func1 %t.opt.bitstream | FileCheck %s --strict-whitespace --check-prefix=REMARK1
+RUN: llvm-remarkutil filter --function=func1 %t.opt.bitstream -o %t.r1.yamL
+RUN: cat %t.r1.yamL | FileCheck %s --strict-whitespace --check-prefix=REMARK1
+RUN: llvm-remarkutil filter --function=func1 %t.opt.bitstream -o %t.r1.yMl
+RUN: cat %t.r1.yMl | FileCheck %s --strict-whitespace --check-prefix=REMARK1
+RUN: llvm-remarkutil filter --function=func1 %t.opt.bitstream --serializer=yaml -o %t.r1.fake.opt.bitstream
+RUN: cat %t.r1.fake.opt.bitstream | FileCheck %s --strict-whitespace --check-prefix=REMARK1
 
 RUN: llvm-remarkutil filter --function=func1 %t.opt.bitstream -o %t.r1.opt.bitstream
 RUN: llvm-remarkutil bitstream2yaml %t.r1.opt.bitstream | FileCheck %s --strict-whitespace --check-prefix=REMARK1
+RUN: llvm-remarkutil filter --function=func1 %t.opt.bitstream -o %t.r1
+RUN: llvm-remarkutil bitstream2yaml %t.r1 | FileCheck %s --strict-whitespace --check-prefix=REMARK1
+RUN: llvm-remarkutil filter --function=func1 %p/Inputs/filter.yaml --serializer=bitstream -o %t.r1.fake.yaml
+RUN: llvm-remarkutil bitstream2yaml %t.r1.fake.yaml | FileCheck %s --strict-whitespace --check-prefix=REMARK1
 
 RUN: llvm-remarkutil filter --function=func %p/Inputs/filter.yaml | FileCheck %s --allow-empty --strict-whitespace --check-prefix=EMPTY
 
diff --git a/llvm/tools/llvm-remarkutil/RemarkFilter.cpp b/llvm/tools/llvm-remarkutil/RemarkFilter.cpp
index 507ae36..9b521b4 100644
--- a/llvm/tools/llvm-remarkutil/RemarkFilter.cpp
+++ b/llvm/tools/llvm-remarkutil/RemarkFilter.cpp
@@ -48,12 +48,8 @@ static Error tryFilter() {
     return MaybeParser.takeError();
   auto &Parser = **MaybeParser;
 
-  Format SerializerFormat = OutputFormat;
-  if (SerializerFormat == Format::Auto) {
-    SerializerFormat = Parser.ParserFormat;
-    if (OutputFileName.empty() || OutputFileName == "-")
-      SerializerFormat = Format::YAML;
-  }
+  Format SerializerFormat =
+      getSerializerFormat(OutputFileName, OutputFormat, Parser.ParserFormat);
 
   auto MaybeOF = getOutputFileForRemarks(OutputFileName, SerializerFormat);
   if (!MaybeOF)
diff --git a/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.cpp b/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.cpp
index be52948..b6204d0 100644
--- a/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.cpp
+++ b/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.cpp
@@ -54,6 +54,20 @@ getOutputFileForRemarks(StringRef OutputFileName, Format OutputFormat) {
                                                     : sys::fs::OF_None);
 }
 
+Format getSerializerFormat(StringRef OutputFileName, Format SelectedFormat,
+                           Format DefaultFormat) {
+  if (SelectedFormat != Format::Auto)
+    return SelectedFormat;
+  SelectedFormat = DefaultFormat;
+  if (OutputFileName.empty() || OutputFileName == "-" ||
+      OutputFileName.ends_with_insensitive(".yaml") ||
+      OutputFileName.ends_with_insensitive(".yml"))
+    SelectedFormat = Format::YAML;
+  if (OutputFileName.ends_with_insensitive(".bitstream"))
+    SelectedFormat = Format::Bitstream;
+  return SelectedFormat;
+}
+
 Expected<FilterMatcher>
 FilterMatcher::createRE(const llvm::cl::opt<std::string> &Arg) {
   return createRE(Arg.ArgStr, Arg);
diff --git a/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h b/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h
index 0dd550765..73867fe 100644
--- a/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h
+++ b/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h
@@ -47,7 +47,8 @@
       "serializer", cl::init(Format::Auto),                                    \
       cl::desc("Output remark format to serialize"),                           \
       cl::values(clEnumValN(Format::Auto, "auto",                              \
-                            "Follow the parser format (default)"),             \
+                            "Automatic detection based on output file "        \
+                            "extension or parser format (default)"),           \
                  clEnumValN(Format::YAML, "yaml", "YAML"),                     \
                  clEnumValN(Format::Bitstream, "bitstream", "Bitstream")),     \
       cl::sub(SUBOPT));
@@ -151,6 +152,12 @@ getOutputFileWithFlags(StringRef OutputFileName, sys::fs::OpenFlags Flags);
 Expected<std::unique_ptr<ToolOutputFile>>
 getOutputFileForRemarks(StringRef OutputFileName, Format OutputFormat);
 
+/// Choose the serializer format. If \p SelectedFormat is Format::Auto, try to
+/// detect the format based on the extension of \p OutputFileName or fall back
+/// to \p DefaultFormat.
+Format getSerializerFormat(StringRef OutputFileName, Format SelectedFormat,
+                           Format DefaultFormat);
+
 /// Filter object which can be either a string or a regex to match with the
 /// remark properties.
 class FilterMatcher {
diff --git a/mlir/test/Examples/standalone/test.wheel.toy b/mlir/test/Examples/standalone/test.wheel.toy
index 5ff9271..c8d188a 100644
--- a/mlir/test/Examples/standalone/test.wheel.toy
+++ b/mlir/test/Examples/standalone/test.wheel.toy
@@ -2,6 +2,7 @@
 # than 255 chars when combined with the fact that pip wants to install into a tmp directory buried under
 # C/Users/ContainerAdministrator/AppData/Local/Temp.
 # UNSUPPORTED: target={{.*(windows).*}}
+# REQUIRES: expensive_checks
 # REQUIRES: non-shared-libs-build
 # REQUIRES: bindings-python
 
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index f99c24d..6ff12d6 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -348,6 +348,9 @@ if config.enable_assertions:
 else:
     config.available_features.add("noasserts")
 
+if config.expensive_checks:
+    config.available_features.add("expensive_checks")
+
 def have_host_jit_feature_support(feature_name):
     mlir_runner_exe = lit.util.which("mlir-runner", config.mlir_tools_dir)
 
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index 1aaf798..91a71af 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -11,6 +11,7 @@ config.llvm_shlib_ext = "@SHLIBEXT@"
 config.llvm_shlib_dir = lit_config.substitute(path(r"@SHLIBDIR@"))
 config.python_executable = "@Python3_EXECUTABLE@"
 config.enable_assertions = @ENABLE_ASSERTIONS@
+config.expensive_checks = "@EXPENSIVE_CHECKS@"
 config.native_target = "@LLVM_NATIVE_ARCH@"
 config.host_os = "@HOST_OS@"
 config.host_cc = "@HOST_CC@"
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 640fa03..936bc12 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2917,6 +2917,22 @@ libc_support_library(
 )
 
 libc_support_library(
+    name = "__support_math_exp2f16",
+    hdrs = ["src/__support/math/exp2f16.h"],
+    deps = [
+        ":__support_fputil_except_value_utils",
+        ":__support_fputil_fma",
+        ":__support_fputil_multiply_add",
+        ":__support_fputil_nearest_integer",
+        ":__support_fputil_polyeval",
+        ":__support_fputil_rounding_mode",
+        ":__support_macros_optimization",
+        ":__support_math_common_constants",
+        ":__support_math_expxf16_utils",
+    ],
+)
+
+libc_support_library(
     name = "__support_math_exp10",
     hdrs = ["src/__support/math/exp10.h"],
     deps = [
@@ -3696,7 +3712,7 @@ libc_math_function(
 libc_math_function(
     name = "exp2f16",
     additional_deps = [
-        ":__support_math_expxf16_utils",
+        ":__support_math_exp2f16",
     ],
 )