author:    H.J. Lu <hjl.tools@gmail.com>   2021-09-27 10:43:33 -0700
committer: H.J. Lu <hjl.tools@gmail.com>   2021-10-01 09:02:54 -0700
commit:    76288e1c5da5a34e3c13d37ac4cab41e0f46ff61 (patch)
tree:      91841423d03755f702c6a60401338e06c08c8017 /libsanitizer/tsan
parent:    7c99923f8c544ec07109e8333acb2c2388c38a1b (diff)
libsanitizer: Merge with upstream
Merged revision: 1c2e5fd66ea27d0c51360ba4e22099124a915562
Diffstat (limited to 'libsanitizer/tsan')
53 files changed, 3515 insertions, 3130 deletions
diff --git a/libsanitizer/tsan/Makefile.am b/libsanitizer/tsan/Makefile.am index dcb247e..9dc11f7 100644 --- a/libsanitizer/tsan/Makefile.am +++ b/libsanitizer/tsan/Makefile.am @@ -29,7 +29,6 @@ tsan_files = \ tsan_malloc_mac.cpp \ tsan_md5.cpp \ tsan_mman.cpp \ - tsan_mutex.cpp \ tsan_mutexset.cpp \ tsan_new_delete.cpp \ tsan_platform_linux.cpp \ @@ -45,7 +44,8 @@ tsan_files = \ tsan_stack_trace.cpp \ tsan_suppressions.cpp \ tsan_symbolize.cpp \ - tsan_sync.cpp + tsan_sync.cpp \ + tsan_vector_clock.cpp libtsan_la_SOURCES = $(tsan_files) EXTRA_libtsan_la_SOURCES = tsan_rtl_amd64.S tsan_rtl_aarch64.S tsan_rtl_mips64.S tsan_rtl_ppc64.S tsan_rtl_s390x.S diff --git a/libsanitizer/tsan/Makefile.in b/libsanitizer/tsan/Makefile.in index 83617cf..921a78c 100644 --- a/libsanitizer/tsan/Makefile.in +++ b/libsanitizer/tsan/Makefile.in @@ -150,12 +150,13 @@ am__objects_1 = tsan_clock.lo tsan_debugging.lo tsan_external.lo \ tsan_interceptors_posix.lo tsan_interceptors_mac.lo \ tsan_interface_ann.lo tsan_interface_atomic.lo \ tsan_interface.lo tsan_interface_java.lo tsan_malloc_mac.lo \ - tsan_md5.lo tsan_mman.lo tsan_mutex.lo tsan_mutexset.lo \ - tsan_new_delete.lo tsan_platform_linux.lo tsan_platform_mac.lo \ + tsan_md5.lo tsan_mman.lo tsan_mutexset.lo tsan_new_delete.lo \ + tsan_platform_linux.lo tsan_platform_mac.lo \ tsan_platform_posix.lo tsan_platform_windows.lo tsan_report.lo \ tsan_rtl.lo tsan_rtl_mutex.lo tsan_rtl_proc.lo \ tsan_rtl_report.lo tsan_rtl_thread.lo tsan_stack_trace.lo \ - tsan_suppressions.lo tsan_symbolize.lo tsan_sync.lo + tsan_suppressions.lo tsan_symbolize.lo tsan_sync.lo \ + tsan_vector_clock.lo am_libtsan_la_OBJECTS = $(am__objects_1) libtsan_la_OBJECTS = $(am_libtsan_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) @@ -431,7 +432,6 @@ tsan_files = \ tsan_malloc_mac.cpp \ tsan_md5.cpp \ tsan_mman.cpp \ - tsan_mutex.cpp \ tsan_mutexset.cpp \ tsan_new_delete.cpp \ tsan_platform_linux.cpp \ @@ -447,7 +447,8 @@ tsan_files = \ tsan_stack_trace.cpp \ tsan_suppressions.cpp \ tsan_symbolize.cpp \ - tsan_sync.cpp + tsan_sync.cpp \ + tsan_vector_clock.cpp libtsan_la_SOURCES = $(tsan_files) EXTRA_libtsan_la_SOURCES = tsan_rtl_amd64.S tsan_rtl_aarch64.S tsan_rtl_mips64.S tsan_rtl_ppc64.S tsan_rtl_s390x.S @@ -594,7 +595,6 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_malloc_mac.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_md5.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_mman.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_mutex.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_mutexset.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_new_delete.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_platform_linux.Plo@am__quote@ @@ -616,6 +616,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_suppressions.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_symbolize.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_sync.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_vector_clock.Plo@am__quote@ .S.o: @am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< diff --git a/libsanitizer/tsan/tsan_clock.cpp b/libsanitizer/tsan/tsan_clock.cpp index 61848c2..d122b67 100644 --- a/libsanitizer/tsan/tsan_clock.cpp +++ b/libsanitizer/tsan/tsan_clock.cpp @@ -72,9 +72,9 @@ // clk_ - variable size vector clock, low kClkBits hold 
timestamp, // the remaining bits hold "acquired" flag (the actual value is thread's // reused counter); -// if acquried == thr->reused_, then the respective thread has already +// if acquired == thr->reused_, then the respective thread has already // acquired this clock (except possibly for dirty elements). -// dirty_ - holds up to two indeces in the vector clock that other threads +// dirty_ - holds up to two indices in the vector clock that other threads // need to acquire regardless of "acquired" flag value; // release_store_tid_ - denotes that the clock state is a result of // release-store operation by the thread with release_store_tid_ index. @@ -272,7 +272,7 @@ void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) { // we could update the existing clock and cache it, or replace it with the // currently cached clock and release the old one. And for a shared // existing clock, we could replace it with the currently cached; - // or unshare, update and cache. But, for simplicity, we currnetly reuse + // or unshare, update and cache. But, for simplicity, we currently reuse // cached clock only when the target clock is empty. dst->tab_ = ctx->clock_alloc.Map(cached_idx_); dst->tab_idx_ = cached_idx_; @@ -285,7 +285,7 @@ void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) { dst->dirty_[0].epoch = clk_[tid_]; dst->release_store_tid_ = tid_; dst->release_store_reused_ = reused_; - // Rememeber that we don't need to acquire it in future. + // Remember that we don't need to acquire it in future. dst->elem(tid_).reused = reused_; // Grab a reference. atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed); @@ -316,7 +316,7 @@ void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) { for (uptr i = 0; i < kDirtyTids; i++) dst->dirty_[i].set_tid(kInvalidTid); dst->release_store_tid_ = tid_; dst->release_store_reused_ = reused_; - // Rememeber that we don't need to acquire it in future. + // Remember that we don't need to acquire it in future. dst->elem(tid_).reused = reused_; // If the resulting clock is cachable, cache it for future release operations. diff --git a/libsanitizer/tsan/tsan_clock.h b/libsanitizer/tsan/tsan_clock.h index 31376a1..11cbc0c 100644 --- a/libsanitizer/tsan/tsan_clock.h +++ b/libsanitizer/tsan/tsan_clock.h @@ -213,7 +213,7 @@ class ThreadClock { // We reuse it for subsequent store-release operations without intervening // acquire operations. Since it is shared (and thus constant), clock value // for the current thread is then stored in dirty entries in the SyncClock. - // We host a refernece to the table while it is cached here. + // We host a reference to the table while it is cached here. u32 cached_idx_; u16 cached_size_; u16 cached_blocks_; diff --git a/libsanitizer/tsan/tsan_debugging.cpp b/libsanitizer/tsan/tsan_debugging.cpp index d3d6255..1d3c384 100644 --- a/libsanitizer/tsan/tsan_debugging.cpp +++ b/libsanitizer/tsan/tsan_debugging.cpp @@ -195,9 +195,9 @@ const char *__tsan_locate_address(uptr addr, char *name, uptr name_size, const char *region_kind = nullptr; if (name && name_size > 0) name[0] = 0; - if (IsMetaMem(addr)) { + if (IsMetaMem(reinterpret_cast<u32 *>(addr))) { region_kind = "meta shadow"; - } else if (IsShadowMem(addr)) { + } else if (IsShadowMem(reinterpret_cast<RawShadow *>(addr))) { region_kind = "shadow"; } else { bool is_stack = false; @@ -215,9 +215,9 @@ const char *__tsan_locate_address(uptr addr, char *name, uptr name_size, } else { // TODO(kuba.brecka): We should not lock. 
This is supposed to be called // from within the debugger when other threads are stopped. - ctx->thread_registry->Lock(); + ctx->thread_registry.Lock(); ThreadContext *tctx = IsThreadStackOrTls(addr, &is_stack); - ctx->thread_registry->Unlock(); + ctx->thread_registry.Unlock(); if (tctx) { region_kind = is_stack ? "stack" : "tls"; } else { @@ -252,7 +252,7 @@ int __tsan_get_alloc_stack(uptr addr, uptr *trace, uptr size, int *thread_id, *thread_id = b->tid; // No locking. This is supposed to be called from within the debugger when // other threads are stopped. - ThreadContextBase *tctx = ctx->thread_registry->GetThreadLocked(b->tid); + ThreadContextBase *tctx = ctx->thread_registry.GetThreadLocked(b->tid); *os_id = tctx->os_id; StackTrace stack = StackDepotGet(b->stk); diff --git a/libsanitizer/tsan/tsan_defs.h b/libsanitizer/tsan/tsan_defs.h index 5c8f280..fe0c1da 100644 --- a/libsanitizer/tsan/tsan_defs.h +++ b/libsanitizer/tsan/tsan_defs.h @@ -15,8 +15,27 @@ #include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_libc.h" +#include "sanitizer_common/sanitizer_mutex.h" #include "ubsan/ubsan_platform.h" +#ifndef TSAN_VECTORIZE +# define TSAN_VECTORIZE __SSE4_2__ +#endif + +#if TSAN_VECTORIZE +// <emmintrin.h> transitively includes <stdlib.h>, +// and it's prohibited to include std headers into tsan runtime. +// So we do this dirty trick. +# define _MM_MALLOC_H_INCLUDED +# define __MM_MALLOC_H +# include <emmintrin.h> +# include <smmintrin.h> +# define VECTOR_ALIGNED ALIGNED(16) +typedef __m128i m128; +#else +# define VECTOR_ALIGNED +#endif + // Setup defaults for compile definitions. #ifndef TSAN_NO_HISTORY # define TSAN_NO_HISTORY 0 @@ -32,6 +51,19 @@ namespace __tsan { +constexpr uptr kByteBits = 8; + +// Thread slot ID. +enum class Sid : u8 {}; +constexpr uptr kThreadSlotCount = 256; +constexpr Sid kFreeSid = static_cast<Sid>(255); + +// Abstract time unit, vector clock element. +enum class Epoch : u16 {}; +constexpr uptr kEpochBits = 14; +constexpr Epoch kEpochZero = static_cast<Epoch>(0); +constexpr Epoch kEpochOver = static_cast<Epoch>(1 << kEpochBits); + const int kClkBits = 42; const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1; @@ -74,8 +106,9 @@ const uptr kShadowCnt = 4; // That many user bytes are mapped onto a single shadow cell. const uptr kShadowCell = 8; -// Size of a single shadow value (u64). -const uptr kShadowSize = 8; +// Single shadow value. +typedef u64 RawShadow; +const uptr kShadowSize = sizeof(RawShadow); // Shadow memory is kShadowMultiplier times larger than user memory. const uptr kShadowMultiplier = kShadowSize * kShadowCnt / kShadowCell; @@ -87,6 +120,9 @@ const uptr kMetaShadowCell = 8; // Size of a single meta shadow value (u32). const uptr kMetaShadowSize = 4; +// All addresses and PCs are assumed to be compressable to that many bits. +const uptr kCompressedAddrBits = 44; + #if TSAN_NO_HISTORY const bool kCollectHistory = false; #else @@ -153,12 +189,23 @@ struct ReportStack; class ReportDesc; class RegionAlloc; +typedef uptr AccessType; + +enum : AccessType { + kAccessWrite = 0, + kAccessRead = 1 << 0, + kAccessAtomic = 1 << 1, + kAccessVptr = 1 << 2, // read or write of an object virtual table pointer + kAccessFree = 1 << 3, // synthetic memory access during memory freeing + kAccessExternalPC = 1 << 4, // access PC can have kExternalPCBit set +}; + // Descriptor of user's memory block. 
struct MBlock { u64 siz : 48; u64 tag : 16; - u32 stk; - u16 tid; + StackID stk; + Tid tid; }; COMPILER_CHECK(sizeof(MBlock) == 16); @@ -172,6 +219,17 @@ enum ExternalTag : uptr { // as 16-bit values, see tsan_defs.h. }; +enum MutexType { + MutexTypeTrace = MutexLastCommon, + MutexTypeReport, + MutexTypeSyncVar, + MutexTypeAnnotations, + MutexTypeAtExit, + MutexTypeFired, + MutexTypeRacy, + MutexTypeGlobalProc, +}; + } // namespace __tsan #endif // TSAN_DEFS_H diff --git a/libsanitizer/tsan/tsan_dense_alloc.h b/libsanitizer/tsan/tsan_dense_alloc.h index 6c89e40..9e15f74 100644 --- a/libsanitizer/tsan/tsan_dense_alloc.h +++ b/libsanitizer/tsan/tsan_dense_alloc.h @@ -20,7 +20,6 @@ #include "sanitizer_common/sanitizer_common.h" #include "tsan_defs.h" -#include "tsan_mutex.h" namespace __tsan { @@ -50,11 +49,7 @@ class DenseSlabAlloc { static_assert(sizeof(T) > sizeof(IndexT), "it doesn't make sense to use dense alloc"); - explicit DenseSlabAlloc(LinkerInitialized, const char *name) { - freelist_ = 0; - fillpos_ = 0; - name_ = name; - } + DenseSlabAlloc(LinkerInitialized, const char *name) : name_(name) {} explicit DenseSlabAlloc(const char *name) : DenseSlabAlloc(LINKER_INITIALIZED, name) { @@ -90,6 +85,8 @@ class DenseSlabAlloc { } void FlushCache(Cache *c) { + if (!c->pos) + return; SpinMutexLock lock(&mtx_); while (c->pos) { IndexT idx = c->cache[--c->pos]; @@ -103,33 +100,39 @@ class DenseSlabAlloc { internal_memset(c->cache, 0, sizeof(c->cache)); } + uptr AllocatedMemory() const { + return atomic_load_relaxed(&fillpos_) * kL2Size * sizeof(T); + } + private: T *map_[kL1Size]; SpinMutex mtx_; - IndexT freelist_; - uptr fillpos_; - const char *name_; + IndexT freelist_ = {0}; + atomic_uintptr_t fillpos_ = {0}; + const char *const name_; void Refill(Cache *c) { SpinMutexLock lock(&mtx_); if (freelist_ == 0) { - if (fillpos_ == kL1Size) { + uptr fillpos = atomic_load_relaxed(&fillpos_); + if (fillpos == kL1Size) { Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n", name_, kL1Size, kL2Size); Die(); } - VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", - name_, fillpos_, kL1Size, kL2Size); + VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_, + fillpos, kL1Size, kL2Size); T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), name_); // Reserve 0 as invalid index. - IndexT start = fillpos_ == 0 ? 1 : 0; + IndexT start = fillpos == 0 ? 
1 : 0; for (IndexT i = start; i < kL2Size; i++) { new(batch + i) T; - *(IndexT*)(batch + i) = i + 1 + fillpos_ * kL2Size; + *(IndexT *)(batch + i) = i + 1 + fillpos * kL2Size; } *(IndexT*)(batch + kL2Size - 1) = 0; - freelist_ = fillpos_ * kL2Size + start; - map_[fillpos_++] = batch; + freelist_ = fillpos * kL2Size + start; + map_[fillpos] = batch; + atomic_store_relaxed(&fillpos_, fillpos + 1); } for (uptr i = 0; i < Cache::kSize / 2 && freelist_ != 0; i++) { IndexT idx = freelist_; diff --git a/libsanitizer/tsan/tsan_external.cpp b/libsanitizer/tsan/tsan_external.cpp index a87e12f..19ae174 100644 --- a/libsanitizer/tsan/tsan_external.cpp +++ b/libsanitizer/tsan/tsan_external.cpp @@ -10,9 +10,12 @@ // //===----------------------------------------------------------------------===// #include "tsan_rtl.h" -#include "tsan_interceptors.h" #include "sanitizer_common/sanitizer_ptrauth.h" +#if !SANITIZER_GO +# include "tsan_interceptors.h" +#endif + namespace __tsan { #define CALLERPC ((uptr)__builtin_return_address(0)) @@ -57,16 +60,14 @@ uptr TagFromShadowStackFrame(uptr pc) { #if !SANITIZER_GO -typedef void(*AccessFunc)(ThreadState *, uptr, uptr, int); -void ExternalAccess(void *addr, uptr caller_pc, void *tag, AccessFunc access) { +void ExternalAccess(void *addr, uptr caller_pc, void *tag, AccessType typ) { CHECK_LT(tag, atomic_load(&used_tags, memory_order_relaxed)); ThreadState *thr = cur_thread(); if (caller_pc) FuncEntry(thr, caller_pc); InsertShadowStackFrameForTag(thr, (uptr)tag); bool in_ignored_lib; - if (!caller_pc || !libignore()->IsIgnored(caller_pc, &in_ignored_lib)) { - access(thr, CALLERPC, (uptr)addr, kSizeLog1); - } + if (!caller_pc || !libignore()->IsIgnored(caller_pc, &in_ignored_lib)) + MemoryAccess(thr, CALLERPC, (uptr)addr, 1, typ); FuncExit(thr); if (caller_pc) FuncExit(thr); } @@ -92,7 +93,7 @@ void __tsan_external_register_header(void *tag, const char *header) { header = internal_strdup(header); char *old_header = (char *)atomic_exchange(header_ptr, (uptr)header, memory_order_seq_cst); - if (old_header) internal_free(old_header); + Free(old_header); } SANITIZER_INTERFACE_ATTRIBUTE @@ -111,12 +112,12 @@ void __tsan_external_assign_tag(void *addr, void *tag) { SANITIZER_INTERFACE_ATTRIBUTE void __tsan_external_read(void *addr, void *caller_pc, void *tag) { - ExternalAccess(addr, STRIP_PAC_PC(caller_pc), tag, MemoryRead); + ExternalAccess(addr, STRIP_PAC_PC(caller_pc), tag, kAccessRead); } SANITIZER_INTERFACE_ATTRIBUTE void __tsan_external_write(void *addr, void *caller_pc, void *tag) { - ExternalAccess(addr, STRIP_PAC_PC(caller_pc), tag, MemoryWrite); + ExternalAccess(addr, STRIP_PAC_PC(caller_pc), tag, kAccessWrite); } } // extern "C" diff --git a/libsanitizer/tsan/tsan_fd.cpp b/libsanitizer/tsan/tsan_fd.cpp index 50a6b56..255ffa8 100644 --- a/libsanitizer/tsan/tsan_fd.cpp +++ b/libsanitizer/tsan/tsan_fd.cpp @@ -26,8 +26,8 @@ struct FdSync { struct FdDesc { FdSync *sync; - int creation_tid; - u32 creation_stack; + Tid creation_tid; + StackID creation_stack; }; struct FdContext { @@ -115,7 +115,7 @@ static void init(ThreadState *thr, uptr pc, int fd, FdSync *s, MemoryRangeImitateWrite(thr, pc, (uptr)d, 8); } else { // See the dup-related comment in FdClose. 
- MemoryRead(thr, pc, (uptr)d, kSizeLog8); + MemoryAccess(thr, pc, (uptr)d, 8, kAccessRead); } } @@ -140,7 +140,7 @@ void FdOnFork(ThreadState *thr, uptr pc) { } } -bool FdLocation(uptr addr, int *fd, int *tid, u32 *stack) { +bool FdLocation(uptr addr, int *fd, Tid *tid, StackID *stack) { for (int l1 = 0; l1 < kTableSizeL1; l1++) { FdDesc *tab = (FdDesc*)atomic_load(&fdctx.tab[l1], memory_order_relaxed); if (tab == 0) @@ -163,7 +163,7 @@ void FdAcquire(ThreadState *thr, uptr pc, int fd) { FdDesc *d = fddesc(thr, pc, fd); FdSync *s = d->sync; DPrintf("#%d: FdAcquire(%d) -> %p\n", thr->tid, fd, s); - MemoryRead(thr, pc, (uptr)d, kSizeLog8); + MemoryAccess(thr, pc, (uptr)d, 8, kAccessRead); if (s) Acquire(thr, pc, (uptr)s); } @@ -174,7 +174,7 @@ void FdRelease(ThreadState *thr, uptr pc, int fd) { FdDesc *d = fddesc(thr, pc, fd); FdSync *s = d->sync; DPrintf("#%d: FdRelease(%d) -> %p\n", thr->tid, fd, s); - MemoryRead(thr, pc, (uptr)d, kSizeLog8); + MemoryAccess(thr, pc, (uptr)d, 8, kAccessRead); if (s) Release(thr, pc, (uptr)s); } @@ -184,7 +184,7 @@ void FdAccess(ThreadState *thr, uptr pc, int fd) { if (bogusfd(fd)) return; FdDesc *d = fddesc(thr, pc, fd); - MemoryRead(thr, pc, (uptr)d, kSizeLog8); + MemoryAccess(thr, pc, (uptr)d, 8, kAccessRead); } void FdClose(ThreadState *thr, uptr pc, int fd, bool write) { @@ -194,7 +194,7 @@ void FdClose(ThreadState *thr, uptr pc, int fd, bool write) { FdDesc *d = fddesc(thr, pc, fd); if (write) { // To catch races between fd usage and close. - MemoryWrite(thr, pc, (uptr)d, kSizeLog8); + MemoryAccess(thr, pc, (uptr)d, 8, kAccessWrite); } else { // This path is used only by dup2/dup3 calls. // We do read instead of write because there is a number of legitimate @@ -204,15 +204,15 @@ void FdClose(ThreadState *thr, uptr pc, int fd, bool write) { // 2. Some daemons dup /dev/null in place of stdin/stdout. // On the other hand we have not seen cases when write here catches real // bugs. - MemoryRead(thr, pc, (uptr)d, kSizeLog8); + MemoryAccess(thr, pc, (uptr)d, 8, kAccessRead); } // We need to clear it, because if we do not intercept any call out there // that creates fd, we will hit false postives. MemoryResetRange(thr, pc, (uptr)d, 8); unref(thr, pc, d->sync); d->sync = 0; - d->creation_tid = 0; - d->creation_stack = 0; + d->creation_tid = kInvalidTid; + d->creation_stack = kInvalidStackID; } void FdFileCreate(ThreadState *thr, uptr pc, int fd) { @@ -228,7 +228,7 @@ void FdDup(ThreadState *thr, uptr pc, int oldfd, int newfd, bool write) { return; // Ignore the case when user dups not yet connected socket. 
FdDesc *od = fddesc(thr, pc, oldfd); - MemoryRead(thr, pc, (uptr)od, kSizeLog8); + MemoryAccess(thr, pc, (uptr)od, 8, kAccessRead); FdClose(thr, pc, newfd, write); init(thr, pc, newfd, ref(od->sync), write); } diff --git a/libsanitizer/tsan/tsan_fd.h b/libsanitizer/tsan/tsan_fd.h index ce4f2f7..d964817 100644 --- a/libsanitizer/tsan/tsan_fd.h +++ b/libsanitizer/tsan/tsan_fd.h @@ -53,7 +53,7 @@ void FdSocketCreate(ThreadState *thr, uptr pc, int fd); void FdSocketAccept(ThreadState *thr, uptr pc, int fd, int newfd); void FdSocketConnecting(ThreadState *thr, uptr pc, int fd); void FdSocketConnect(ThreadState *thr, uptr pc, int fd); -bool FdLocation(uptr addr, int *fd, int *tid, u32 *stack); +bool FdLocation(uptr addr, int *fd, Tid *tid, StackID *stack); void FdOnFork(ThreadState *thr, uptr pc); uptr File2addr(const char *path); diff --git a/libsanitizer/tsan/tsan_flags.cpp b/libsanitizer/tsan/tsan_flags.cpp index 49e4a9c..ee89862 100644 --- a/libsanitizer/tsan/tsan_flags.cpp +++ b/libsanitizer/tsan/tsan_flags.cpp @@ -55,6 +55,7 @@ void InitializeFlags(Flags *f, const char *env, const char *env_option_name) { // Override some common flags defaults. CommonFlags cf; cf.CopyFrom(*common_flags()); + cf.external_symbolizer_path = GetEnv("TSAN_SYMBOLIZER_PATH"); cf.allow_addr2line = true; if (SANITIZER_GO) { // Does not work as expected for Go: runtime handles SIGABRT and crashes. diff --git a/libsanitizer/tsan/tsan_flags.inc b/libsanitizer/tsan/tsan_flags.inc index 2105c75..7954a430 100644 --- a/libsanitizer/tsan/tsan_flags.inc +++ b/libsanitizer/tsan/tsan_flags.inc @@ -43,7 +43,6 @@ TSAN_FLAG( bool, force_seq_cst_atomics, false, "If set, all atomics are effectively sequentially consistent (seq_cst), " "regardless of what user actually specified.") -TSAN_FLAG(bool, print_benign, false, "Print matched \"benign\" races at exit.") TSAN_FLAG(bool, halt_on_error, false, "Exit after first reported error.") TSAN_FLAG(int, atexit_sleep_ms, 1000, "Sleep in main thread before exiting for that many ms " diff --git a/libsanitizer/tsan/tsan_ignoreset.cpp b/libsanitizer/tsan/tsan_ignoreset.cpp index f6e41f6..1fca1cf 100644 --- a/libsanitizer/tsan/tsan_ignoreset.cpp +++ b/libsanitizer/tsan/tsan_ignoreset.cpp @@ -19,7 +19,7 @@ IgnoreSet::IgnoreSet() : size_() { } -void IgnoreSet::Add(u32 stack_id) { +void IgnoreSet::Add(StackID stack_id) { if (size_ == kMaxSize) return; for (uptr i = 0; i < size_; i++) { @@ -29,15 +29,7 @@ void IgnoreSet::Add(u32 stack_id) { stacks_[size_++] = stack_id; } -void IgnoreSet::Reset() { - size_ = 0; -} - -uptr IgnoreSet::Size() const { - return size_; -} - -u32 IgnoreSet::At(uptr i) const { +StackID IgnoreSet::At(uptr i) const { CHECK_LT(i, size_); CHECK_LE(size_, kMaxSize); return stacks_[i]; diff --git a/libsanitizer/tsan/tsan_ignoreset.h b/libsanitizer/tsan/tsan_ignoreset.h index 3e318bd..4e25112 100644 --- a/libsanitizer/tsan/tsan_ignoreset.h +++ b/libsanitizer/tsan/tsan_ignoreset.h @@ -19,17 +19,16 @@ namespace __tsan { class IgnoreSet { public: - static const uptr kMaxSize = 16; - IgnoreSet(); - void Add(u32 stack_id); - void Reset(); - uptr Size() const; - u32 At(uptr i) const; + void Add(StackID stack_id); + void Reset() { size_ = 0; } + uptr Size() const { return size_; } + StackID At(uptr i) const; private: + static constexpr uptr kMaxSize = 16; uptr size_; - u32 stacks_[kMaxSize]; + StackID stacks_[kMaxSize]; }; } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_ilist.h b/libsanitizer/tsan/tsan_ilist.h new file mode 100644 index 0000000..d7d8be2 --- /dev/null +++ 
b/libsanitizer/tsan/tsan_ilist.h @@ -0,0 +1,189 @@ +//===-- tsan_ilist.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of ThreadSanitizer (TSan), a race detector. +// +//===----------------------------------------------------------------------===// +#ifndef TSAN_ILIST_H +#define TSAN_ILIST_H + +#include "sanitizer_common/sanitizer_internal_defs.h" + +namespace __tsan { + +class INode { + public: + INode() = default; + + private: + INode* next_ = nullptr; + INode* prev_ = nullptr; + + template <typename Base, INode Base::*Node, typename Elem> + friend class IList; + INode(const INode&) = delete; + void operator=(const INode&) = delete; +}; + +// Intrusive doubly-linked list. +// +// The node class (MyNode) needs to include "INode foo" field, +// then the list can be declared as IList<MyNode, &MyNode::foo>. +// This design allows to link MyNode into multiple lists using +// different INode fields. +// The optional Elem template argument allows to specify node MDT +// (most derived type) if it's different from MyNode. +template <typename Base, INode Base::*Node, typename Elem = Base> +class IList { + public: + IList(); + + void PushFront(Elem* e); + void PushBack(Elem* e); + void Remove(Elem* e); + + Elem* PopFront(); + Elem* PopBack(); + Elem* Front(); + Elem* Back(); + + // Prev links point towards front of the queue. + Elem* Prev(Elem* e); + // Next links point towards back of the queue. + Elem* Next(Elem* e); + + uptr Size() const; + bool Empty() const; + bool Queued(Elem* e) const; + + private: + INode node_; + uptr size_ = 0; + + void Push(Elem* e, INode* after); + static INode* ToNode(Elem* e); + static Elem* ToElem(INode* n); + + IList(const IList&) = delete; + void operator=(const IList&) = delete; +}; + +template <typename Base, INode Base::*Node, typename Elem> +IList<Base, Node, Elem>::IList() { + node_.next_ = node_.prev_ = &node_; +} + +template <typename Base, INode Base::*Node, typename Elem> +void IList<Base, Node, Elem>::PushFront(Elem* e) { + Push(e, &node_); +} + +template <typename Base, INode Base::*Node, typename Elem> +void IList<Base, Node, Elem>::PushBack(Elem* e) { + Push(e, node_.prev_); +} + +template <typename Base, INode Base::*Node, typename Elem> +void IList<Base, Node, Elem>::Push(Elem* e, INode* after) { + INode* n = ToNode(e); + DCHECK_EQ(n->next_, nullptr); + DCHECK_EQ(n->prev_, nullptr); + INode* next = after->next_; + n->next_ = next; + n->prev_ = after; + next->prev_ = n; + after->next_ = n; + size_++; +} + +template <typename Base, INode Base::*Node, typename Elem> +void IList<Base, Node, Elem>::Remove(Elem* e) { + INode* n = ToNode(e); + INode* next = n->next_; + INode* prev = n->prev_; + DCHECK(next); + DCHECK(prev); + DCHECK(size_); + next->prev_ = prev; + prev->next_ = next; + n->prev_ = n->next_ = nullptr; + size_--; +} + +template <typename Base, INode Base::*Node, typename Elem> +Elem* IList<Base, Node, Elem>::PopFront() { + Elem* e = Front(); + if (e) + Remove(e); + return e; +} + +template <typename Base, INode Base::*Node, typename Elem> +Elem* IList<Base, Node, Elem>::PopBack() { + Elem* e = Back(); + if (e) + Remove(e); + return e; +} + +template <typename Base, INode Base::*Node, typename Elem> +Elem* 
IList<Base, Node, Elem>::Front() { + return size_ ? ToElem(node_.next_) : nullptr; +} + +template <typename Base, INode Base::*Node, typename Elem> +Elem* IList<Base, Node, Elem>::Back() { + return size_ ? ToElem(node_.prev_) : nullptr; +} + +template <typename Base, INode Base::*Node, typename Elem> +Elem* IList<Base, Node, Elem>::Prev(Elem* e) { + INode* n = ToNode(e); + DCHECK(n->prev_); + return n->prev_ != &node_ ? ToElem(n->prev_) : nullptr; +} + +template <typename Base, INode Base::*Node, typename Elem> +Elem* IList<Base, Node, Elem>::Next(Elem* e) { + INode* n = ToNode(e); + DCHECK(n->next_); + return n->next_ != &node_ ? ToElem(n->next_) : nullptr; +} + +template <typename Base, INode Base::*Node, typename Elem> +uptr IList<Base, Node, Elem>::Size() const { + return size_; +} + +template <typename Base, INode Base::*Node, typename Elem> +bool IList<Base, Node, Elem>::Empty() const { + return size_ == 0; +} + +template <typename Base, INode Base::*Node, typename Elem> +bool IList<Base, Node, Elem>::Queued(Elem* e) const { + INode* n = ToNode(e); + DCHECK_EQ(!n->next_, !n->prev_); + return n->next_; +} + +template <typename Base, INode Base::*Node, typename Elem> +INode* IList<Base, Node, Elem>::ToNode(Elem* e) { + return &(e->*Node); +} + +template <typename Base, INode Base::*Node, typename Elem> +Elem* IList<Base, Node, Elem>::ToElem(INode* n) { + return static_cast<Elem*>(reinterpret_cast<Base*>( + reinterpret_cast<uptr>(n) - + reinterpret_cast<uptr>(&(reinterpret_cast<Elem*>(0)->*Node)))); +} + +} // namespace __tsan + +#endif diff --git a/libsanitizer/tsan/tsan_interceptors.h b/libsanitizer/tsan/tsan_interceptors.h index c5716f5..a855d1d 100644 --- a/libsanitizer/tsan/tsan_interceptors.h +++ b/libsanitizer/tsan/tsan_interceptors.h @@ -10,13 +10,22 @@ class ScopedInterceptor { public: ScopedInterceptor(ThreadState *thr, const char *fname, uptr pc); ~ScopedInterceptor(); - void DisableIgnores(); - void EnableIgnores(); + void DisableIgnores() { + if (UNLIKELY(ignoring_)) + DisableIgnoresImpl(); + } + void EnableIgnores() { + if (UNLIKELY(ignoring_)) + EnableIgnoresImpl(); + } + private: ThreadState *const thr_; - const uptr pc_; bool in_ignored_lib_; bool ignoring_; + + void DisableIgnoresImpl(); + void EnableIgnoresImpl(); }; LibIgnore *libignore(); @@ -36,18 +45,16 @@ inline bool in_symbolizer() { const uptr caller_pc = GET_CALLER_PC(); \ ScopedInterceptor si(thr, #func, caller_pc); \ const uptr pc = GET_CURRENT_PC(); \ - (void)pc; \ - /**/ + (void)pc; -#define SCOPED_TSAN_INTERCEPTOR(func, ...) \ - SCOPED_INTERCEPTOR_RAW(func, __VA_ARGS__); \ - if (REAL(func) == 0) { \ - Report("FATAL: ThreadSanitizer: failed to intercept %s\n", #func); \ - Die(); \ - } \ - if (!thr->is_inited || thr->ignore_interceptors || thr->in_ignored_lib) \ - return REAL(func)(__VA_ARGS__); \ -/**/ +#define SCOPED_TSAN_INTERCEPTOR(func, ...) 
\ + SCOPED_INTERCEPTOR_RAW(func, __VA_ARGS__); \ + if (REAL(func) == 0) { \ + Report("FATAL: ThreadSanitizer: failed to intercept %s\n", #func); \ + Die(); \ + } \ + if (!thr->is_inited || thr->ignore_interceptors || thr->in_ignored_lib) \ + return REAL(func)(__VA_ARGS__); #define SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START() \ si.DisableIgnores(); diff --git a/libsanitizer/tsan/tsan_interceptors_mac.cpp b/libsanitizer/tsan/tsan_interceptors_mac.cpp index 2d400c7..ed06415 100644 --- a/libsanitizer/tsan/tsan_interceptors_mac.cpp +++ b/libsanitizer/tsan/tsan_interceptors_mac.cpp @@ -365,7 +365,7 @@ static uptr GetOrCreateSyncAddress(uptr addr, ThreadState *thr, uptr pc) { if (h.created()) { ThreadIgnoreBegin(thr, pc); *h = (uptr) user_alloc(thr, pc, /*size=*/1); - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreEnd(thr); } return *h; } @@ -405,8 +405,8 @@ TSAN_INTERCEPTOR(int, swapcontext, ucontext_t *oucp, const ucontext_t *ucp) { { SCOPED_INTERCEPTOR_RAW(swapcontext, oucp, ucp); } - // Bacause of swapcontext() semantics we have no option but to copy its - // impementation here + // Because of swapcontext() semantics we have no option but to copy its + // implementation here if (!oucp || !ucp) { errno = EINVAL; return -1; diff --git a/libsanitizer/tsan/tsan_interceptors_posix.cpp b/libsanitizer/tsan/tsan_interceptors_posix.cpp index 6808f2e..d3e4c8f 100644 --- a/libsanitizer/tsan/tsan_interceptors_posix.cpp +++ b/libsanitizer/tsan/tsan_interceptors_posix.cpp @@ -96,9 +96,6 @@ extern "C" void _exit(int status); extern "C" int fileno_unlocked(void *stream); extern "C" int dirfd(void *dirp); #endif -#if SANITIZER_GLIBC -extern "C" int mallopt(int param, int value); -#endif #if SANITIZER_NETBSD extern __sanitizer_FILE __sF[]; #else @@ -161,7 +158,6 @@ const int SIG_SETMASK = 2; namespace __tsan { struct SignalDesc { bool armed; - bool sigaction; __sanitizer_siginfo siginfo; ucontext_t ctx; }; @@ -169,7 +165,6 @@ struct SignalDesc { struct ThreadSignalContext { int int_signal_send; atomic_uintptr_t in_blocking_func; - atomic_uintptr_t have_pending_signals; SignalDesc pending_signals[kSigCount]; // emptyset and oldset are too big for stack. 
__sanitizer_sigset_t emptyset; @@ -196,12 +191,10 @@ struct InterceptorContext { unsigned finalize_key; #endif - BlockingMutex atexit_mu; + Mutex atexit_mu; Vector<struct AtExitCtx *> AtExitStack; - InterceptorContext() - : libignore(LINKER_INITIALIZED), AtExitStack() { - } + InterceptorContext() : libignore(LINKER_INITIALIZED), atexit_mu(MutexTypeAtExit), AtExitStack() {} }; static ALIGNED(64) char interceptor_placeholder[sizeof(InterceptorContext)]; @@ -250,8 +243,8 @@ static ThreadSignalContext *SigCtx(ThreadState *thr) { ScopedInterceptor::ScopedInterceptor(ThreadState *thr, const char *fname, uptr pc) - : thr_(thr), pc_(pc), in_ignored_lib_(false), ignoring_(false) { - Initialize(thr); + : thr_(thr), in_ignored_lib_(false), ignoring_(false) { + LazyInitialize(thr); if (!thr_->is_inited) return; if (!thr_->ignore_interceptors) FuncEntry(thr, pc); DPrintf("#%d: intercept %s()\n", thr_->tid, fname); @@ -267,29 +260,29 @@ ScopedInterceptor::~ScopedInterceptor() { if (!thr_->ignore_interceptors) { ProcessPendingSignals(thr_); FuncExit(thr_); - CheckNoLocks(thr_); + CheckedMutex::CheckNoLocks(); } } -void ScopedInterceptor::EnableIgnores() { - if (ignoring_) { - ThreadIgnoreBegin(thr_, pc_, /*save_stack=*/false); - if (flags()->ignore_noninstrumented_modules) thr_->suppress_reports++; - if (in_ignored_lib_) { - DCHECK(!thr_->in_ignored_lib); - thr_->in_ignored_lib = true; - } +NOINLINE +void ScopedInterceptor::EnableIgnoresImpl() { + ThreadIgnoreBegin(thr_, 0); + if (flags()->ignore_noninstrumented_modules) + thr_->suppress_reports++; + if (in_ignored_lib_) { + DCHECK(!thr_->in_ignored_lib); + thr_->in_ignored_lib = true; } } -void ScopedInterceptor::DisableIgnores() { - if (ignoring_) { - ThreadIgnoreEnd(thr_, pc_); - if (flags()->ignore_noninstrumented_modules) thr_->suppress_reports--; - if (in_ignored_lib_) { - DCHECK(thr_->in_ignored_lib); - thr_->in_ignored_lib = false; - } +NOINLINE +void ScopedInterceptor::DisableIgnoresImpl() { + ThreadIgnoreEnd(thr_); + if (flags()->ignore_noninstrumented_modules) + thr_->suppress_reports--; + if (in_ignored_lib_) { + DCHECK(thr_->in_ignored_lib); + thr_->in_ignored_lib = false; } } @@ -325,7 +318,7 @@ struct BlockingCall { , ctx(SigCtx(thr)) { for (;;) { atomic_store(&ctx->in_blocking_func, 1, memory_order_relaxed); - if (atomic_load(&ctx->have_pending_signals, memory_order_relaxed) == 0) + if (atomic_load(&thr->pending_signals, memory_order_relaxed) == 0) break; atomic_store(&ctx->in_blocking_func, 0, memory_order_relaxed); ProcessPendingSignals(thr); @@ -377,7 +370,7 @@ static void at_exit_wrapper() { AtExitCtx *ctx; { // Ensure thread-safety. 
- BlockingMutexLock l(&interceptor_ctx()->atexit_mu); + Lock l(&interceptor_ctx()->atexit_mu); // Pop AtExitCtx from the top of the stack of callback functions uptr element = interceptor_ctx()->AtExitStack.Size() - 1; @@ -387,14 +380,14 @@ static void at_exit_wrapper() { Acquire(cur_thread(), (uptr)0, (uptr)ctx); ((void(*)())ctx->f)(); - InternalFree(ctx); + Free(ctx); } static void cxa_at_exit_wrapper(void *arg) { Acquire(cur_thread(), 0, (uptr)arg); AtExitCtx *ctx = (AtExitCtx*)arg; ((void(*)(void *arg))ctx->f)(ctx->arg); - InternalFree(ctx); + Free(ctx); } static int setup_at_exit_wrapper(ThreadState *thr, uptr pc, void(*f)(), @@ -420,7 +413,7 @@ TSAN_INTERCEPTOR(int, __cxa_atexit, void (*f)(void *a), void *arg, void *dso) { static int setup_at_exit_wrapper(ThreadState *thr, uptr pc, void(*f)(), void *arg, void *dso) { - AtExitCtx *ctx = (AtExitCtx*)InternalAlloc(sizeof(AtExitCtx)); + auto *ctx = New<AtExitCtx>(); ctx->f = f; ctx->arg = arg; Release(thr, pc, (uptr)ctx); @@ -433,7 +426,10 @@ static int setup_at_exit_wrapper(ThreadState *thr, uptr pc, void(*f)(), // Store ctx in a local stack-like structure // Ensure thread-safety. - BlockingMutexLock l(&interceptor_ctx()->atexit_mu); + Lock l(&interceptor_ctx()->atexit_mu); + // __cxa_atexit calls calloc. If we don't ignore interceptors, we will fail + // due to atexit_mu held on exit from the calloc interceptor. + ScopedIgnoreInterceptors ignore; res = REAL(__cxa_atexit)((void (*)(void *a))at_exit_wrapper, 0, 0); // Push AtExitCtx on the top of the stack of callback functions @@ -443,7 +439,7 @@ static int setup_at_exit_wrapper(ThreadState *thr, uptr pc, void(*f)(), } else { res = REAL(__cxa_atexit)(cxa_at_exit_wrapper, ctx, dso); } - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreEnd(thr); return res; } @@ -454,14 +450,14 @@ static void on_exit_wrapper(int status, void *arg) { Acquire(thr, pc, (uptr)arg); AtExitCtx *ctx = (AtExitCtx*)arg; ((void(*)(int status, void *arg))ctx->f)(status, ctx->arg); - InternalFree(ctx); + Free(ctx); } TSAN_INTERCEPTOR(int, on_exit, void(*f)(int, void*), void *arg) { if (in_symbolizer()) return 0; SCOPED_TSAN_INTERCEPTOR(on_exit, f, arg); - AtExitCtx *ctx = (AtExitCtx*)InternalAlloc(sizeof(AtExitCtx)); + auto *ctx = New<AtExitCtx>(); ctx->f = (void(*)())f; ctx->arg = arg; Release(thr, pc, (uptr)ctx); @@ -469,7 +465,7 @@ TSAN_INTERCEPTOR(int, on_exit, void(*f)(int, void*), void *arg) { // because we do not see synchronization around atexit callback list. ThreadIgnoreBegin(thr, pc); int res = REAL(on_exit)(on_exit_wrapper, ctx); - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreEnd(thr); return res; } #define TSAN_MAYBE_INTERCEPT_ON_EXIT TSAN_INTERCEPT(on_exit) @@ -848,6 +844,53 @@ TSAN_INTERCEPTOR(int, posix_memalign, void **memptr, uptr align, uptr sz) { } #endif +// Both __cxa_guard_acquire and pthread_once 0-initialize +// the object initially. pthread_once does not have any +// other ABI requirements. __cxa_guard_acquire assumes +// that any non-0 value in the first byte means that +// initialization is completed. Contents of the remaining +// bytes are up to us. 
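Editorial aside: the constants and guard_acquire/guard_release helpers added just below implement this contract as a four-state guard word (init/running/waiter/done) parked on FutexWait/FutexWake. A minimal standalone sketch of the same state machine, using portable C++20 std::atomic wait/notify in place of the sanitizer futex wrappers (the constant values mirror the patch; everything else is illustrative, not the real runtime):

#include <atomic>
#include <cstdint>

constexpr uint32_t kGuardInit    = 0;        // not yet initialized
constexpr uint32_t kGuardDone    = 1;        // initialization completed
constexpr uint32_t kGuardRunning = 1u << 16; // a thread is running the initializer
constexpr uint32_t kGuardWaiter  = 1u << 17; // at least one thread is blocked

// Returns true if the caller won the race and must run the initializer and
// then call guard_release(); returns false once initialization is done.
bool guard_acquire(std::atomic<uint32_t>& g) {
  for (;;) {
    uint32_t cmp = g.load(std::memory_order_acquire);
    if (cmp == kGuardInit) {
      if (g.compare_exchange_strong(cmp, kGuardRunning,
                                    std::memory_order_relaxed))
        return true;
    } else if (cmp == kGuardDone) {
      return false;
    } else {
      // Advertise a waiter, then block until the guard word changes.
      if ((cmp & kGuardWaiter) ||
          g.compare_exchange_strong(cmp, cmp | kGuardWaiter,
                                    std::memory_order_relaxed))
        g.wait(cmp | kGuardWaiter);  // the FutexWait analogue
    }
  }
}

void guard_release(std::atomic<uint32_t>& g) {
  uint32_t old = g.exchange(kGuardDone, std::memory_order_release);
  if (old & kGuardWaiter)
    g.notify_all();  // the FutexWake analogue
}

The real helpers additionally wrap the wait in OnPotentiallyBlockingRegionBegin/End hooks and issue Acquire/Release annotations so the initializer's effects are ordered for the race detector; those parts are omitted from the sketch.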
+constexpr u32 kGuardInit = 0; +constexpr u32 kGuardDone = 1; +constexpr u32 kGuardRunning = 1 << 16; +constexpr u32 kGuardWaiter = 1 << 17; + +static int guard_acquire(ThreadState *thr, uptr pc, atomic_uint32_t *g, + bool blocking_hooks = true) { + if (blocking_hooks) + OnPotentiallyBlockingRegionBegin(); + auto on_exit = at_scope_exit([blocking_hooks] { + if (blocking_hooks) + OnPotentiallyBlockingRegionEnd(); + }); + + for (;;) { + u32 cmp = atomic_load(g, memory_order_acquire); + if (cmp == kGuardInit) { + if (atomic_compare_exchange_strong(g, &cmp, kGuardRunning, + memory_order_relaxed)) + return 1; + } else if (cmp == kGuardDone) { + if (!thr->in_ignored_lib) + Acquire(thr, pc, (uptr)g); + return 0; + } else { + if ((cmp & kGuardWaiter) || + atomic_compare_exchange_strong(g, &cmp, cmp | kGuardWaiter, + memory_order_relaxed)) + FutexWait(g, cmp | kGuardWaiter); + } + } +} + +static void guard_release(ThreadState *thr, uptr pc, atomic_uint32_t *g) { + if (!thr->in_ignored_lib) + Release(thr, pc, (uptr)g); + u32 old = atomic_exchange(g, kGuardDone, memory_order_release); + if (old & kGuardWaiter) + FutexWake(g, 1 << 30); +} + // __cxa_guard_acquire and friends need to be intercepted in a special way - // regular interceptors will break statically-linked libstdc++. Linux // interceptors are especially defined as weak functions (so that they don't @@ -868,31 +911,17 @@ TSAN_INTERCEPTOR(int, posix_memalign, void **memptr, uptr align, uptr sz) { // Used in thread-safe function static initialization. STDCXX_INTERCEPTOR(int, __cxa_guard_acquire, atomic_uint32_t *g) { SCOPED_INTERCEPTOR_RAW(__cxa_guard_acquire, g); - OnPotentiallyBlockingRegionBegin(); - auto on_exit = at_scope_exit(&OnPotentiallyBlockingRegionEnd); - for (;;) { - u32 cmp = atomic_load(g, memory_order_acquire); - if (cmp == 0) { - if (atomic_compare_exchange_strong(g, &cmp, 1<<16, memory_order_relaxed)) - return 1; - } else if (cmp == 1) { - Acquire(thr, pc, (uptr)g); - return 0; - } else { - internal_sched_yield(); - } - } + return guard_acquire(thr, pc, g); } STDCXX_INTERCEPTOR(void, __cxa_guard_release, atomic_uint32_t *g) { SCOPED_INTERCEPTOR_RAW(__cxa_guard_release, g); - Release(thr, pc, (uptr)g); - atomic_store(g, 1, memory_order_release); + guard_release(thr, pc, g); } STDCXX_INTERCEPTOR(void, __cxa_guard_abort, atomic_uint32_t *g) { SCOPED_INTERCEPTOR_RAW(__cxa_guard_abort, g); - atomic_store(g, 0, memory_order_relaxed); + atomic_store(g, kGuardInit, memory_order_relaxed); } namespace __tsan { @@ -934,14 +963,15 @@ static void thread_finalize(void *v) { struct ThreadParam { void* (*callback)(void *arg); void *param; - atomic_uintptr_t tid; + Tid tid; + Semaphore created; + Semaphore started; }; extern "C" void *__tsan_thread_start_func(void *arg) { ThreadParam *p = (ThreadParam*)arg; void* (*callback)(void *arg) = p->callback; void *param = p->param; - int tid = 0; { cur_thread_init(); ThreadState *thr = cur_thread(); @@ -954,14 +984,13 @@ extern "C" void *__tsan_thread_start_func(void *arg) { Printf("ThreadSanitizer: failed to set thread key\n"); Die(); } - ThreadIgnoreEnd(thr, 0); + ThreadIgnoreEnd(thr); #endif - while ((tid = atomic_load(&p->tid, memory_order_acquire)) == 0) - internal_sched_yield(); + p->created.Wait(); Processor *proc = ProcCreate(); ProcWire(proc, thr); - ThreadStart(thr, tid, GetTid(), ThreadType::Regular); - atomic_store(&p->tid, 0, memory_order_release); + ThreadStart(thr, p->tid, GetTid(), ThreadType::Regular); + p->started.Post(); } void *res = callback(param); // Prevent the callback from 
being tail called, @@ -983,9 +1012,11 @@ TSAN_INTERCEPTOR(int, pthread_create, "fork is not supported. Dying (set die_after_fork=0 to override)\n"); Die(); } else { - VPrintf(1, "ThreadSanitizer: starting new threads after multi-threaded " - "fork is not supported (pid %d). Continuing because of " - "die_after_fork=0, but you are on your own\n", internal_getpid()); + VPrintf(1, + "ThreadSanitizer: starting new threads after multi-threaded " + "fork is not supported (pid %lu). Continuing because of " + "die_after_fork=0, but you are on your own\n", + internal_getpid()); } } __sanitizer_pthread_attr_t myattr; @@ -1000,18 +1031,18 @@ TSAN_INTERCEPTOR(int, pthread_create, ThreadParam p; p.callback = callback; p.param = param; - atomic_store(&p.tid, 0, memory_order_relaxed); + p.tid = kMainTid; int res = -1; { // Otherwise we see false positives in pthread stack manipulation. ScopedIgnoreInterceptors ignore; ThreadIgnoreBegin(thr, pc); res = REAL(pthread_create)(th, attr, __tsan_thread_start_func, &p); - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreEnd(thr); } if (res == 0) { - int tid = ThreadCreate(thr, pc, *(uptr*)th, IsStateDetached(detached)); - CHECK_NE(tid, 0); + p.tid = ThreadCreate(thr, pc, *(uptr *)th, IsStateDetached(detached)); + CHECK_NE(p.tid, kMainTid); // Synchronization on p.tid serves two purposes: // 1. ThreadCreate must finish before the new thread starts. // Otherwise the new thread can call pthread_detach, but the pthread_t @@ -1019,9 +1050,8 @@ TSAN_INTERCEPTOR(int, pthread_create, // 2. ThreadStart must finish before this thread continues. // Otherwise, this thread can call pthread_detach and reset thr->sync // before the new thread got a chance to acquire from it in ThreadStart. - atomic_store(&p.tid, tid, memory_order_release); - while (atomic_load(&p.tid, memory_order_acquire) != 0) - internal_sched_yield(); + p.created.Post(); + p.started.Wait(); } if (attr == &myattr) pthread_attr_destroy(&myattr); @@ -1030,10 +1060,10 @@ TSAN_INTERCEPTOR(int, pthread_create, TSAN_INTERCEPTOR(int, pthread_join, void *th, void **ret) { SCOPED_INTERCEPTOR_RAW(pthread_join, th, ret); - int tid = ThreadConsumeTid(thr, pc, (uptr)th); + Tid tid = ThreadConsumeTid(thr, pc, (uptr)th); ThreadIgnoreBegin(thr, pc); int res = BLOCK_REAL(pthread_join)(th, ret); - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreEnd(thr); if (res == 0) { ThreadJoin(thr, pc, tid); } @@ -1044,7 +1074,7 @@ DEFINE_REAL_PTHREAD_FUNCTIONS TSAN_INTERCEPTOR(int, pthread_detach, void *th) { SCOPED_INTERCEPTOR_RAW(pthread_detach, th); - int tid = ThreadConsumeTid(thr, pc, (uptr)th); + Tid tid = ThreadConsumeTid(thr, pc, (uptr)th); int res = REAL(pthread_detach)(th); if (res == 0) { ThreadDetach(thr, pc, tid); @@ -1065,10 +1095,10 @@ TSAN_INTERCEPTOR(void, pthread_exit, void *retval) { #if SANITIZER_LINUX TSAN_INTERCEPTOR(int, pthread_tryjoin_np, void *th, void **ret) { SCOPED_INTERCEPTOR_RAW(pthread_tryjoin_np, th, ret); - int tid = ThreadConsumeTid(thr, pc, (uptr)th); + Tid tid = ThreadConsumeTid(thr, pc, (uptr)th); ThreadIgnoreBegin(thr, pc); int res = REAL(pthread_tryjoin_np)(th, ret); - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreEnd(thr); if (res == 0) ThreadJoin(thr, pc, tid); else @@ -1079,10 +1109,10 @@ TSAN_INTERCEPTOR(int, pthread_tryjoin_np, void *th, void **ret) { TSAN_INTERCEPTOR(int, pthread_timedjoin_np, void *th, void **ret, const struct timespec *abstime) { SCOPED_INTERCEPTOR_RAW(pthread_timedjoin_np, th, ret, abstime); - int tid = ThreadConsumeTid(thr, pc, (uptr)th); + Tid tid = ThreadConsumeTid(thr, pc, (uptr)th); 
ThreadIgnoreBegin(thr, pc); int res = BLOCK_REAL(pthread_timedjoin_np)(th, ret, abstime); - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreEnd(thr); if (res == 0) ThreadJoin(thr, pc, tid); else @@ -1446,14 +1476,14 @@ TSAN_INTERCEPTOR(int, pthread_rwlock_unlock, void *m) { #if !SANITIZER_MAC TSAN_INTERCEPTOR(int, pthread_barrier_init, void *b, void *a, unsigned count) { SCOPED_TSAN_INTERCEPTOR(pthread_barrier_init, b, a, count); - MemoryWrite(thr, pc, (uptr)b, kSizeLog1); + MemoryAccess(thr, pc, (uptr)b, 1, kAccessWrite); int res = REAL(pthread_barrier_init)(b, a, count); return res; } TSAN_INTERCEPTOR(int, pthread_barrier_destroy, void *b) { SCOPED_TSAN_INTERCEPTOR(pthread_barrier_destroy, b); - MemoryWrite(thr, pc, (uptr)b, kSizeLog1); + MemoryAccess(thr, pc, (uptr)b, 1, kAccessWrite); int res = REAL(pthread_barrier_destroy)(b); return res; } @@ -1461,9 +1491,9 @@ TSAN_INTERCEPTOR(int, pthread_barrier_destroy, void *b) { TSAN_INTERCEPTOR(int, pthread_barrier_wait, void *b) { SCOPED_TSAN_INTERCEPTOR(pthread_barrier_wait, b); Release(thr, pc, (uptr)b); - MemoryRead(thr, pc, (uptr)b, kSizeLog1); + MemoryAccess(thr, pc, (uptr)b, 1, kAccessRead); int res = REAL(pthread_barrier_wait)(b); - MemoryRead(thr, pc, (uptr)b, kSizeLog1); + MemoryAccess(thr, pc, (uptr)b, 1, kAccessRead); if (res == 0 || res == PTHREAD_BARRIER_SERIAL_THREAD) { Acquire(thr, pc, (uptr)b); } @@ -1485,20 +1515,11 @@ TSAN_INTERCEPTOR(int, pthread_once, void *o, void (*f)()) { else a = static_cast<atomic_uint32_t*>(o); - u32 v = atomic_load(a, memory_order_acquire); - if (v == 0 && atomic_compare_exchange_strong(a, &v, 1, - memory_order_relaxed)) { + // Mac OS X appears to use pthread_once() where calling BlockingRegion hooks + // result in crashes due to too little stack space. + if (guard_acquire(thr, pc, a, !SANITIZER_MAC)) { (*f)(); - if (!thr->in_ignored_lib) - Release(thr, pc, (uptr)o); - atomic_store(a, 2, memory_order_release); - } else { - while (v != 2) { - internal_sched_yield(); - v = atomic_load(a, memory_order_acquire); - } - if (!thr->in_ignored_lib) - Acquire(thr, pc, (uptr)o); + guard_release(thr, pc, a); } return 0; } @@ -1932,24 +1953,45 @@ TSAN_INTERCEPTOR(int, pthread_sigmask, int how, const __sanitizer_sigset_t *set, namespace __tsan { +static void ReportErrnoSpoiling(ThreadState *thr, uptr pc) { + VarSizeStackTrace stack; + // StackTrace::GetNestInstructionPc(pc) is used because return address is + // expected, OutputReport() will undo this. + ObtainCurrentStack(thr, StackTrace::GetNextInstructionPc(pc), &stack); + ThreadRegistryLock l(&ctx->thread_registry); + ScopedReport rep(ReportTypeErrnoInSignal); + if (!IsFiredSuppression(ctx, ReportTypeErrnoInSignal, stack)) { + rep.AddStack(stack, true); + OutputReport(thr, rep); + } +} + static void CallUserSignalHandler(ThreadState *thr, bool sync, bool acquire, - bool sigact, int sig, - __sanitizer_siginfo *info, void *uctx) { + int sig, __sanitizer_siginfo *info, + void *uctx) { __sanitizer_sigaction *sigactions = interceptor_ctx()->sigactions; if (acquire) Acquire(thr, 0, (uptr)&sigactions[sig]); // Signals are generally asynchronous, so if we receive a signals when // ignores are enabled we should disable ignores. This is critical for sync - // and interceptors, because otherwise we can miss syncronization and report + // and interceptors, because otherwise we can miss synchronization and report // false races. 
int ignore_reads_and_writes = thr->ignore_reads_and_writes; int ignore_interceptors = thr->ignore_interceptors; int ignore_sync = thr->ignore_sync; + // For symbolizer we only process SIGSEGVs synchronously + // (bug in symbolizer or in tsan). But we want to reset + // in_symbolizer to fail gracefully. Symbolizer and user code + // use different memory allocators, so if we don't reset + // in_symbolizer we can get memory allocated with one being + // feed with another, which can cause more crashes. + int in_symbolizer = thr->in_symbolizer; if (!ctx->after_multithreaded_fork) { thr->ignore_reads_and_writes = 0; thr->fast_state.ClearIgnoreBit(); thr->ignore_interceptors = 0; thr->ignore_sync = 0; + thr->in_symbolizer = 0; } // Ensure that the handler does not spoil errno. const int saved_errno = errno; @@ -1957,13 +1999,14 @@ static void CallUserSignalHandler(ThreadState *thr, bool sync, bool acquire, // This code races with sigaction. Be careful to not read sa_sigaction twice. // Also need to remember pc for reporting before the call, // because the handler can reset it. - volatile uptr pc = - sigact ? (uptr)sigactions[sig].sigaction : (uptr)sigactions[sig].handler; + volatile uptr pc = (sigactions[sig].sa_flags & SA_SIGINFO) + ? (uptr)sigactions[sig].sigaction + : (uptr)sigactions[sig].handler; if (pc != sig_dfl && pc != sig_ign) { - if (sigact) - ((__sanitizer_sigactionhandler_ptr)pc)(sig, info, uctx); - else - ((__sanitizer_sighandler_ptr)pc)(sig); + // The callback can be either sa_handler or sa_sigaction. + // They have different signatures, but we assume that passing + // additional arguments to sa_handler works and is harmless. + ((__sanitizer_sigactionhandler_ptr)pc)(sig, info, uctx); } if (!ctx->after_multithreaded_fork) { thr->ignore_reads_and_writes = ignore_reads_and_writes; @@ -1971,6 +2014,7 @@ static void CallUserSignalHandler(ThreadState *thr, bool sync, bool acquire, thr->fast_state.SetIgnoreBit(); thr->ignore_interceptors = ignore_interceptors; thr->ignore_sync = ignore_sync; + thr->in_symbolizer = in_symbolizer; } // We do not detect errno spoiling for SIGTERM, // because some SIGTERM handlers do spoil errno but reraise SIGTERM, @@ -1980,27 +2024,16 @@ static void CallUserSignalHandler(ThreadState *thr, bool sync, bool acquire, // from rtl_generic_sighandler) we have not yet received the reraised // signal; and it looks too fragile to intercept all ways to reraise a signal. if (ShouldReport(thr, ReportTypeErrnoInSignal) && !sync && sig != SIGTERM && - errno != 99) { - VarSizeStackTrace stack; - // StackTrace::GetNestInstructionPc(pc) is used because return address is - // expected, OutputReport() will undo this. 
- ObtainCurrentStack(thr, StackTrace::GetNextInstructionPc(pc), &stack); - ThreadRegistryLock l(ctx->thread_registry); - ScopedReport rep(ReportTypeErrnoInSignal); - if (!IsFiredSuppression(ctx, ReportTypeErrnoInSignal, stack)) { - rep.AddStack(stack, true); - OutputReport(thr, rep); - } - } + errno != 99) + ReportErrnoSpoiling(thr, pc); errno = saved_errno; } -void ProcessPendingSignals(ThreadState *thr) { +void ProcessPendingSignalsImpl(ThreadState *thr) { + atomic_store(&thr->pending_signals, 0, memory_order_relaxed); ThreadSignalContext *sctx = SigCtx(thr); - if (sctx == 0 || - atomic_load(&sctx->have_pending_signals, memory_order_relaxed) == 0) + if (sctx == 0) return; - atomic_store(&sctx->have_pending_signals, 0, memory_order_relaxed); atomic_fetch_add(&thr->in_signal_handler, 1, memory_order_relaxed); internal_sigfillset(&sctx->emptyset); int res = REAL(pthread_sigmask)(SIG_SETMASK, &sctx->emptyset, &sctx->oldset); @@ -2009,8 +2042,8 @@ void ProcessPendingSignals(ThreadState *thr) { SignalDesc *signal = &sctx->pending_signals[sig]; if (signal->armed) { signal->armed = false; - CallUserSignalHandler(thr, false, true, signal->sigaction, sig, - &signal->siginfo, &signal->ctx); + CallUserSignalHandler(thr, false, true, sig, &signal->siginfo, + &signal->ctx); } } res = REAL(pthread_sigmask)(SIG_SETMASK, &sctx->oldset, 0); @@ -2027,9 +2060,7 @@ static bool is_sync_signal(ThreadSignalContext *sctx, int sig) { (sctx && sig == sctx->int_signal_send); } -void ALWAYS_INLINE rtl_generic_sighandler(bool sigact, int sig, - __sanitizer_siginfo *info, - void *ctx) { +void sighandler(int sig, __sanitizer_siginfo *info, void *ctx) { cur_thread_init(); ThreadState *thr = cur_thread(); ThreadSignalContext *sctx = SigCtx(thr); @@ -2047,7 +2078,7 @@ void ALWAYS_INLINE rtl_generic_sighandler(bool sigact, int sig, atomic_fetch_add(&thr->in_signal_handler, 1, memory_order_relaxed); if (sctx && atomic_load(&sctx->in_blocking_func, memory_order_relaxed)) { atomic_store(&sctx->in_blocking_func, 0, memory_order_relaxed); - CallUserSignalHandler(thr, sync, true, sigact, sig, info, ctx); + CallUserSignalHandler(thr, sync, true, sig, info, ctx); atomic_store(&sctx->in_blocking_func, 1, memory_order_relaxed); } else { // Be very conservative with when we do acquire in this case. @@ -2056,7 +2087,7 @@ void ALWAYS_INLINE rtl_generic_sighandler(bool sigact, int sig, // SIGSYS looks relatively safe -- it's synchronous and can actually // need some global state. 
bool acq = (sig == SIGSYS); - CallUserSignalHandler(thr, sync, acq, sigact, sig, info, ctx); + CallUserSignalHandler(thr, sync, acq, sig, info, ctx); } atomic_fetch_add(&thr->in_signal_handler, -1, memory_order_relaxed); return; @@ -2067,23 +2098,12 @@ void ALWAYS_INLINE rtl_generic_sighandler(bool sigact, int sig, SignalDesc *signal = &sctx->pending_signals[sig]; if (signal->armed == false) { signal->armed = true; - signal->sigaction = sigact; - if (info) - internal_memcpy(&signal->siginfo, info, sizeof(*info)); - if (ctx) - internal_memcpy(&signal->ctx, ctx, sizeof(signal->ctx)); - atomic_store(&sctx->have_pending_signals, 1, memory_order_relaxed); + internal_memcpy(&signal->siginfo, info, sizeof(*info)); + internal_memcpy(&signal->ctx, ctx, sizeof(signal->ctx)); + atomic_store(&thr->pending_signals, 1, memory_order_relaxed); } } -static void rtl_sighandler(int sig) { - rtl_generic_sighandler(false, sig, 0, 0); -} - -static void rtl_sigaction(int sig, __sanitizer_siginfo *info, void *ctx) { - rtl_generic_sighandler(true, sig, info, ctx); -} - TSAN_INTERCEPTOR(int, raise, int sig) { SCOPED_TSAN_INTERCEPTOR(raise, sig); ThreadSignalContext *sctx = SigCtx(thr); @@ -2142,7 +2162,7 @@ TSAN_INTERCEPTOR(int, getaddrinfo, void *node, void *service, // inside of getaddrinfo. So ignore memory accesses. ThreadIgnoreBegin(thr, pc); int res = REAL(getaddrinfo)(node, service, hints, rv); - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreEnd(thr); return res; } @@ -2206,7 +2226,7 @@ struct dl_iterate_phdr_data { }; static bool IsAppNotRodata(uptr addr) { - return IsAppMem(addr) && *(u64*)MemToShadow(addr) != kShadowRodata; + return IsAppMem(addr) && *MemToShadow(addr) != kShadowRodata; } static int dl_iterate_phdr_cb(__sanitizer_dl_phdr_info *info, SIZE_T size, @@ -2249,7 +2269,6 @@ static int OnExit(ThreadState *thr) { struct TsanInterceptorContext { ThreadState *thr; - const uptr caller_pc; const uptr pc; }; @@ -2290,17 +2309,17 @@ static void HandleRecvmsg(ThreadState *thr, uptr pc, ((TsanInterceptorContext *) ctx)->pc, (uptr) ptr, size, \ false) -#define COMMON_INTERCEPTOR_ENTER(ctx, func, ...) \ - SCOPED_TSAN_INTERCEPTOR(func, __VA_ARGS__); \ - TsanInterceptorContext _ctx = {thr, caller_pc, pc}; \ - ctx = (void *)&_ctx; \ - (void) ctx; +#define COMMON_INTERCEPTOR_ENTER(ctx, func, ...) \ + SCOPED_TSAN_INTERCEPTOR(func, __VA_ARGS__); \ + TsanInterceptorContext _ctx = {thr, pc}; \ + ctx = (void *)&_ctx; \ + (void)ctx; #define COMMON_INTERCEPTOR_ENTER_NOIGNORE(ctx, func, ...) \ SCOPED_INTERCEPTOR_RAW(func, __VA_ARGS__); \ - TsanInterceptorContext _ctx = {thr, caller_pc, pc}; \ + TsanInterceptorContext _ctx = {thr, pc}; \ ctx = (void *)&_ctx; \ - (void) ctx; + (void)ctx; #define COMMON_INTERCEPTOR_FILE_OPEN(ctx, file, path) \ if (path) \ @@ -2347,7 +2366,7 @@ static void HandleRecvmsg(ThreadState *thr, uptr pc, ThreadSetName(((TsanInterceptorContext *) ctx)->thr, name) #define COMMON_INTERCEPTOR_SET_PTHREAD_NAME(ctx, thread, name) \ - __tsan::ctx->thread_registry->SetThreadNameByUserId(thread, name) + __tsan::ctx->thread_registry.SetThreadNameByUserId(thread, name) #define COMMON_INTERCEPTOR_BLOCK_REAL(name) BLOCK_REAL(name) @@ -2419,9 +2438,13 @@ static __sanitizer_sighandler_ptr signal_impl(int sig, int sigaction_impl(int sig, const __sanitizer_sigaction *act, __sanitizer_sigaction *old) { // Note: if we call REAL(sigaction) directly for any reason without proxying - // the signal handler through rtl_sigaction, very bad things will happen. 
+ // the signal handler through sighandler, very bad things will happen. // The handler will run synchronously and corrupt tsan per-thread state. SCOPED_INTERCEPTOR_RAW(sigaction, sig, act, old); + if (sig <= 0 || sig >= kSigCount) { + errno = errno_EINVAL; + return -1; + } __sanitizer_sigaction *sigactions = interceptor_ctx()->sigactions; __sanitizer_sigaction old_stored; if (old) internal_memcpy(&old_stored, &sigactions[sig], sizeof(old_stored)); @@ -2443,22 +2466,17 @@ int sigaction_impl(int sig, const __sanitizer_sigaction *act, #endif internal_memcpy(&newact, act, sizeof(newact)); internal_sigfillset(&newact.sa_mask); - if ((uptr)act->handler != sig_ign && (uptr)act->handler != sig_dfl) { - if (newact.sa_flags & SA_SIGINFO) - newact.sigaction = rtl_sigaction; - else - newact.handler = rtl_sighandler; + if ((act->sa_flags & SA_SIGINFO) || + ((uptr)act->handler != sig_ign && (uptr)act->handler != sig_dfl)) { + newact.sa_flags |= SA_SIGINFO; + newact.sigaction = sighandler; } ReleaseStore(thr, pc, (uptr)&sigactions[sig]); act = &newact; } int res = REAL(sigaction)(sig, act, old); - if (res == 0 && old) { - uptr cb = (uptr)old->sigaction; - if (cb == (uptr)rtl_sigaction || cb == (uptr)rtl_sighandler) { - internal_memcpy(old, &old_stored, sizeof(*old)); - } - } + if (res == 0 && old && old->sigaction == sighandler) + internal_memcpy(old, &old_stored, sizeof(*old)); return res; } @@ -2474,20 +2492,16 @@ static __sanitizer_sighandler_ptr signal_impl(int sig, return old.handler; } -#define TSAN_SYSCALL() \ +#define TSAN_SYSCALL() \ ThreadState *thr = cur_thread(); \ - if (thr->ignore_interceptors) \ - return; \ - ScopedSyscall scoped_syscall(thr) \ -/**/ + if (thr->ignore_interceptors) \ + return; \ + ScopedSyscall scoped_syscall(thr) struct ScopedSyscall { ThreadState *thr; - explicit ScopedSyscall(ThreadState *thr) - : thr(thr) { - Initialize(thr); - } + explicit ScopedSyscall(ThreadState *thr) : thr(thr) { LazyInitialize(thr); } ~ScopedSyscall() { ProcessPendingSignals(thr); @@ -2503,12 +2517,12 @@ static void syscall_access_range(uptr pc, uptr p, uptr s, bool write) { static USED void syscall_acquire(uptr pc, uptr addr) { TSAN_SYSCALL(); Acquire(thr, pc, addr); - DPrintf("syscall_acquire(%p)\n", addr); + DPrintf("syscall_acquire(0x%zx))\n", addr); } static USED void syscall_release(uptr pc, uptr addr) { TSAN_SYSCALL(); - DPrintf("syscall_release(%p)\n", addr); + DPrintf("syscall_release(0x%zx)\n", addr); Release(thr, pc, addr); } @@ -2520,12 +2534,12 @@ static void syscall_fd_close(uptr pc, int fd) { static USED void syscall_fd_acquire(uptr pc, int fd) { TSAN_SYSCALL(); FdAcquire(thr, pc, fd); - DPrintf("syscall_fd_acquire(%p)\n", fd); + DPrintf("syscall_fd_acquire(%d)\n", fd); } static USED void syscall_fd_release(uptr pc, int fd) { TSAN_SYSCALL(); - DPrintf("syscall_fd_release(%p)\n", fd); + DPrintf("syscall_fd_release(%d)\n", fd); FdRelease(thr, pc, fd); } @@ -2695,12 +2709,6 @@ void InitializeInterceptors() { REAL(memcpy) = internal_memcpy; #endif - // Instruct libc malloc to consume less memory. -#if SANITIZER_GLIBC - mallopt(1, 0); // M_MXFAST - mallopt(-3, 32*1024); // M_MMAP_THRESHOLD -#endif - new(interceptor_ctx()) InterceptorContext(); InitializeCommonInterceptors(); @@ -2915,25 +2923,36 @@ void InitializeInterceptors() { // Note that no_sanitize_thread attribute does not turn off atomic interception // so attaching it to the function defined in user code does not help. // That's why we now have what we have. 
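Editorial aside: the reworked test-only barrier in the next hunk packs the participant count into the low 10 bits of a single 32-bit word and the running arrival count into the upper bits; a thread's "epoch" (arrivals divided by participants) tells it whether its generation of the barrier has already been released, and FutexWait/FutexWake replace the old sched_yield spin. The same scheme with portable C++20 atomics, as an illustration only (not the patch's code, which uses the sanitizer atomic and futex primitives):

#include <atomic>
#include <cstdint>

constexpr uint32_t kThreadBits = 10;
constexpr uint32_t kThreads = 1u << kThreadBits;

// Generation number: total arrivals divided by the participant count
// stored in the low bits.
inline uint32_t barrier_epoch(uint32_t v) {
  return (v >> kThreadBits) / (v & (kThreads - 1));
}

void barrier_init(std::atomic<uint32_t>& b, uint32_t num_threads) {
  // num_threads must be in (0, kThreads); the low bits hold it verbatim.
  b.store(num_threads, std::memory_order_relaxed);
}

void barrier_wait(std::atomic<uint32_t>& b) {
  uint32_t old = b.fetch_add(kThreads, std::memory_order_relaxed);
  uint32_t old_epoch = barrier_epoch(old);
  if (barrier_epoch(old + kThreads) != old_epoch) {
    // Last arrival of this generation: release everyone (FutexWake analogue).
    b.notify_all();
    return;
  }
  for (;;) {
    uint32_t cur = b.load(std::memory_order_relaxed);
    if (barrier_epoch(cur) != old_epoch)
      return;      // a new generation has started; we have been released
    b.wait(cur);   // block until the word changes (FutexWait analogue)
  }
}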
-extern "C" SANITIZER_INTERFACE_ATTRIBUTE -void __tsan_testonly_barrier_init(u64 *barrier, u32 count) { - if (count >= (1 << 8)) { - Printf("barrier_init: count is too large (%d)\n", count); - Die(); +constexpr u32 kBarrierThreadBits = 10; +constexpr u32 kBarrierThreads = 1 << kBarrierThreadBits; + +extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __tsan_testonly_barrier_init( + atomic_uint32_t *barrier, u32 num_threads) { + if (num_threads >= kBarrierThreads) { + Printf("barrier_init: count is too large (%d)\n", num_threads); + Die(); } - // 8 lsb is thread count, the remaining are count of entered threads. - *barrier = count; + // kBarrierThreadBits lsb is thread count, + // the remaining are count of entered threads. + atomic_store(barrier, num_threads, memory_order_relaxed); } -extern "C" SANITIZER_INTERFACE_ATTRIBUTE -void __tsan_testonly_barrier_wait(u64 *barrier) { - unsigned old = __atomic_fetch_add(barrier, 1 << 8, __ATOMIC_RELAXED); - unsigned old_epoch = (old >> 8) / (old & 0xff); +static u32 barrier_epoch(u32 value) { + return (value >> kBarrierThreadBits) / (value & (kBarrierThreads - 1)); +} + +extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __tsan_testonly_barrier_wait( + atomic_uint32_t *barrier) { + u32 old = atomic_fetch_add(barrier, kBarrierThreads, memory_order_relaxed); + u32 old_epoch = barrier_epoch(old); + if (barrier_epoch(old + kBarrierThreads) != old_epoch) { + FutexWake(barrier, (1 << 30)); + return; + } for (;;) { - unsigned cur = __atomic_load_n(barrier, __ATOMIC_RELAXED); - unsigned cur_epoch = (cur >> 8) / (cur & 0xff); - if (cur_epoch != old_epoch) + u32 cur = atomic_load(barrier, memory_order_relaxed); + if (barrier_epoch(cur) != old_epoch) return; - internal_sched_yield(); + FutexWait(barrier, cur); } } diff --git a/libsanitizer/tsan/tsan_interface.cpp b/libsanitizer/tsan/tsan_interface.cpp index 9bd0e85..704c06a 100644 --- a/libsanitizer/tsan/tsan_interface.cpp +++ b/libsanitizer/tsan/tsan_interface.cpp @@ -30,99 +30,51 @@ void __tsan_flush_memory() { } void __tsan_read16(void *addr) { - MemoryRead(cur_thread(), CALLERPC, (uptr)addr, kSizeLog8); - MemoryRead(cur_thread(), CALLERPC, (uptr)addr + 8, kSizeLog8); + uptr pc = CALLERPC; + ThreadState *thr = cur_thread(); + MemoryAccess(thr, pc, (uptr)addr, 8, kAccessRead); + MemoryAccess(thr, pc, (uptr)addr + 8, 8, kAccessRead); } void __tsan_write16(void *addr) { - MemoryWrite(cur_thread(), CALLERPC, (uptr)addr, kSizeLog8); - MemoryWrite(cur_thread(), CALLERPC, (uptr)addr + 8, kSizeLog8); + uptr pc = CALLERPC; + ThreadState *thr = cur_thread(); + MemoryAccess(thr, pc, (uptr)addr, 8, kAccessWrite); + MemoryAccess(thr, pc, (uptr)addr + 8, 8, kAccessWrite); } void __tsan_read16_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog8); - MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr + 8, kSizeLog8); + uptr pc_no_pac = STRIP_PAC_PC(pc); + ThreadState *thr = cur_thread(); + MemoryAccess(thr, pc_no_pac, (uptr)addr, 8, kAccessRead); + MemoryAccess(thr, pc_no_pac, (uptr)addr + 8, 8, kAccessRead); } void __tsan_write16_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog8); - MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr + 8, kSizeLog8); + uptr pc_no_pac = STRIP_PAC_PC(pc); + ThreadState *thr = cur_thread(); + MemoryAccess(thr, pc_no_pac, (uptr)addr, 8, kAccessWrite); + MemoryAccess(thr, pc_no_pac, (uptr)addr + 8, 8, kAccessWrite); } // __tsan_unaligned_read/write calls are emitted by compiler. 
-void __tsan_unaligned_read2(const void *addr) { - UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 2, false, false); -} - -void __tsan_unaligned_read4(const void *addr) { - UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 4, false, false); -} - -void __tsan_unaligned_read8(const void *addr) { - UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 8, false, false); -} - void __tsan_unaligned_read16(const void *addr) { - UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 16, false, false); -} - -void __tsan_unaligned_write2(void *addr) { - UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 2, true, false); -} - -void __tsan_unaligned_write4(void *addr) { - UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 4, true, false); -} - -void __tsan_unaligned_write8(void *addr) { - UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 8, true, false); + uptr pc = CALLERPC; + ThreadState *thr = cur_thread(); + UnalignedMemoryAccess(thr, pc, (uptr)addr, 8, kAccessRead); + UnalignedMemoryAccess(thr, pc, (uptr)addr + 8, 8, kAccessRead); } void __tsan_unaligned_write16(void *addr) { - UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 16, true, false); + uptr pc = CALLERPC; + ThreadState *thr = cur_thread(); + UnalignedMemoryAccess(thr, pc, (uptr)addr, 8, kAccessWrite); + UnalignedMemoryAccess(thr, pc, (uptr)addr + 8, 8, kAccessWrite); } -// __sanitizer_unaligned_load/store are for user instrumentation. - extern "C" { SANITIZER_INTERFACE_ATTRIBUTE -u16 __sanitizer_unaligned_load16(const uu16 *addr) { - __tsan_unaligned_read2(addr); - return *addr; -} - -SANITIZER_INTERFACE_ATTRIBUTE -u32 __sanitizer_unaligned_load32(const uu32 *addr) { - __tsan_unaligned_read4(addr); - return *addr; -} - -SANITIZER_INTERFACE_ATTRIBUTE -u64 __sanitizer_unaligned_load64(const uu64 *addr) { - __tsan_unaligned_read8(addr); - return *addr; -} - -SANITIZER_INTERFACE_ATTRIBUTE -void __sanitizer_unaligned_store16(uu16 *addr, u16 v) { - __tsan_unaligned_write2(addr); - *addr = v; -} - -SANITIZER_INTERFACE_ATTRIBUTE -void __sanitizer_unaligned_store32(uu32 *addr, u32 v) { - __tsan_unaligned_write4(addr); - *addr = v; -} - -SANITIZER_INTERFACE_ATTRIBUTE -void __sanitizer_unaligned_store64(uu64 *addr, u64 v) { - __tsan_unaligned_write8(addr); - *addr = v; -} - -SANITIZER_INTERFACE_ATTRIBUTE void *__tsan_get_current_fiber() { return cur_thread(); } diff --git a/libsanitizer/tsan/tsan_interface.h b/libsanitizer/tsan/tsan_interface.h index 124aa2f..711f064 100644 --- a/libsanitizer/tsan/tsan_interface.h +++ b/libsanitizer/tsan/tsan_interface.h @@ -95,9 +95,9 @@ SANITIZER_INTERFACE_ATTRIBUTE void __tsan_write_range(void *addr, unsigned long size); SANITIZER_INTERFACE_ATTRIBUTE -void __tsan_read_range_pc(void *addr, unsigned long size, void *pc); // NOLINT +void __tsan_read_range_pc(void *addr, unsigned long size, void *pc); SANITIZER_INTERFACE_ATTRIBUTE -void __tsan_write_range_pc(void *addr, unsigned long size, void *pc); // NOLINT +void __tsan_write_range_pc(void *addr, unsigned long size, void *pc); // User may provide function that would be called right when TSan detects // an error. 
The argument 'report' is an opaque pointer that can be used to @@ -417,12 +417,6 @@ SANITIZER_INTERFACE_ATTRIBUTE void __tsan_go_atomic64_compare_exchange(ThreadState *thr, uptr cpc, uptr pc, u8 *a); -SANITIZER_INTERFACE_ATTRIBUTE -void __tsan_on_initialize(); - -SANITIZER_INTERFACE_ATTRIBUTE -int __tsan_on_finalize(int failed); - } // extern "C" } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_interface.inc b/libsanitizer/tsan/tsan_interface.inc new file mode 100644 index 0000000..0031800 --- /dev/null +++ b/libsanitizer/tsan/tsan_interface.inc @@ -0,0 +1,182 @@ +//===-- tsan_interface.inc --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of ThreadSanitizer (TSan), a race detector. +// +//===----------------------------------------------------------------------===// + +#include "sanitizer_common/sanitizer_ptrauth.h" +#include "tsan_interface.h" +#include "tsan_rtl.h" + +#define CALLERPC ((uptr)__builtin_return_address(0)) + +using namespace __tsan; + +void __tsan_read1(void *addr) { + MemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 1, kAccessRead); +} + +void __tsan_read2(void *addr) { + MemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 2, kAccessRead); +} + +void __tsan_read4(void *addr) { + MemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 4, kAccessRead); +} + +void __tsan_read8(void *addr) { + MemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 8, kAccessRead); +} + +void __tsan_write1(void *addr) { + MemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 1, kAccessWrite); +} + +void __tsan_write2(void *addr) { + MemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 2, kAccessWrite); +} + +void __tsan_write4(void *addr) { + MemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 4, kAccessWrite); +} + +void __tsan_write8(void *addr) { + MemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 8, kAccessWrite); +} + +void __tsan_read1_pc(void *addr, void *pc) { + MemoryAccess(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, 1, kAccessRead | kAccessExternalPC); +} + +void __tsan_read2_pc(void *addr, void *pc) { + MemoryAccess(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, 2, kAccessRead | kAccessExternalPC); +} + +void __tsan_read4_pc(void *addr, void *pc) { + MemoryAccess(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, 4, kAccessRead | kAccessExternalPC); +} + +void __tsan_read8_pc(void *addr, void *pc) { + MemoryAccess(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, 8, kAccessRead | kAccessExternalPC); +} + +void __tsan_write1_pc(void *addr, void *pc) { + MemoryAccess(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, 1, kAccessWrite | kAccessExternalPC); +} + +void __tsan_write2_pc(void *addr, void *pc) { + MemoryAccess(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, 2, kAccessWrite | kAccessExternalPC); +} + +void __tsan_write4_pc(void *addr, void *pc) { + MemoryAccess(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, 4, kAccessWrite | kAccessExternalPC); +} + +void __tsan_write8_pc(void *addr, void *pc) { + MemoryAccess(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, 8, kAccessWrite | kAccessExternalPC); +} + +ALWAYS_INLINE USED void __tsan_unaligned_read2(const void *addr) { + UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 2, kAccessRead); +} + +ALWAYS_INLINE USED void __tsan_unaligned_read4(const void 
*addr) { + UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 4, kAccessRead); +} + +ALWAYS_INLINE USED void __tsan_unaligned_read8(const void *addr) { + UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 8, kAccessRead); +} + +ALWAYS_INLINE USED void __tsan_unaligned_write2(void *addr) { + UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 2, kAccessWrite); +} + +ALWAYS_INLINE USED void __tsan_unaligned_write4(void *addr) { + UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 4, kAccessWrite); +} + +ALWAYS_INLINE USED void __tsan_unaligned_write8(void *addr) { + UnalignedMemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 8, kAccessWrite); +} + +extern "C" { +// __sanitizer_unaligned_load/store are for user instrumentation. +SANITIZER_INTERFACE_ATTRIBUTE +u16 __sanitizer_unaligned_load16(const uu16 *addr) { + __tsan_unaligned_read2(addr); + return *addr; +} + +SANITIZER_INTERFACE_ATTRIBUTE +u32 __sanitizer_unaligned_load32(const uu32 *addr) { + __tsan_unaligned_read4(addr); + return *addr; +} + +SANITIZER_INTERFACE_ATTRIBUTE +u64 __sanitizer_unaligned_load64(const uu64 *addr) { + __tsan_unaligned_read8(addr); + return *addr; +} + +SANITIZER_INTERFACE_ATTRIBUTE +void __sanitizer_unaligned_store16(uu16 *addr, u16 v) { + *addr = v; + __tsan_unaligned_write2(addr); +} + +SANITIZER_INTERFACE_ATTRIBUTE +void __sanitizer_unaligned_store32(uu32 *addr, u32 v) { + *addr = v; + __tsan_unaligned_write4(addr); +} + +SANITIZER_INTERFACE_ATTRIBUTE +void __sanitizer_unaligned_store64(uu64 *addr, u64 v) { + *addr = v; + __tsan_unaligned_write8(addr); +} +} + +void __tsan_vptr_update(void **vptr_p, void *new_val) { + if (*vptr_p == new_val) + return; + MemoryAccess(cur_thread(), CALLERPC, (uptr)vptr_p, sizeof(*vptr_p), + kAccessWrite | kAccessVptr); +} + +void __tsan_vptr_read(void **vptr_p) { + MemoryAccess(cur_thread(), CALLERPC, (uptr)vptr_p, sizeof(*vptr_p), + kAccessRead | kAccessVptr); +} + +void __tsan_func_entry(void *pc) { FuncEntry(cur_thread(), STRIP_PAC_PC(pc)); } + +void __tsan_func_exit() { FuncExit(cur_thread()); } + +void __tsan_ignore_thread_begin() { ThreadIgnoreBegin(cur_thread(), CALLERPC); } + +void __tsan_ignore_thread_end() { ThreadIgnoreEnd(cur_thread()); } + +void __tsan_read_range(void *addr, uptr size) { + MemoryAccessRange(cur_thread(), CALLERPC, (uptr)addr, size, false); +} + +void __tsan_write_range(void *addr, uptr size) { + MemoryAccessRange(cur_thread(), CALLERPC, (uptr)addr, size, true); +} + +void __tsan_read_range_pc(void *addr, uptr size, void *pc) { + MemoryAccessRange(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, size, false); +} + +void __tsan_write_range_pc(void *addr, uptr size, void *pc) { + MemoryAccessRange(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, size, true); +} diff --git a/libsanitizer/tsan/tsan_interface_ann.cpp b/libsanitizer/tsan/tsan_interface_ann.cpp index 175855f..6bd72e1 100644 --- a/libsanitizer/tsan/tsan_interface_ann.cpp +++ b/libsanitizer/tsan/tsan_interface_ann.cpp @@ -15,7 +15,6 @@ #include "sanitizer_common/sanitizer_stacktrace.h" #include "sanitizer_common/sanitizer_vector.h" #include "tsan_interface_ann.h" -#include "tsan_mutex.h" #include "tsan_report.h" #include "tsan_rtl.h" #include "tsan_mman.h" @@ -38,21 +37,20 @@ class ScopedAnnotation { ~ScopedAnnotation() { FuncExit(thr_); - CheckNoLocks(thr_); + CheckedMutex::CheckNoLocks(); } private: ThreadState *const thr_; }; -#define SCOPED_ANNOTATION_RET(typ, ret) \ - if (!flags()->enable_annotations) \ - return ret; \ - ThreadState *thr = cur_thread(); \ - const uptr 
caller_pc = (uptr)__builtin_return_address(0); \ - ScopedAnnotation sa(thr, __func__, caller_pc); \ - const uptr pc = StackTrace::GetCurrentPc(); \ - (void)pc; \ -/**/ +#define SCOPED_ANNOTATION_RET(typ, ret) \ + if (!flags()->enable_annotations) \ + return ret; \ + ThreadState *thr = cur_thread(); \ + const uptr caller_pc = (uptr)__builtin_return_address(0); \ + ScopedAnnotation sa(thr, __func__, caller_pc); \ + const uptr pc = StackTrace::GetCurrentPc(); \ + (void)pc; #define SCOPED_ANNOTATION(typ) SCOPED_ANNOTATION_RET(typ, ) @@ -72,7 +70,6 @@ struct ExpectRace { struct DynamicAnnContext { Mutex mtx; - ExpectRace expect; ExpectRace benign; DynamicAnnContext() : mtx(MutexTypeAnnotations) {} @@ -91,7 +88,7 @@ static void AddExpectRace(ExpectRace *list, return; } } - race = (ExpectRace*)internal_alloc(MBlockExpectRace, sizeof(ExpectRace)); + race = static_cast<ExpectRace *>(Alloc(sizeof(ExpectRace))); race->addr = addr; race->size = size; race->file = f; @@ -138,81 +135,12 @@ static void InitList(ExpectRace *list) { void InitializeDynamicAnnotations() { dyn_ann_ctx = new(dyn_ann_ctx_placeholder) DynamicAnnContext; - InitList(&dyn_ann_ctx->expect); InitList(&dyn_ann_ctx->benign); } bool IsExpectedReport(uptr addr, uptr size) { ReadLock lock(&dyn_ann_ctx->mtx); - if (CheckContains(&dyn_ann_ctx->expect, addr, size)) - return true; - if (CheckContains(&dyn_ann_ctx->benign, addr, size)) - return true; - return false; -} - -static void CollectMatchedBenignRaces(Vector<ExpectRace> *matched, - int *unique_count, int *hit_count, atomic_uintptr_t ExpectRace::*counter) { - ExpectRace *list = &dyn_ann_ctx->benign; - for (ExpectRace *race = list->next; race != list; race = race->next) { - (*unique_count)++; - const uptr cnt = atomic_load_relaxed(&(race->*counter)); - if (cnt == 0) - continue; - *hit_count += cnt; - uptr i = 0; - for (; i < matched->Size(); i++) { - ExpectRace *race0 = &(*matched)[i]; - if (race->line == race0->line - && internal_strcmp(race->file, race0->file) == 0 - && internal_strcmp(race->desc, race0->desc) == 0) { - atomic_fetch_add(&(race0->*counter), cnt, memory_order_relaxed); - break; - } - } - if (i == matched->Size()) - matched->PushBack(*race); - } -} - -void PrintMatchedBenignRaces() { - Lock lock(&dyn_ann_ctx->mtx); - int unique_count = 0; - int hit_count = 0; - int add_count = 0; - Vector<ExpectRace> hit_matched; - CollectMatchedBenignRaces(&hit_matched, &unique_count, &hit_count, - &ExpectRace::hitcount); - Vector<ExpectRace> add_matched; - CollectMatchedBenignRaces(&add_matched, &unique_count, &add_count, - &ExpectRace::addcount); - if (hit_matched.Size()) { - Printf("ThreadSanitizer: Matched %d \"benign\" races (pid=%d):\n", - hit_count, (int)internal_getpid()); - for (uptr i = 0; i < hit_matched.Size(); i++) { - Printf("%d %s:%d %s\n", - atomic_load_relaxed(&hit_matched[i].hitcount), - hit_matched[i].file, hit_matched[i].line, hit_matched[i].desc); - } - } - if (hit_matched.Size()) { - Printf("ThreadSanitizer: Annotated %d \"benign\" races, %d unique" - " (pid=%d):\n", - add_count, unique_count, (int)internal_getpid()); - for (uptr i = 0; i < add_matched.Size(); i++) { - Printf("%d %s:%d %s\n", - atomic_load_relaxed(&add_matched[i].addcount), - add_matched[i].file, add_matched[i].line, add_matched[i].desc); - } - } -} - -static void ReportMissedExpectedRace(ExpectRace *race) { - Printf("==================\n"); - Printf("WARNING: ThreadSanitizer: missed expected data race\n"); - Printf(" %s addr=%zx %s:%d\n", - race->desc, race->addr, race->file, race->line); - 
Printf("==================\n"); + return CheckContains(&dyn_ann_ctx->benign, addr, size); } } // namespace __tsan @@ -230,20 +158,16 @@ void INTERFACE_ATTRIBUTE AnnotateHappensAfter(char *f, int l, uptr addr) { } void INTERFACE_ATTRIBUTE AnnotateCondVarSignal(char *f, int l, uptr cv) { - SCOPED_ANNOTATION(AnnotateCondVarSignal); } void INTERFACE_ATTRIBUTE AnnotateCondVarSignalAll(char *f, int l, uptr cv) { - SCOPED_ANNOTATION(AnnotateCondVarSignalAll); } void INTERFACE_ATTRIBUTE AnnotateMutexIsNotPHB(char *f, int l, uptr mu) { - SCOPED_ANNOTATION(AnnotateMutexIsNotPHB); } void INTERFACE_ATTRIBUTE AnnotateCondVarWait(char *f, int l, uptr cv, uptr lock) { - SCOPED_ANNOTATION(AnnotateCondVarWait); } void INTERFACE_ATTRIBUTE AnnotateRWLockCreate(char *f, int l, uptr m) { @@ -280,86 +204,56 @@ void INTERFACE_ATTRIBUTE AnnotateRWLockReleased(char *f, int l, uptr m, } void INTERFACE_ATTRIBUTE AnnotateTraceMemory(char *f, int l, uptr mem) { - SCOPED_ANNOTATION(AnnotateTraceMemory); } void INTERFACE_ATTRIBUTE AnnotateFlushState(char *f, int l) { - SCOPED_ANNOTATION(AnnotateFlushState); } void INTERFACE_ATTRIBUTE AnnotateNewMemory(char *f, int l, uptr mem, uptr size) { - SCOPED_ANNOTATION(AnnotateNewMemory); } void INTERFACE_ATTRIBUTE AnnotateNoOp(char *f, int l, uptr mem) { - SCOPED_ANNOTATION(AnnotateNoOp); } void INTERFACE_ATTRIBUTE AnnotateFlushExpectedRaces(char *f, int l) { - SCOPED_ANNOTATION(AnnotateFlushExpectedRaces); - Lock lock(&dyn_ann_ctx->mtx); - while (dyn_ann_ctx->expect.next != &dyn_ann_ctx->expect) { - ExpectRace *race = dyn_ann_ctx->expect.next; - if (atomic_load_relaxed(&race->hitcount) == 0) { - ctx->nmissed_expected++; - ReportMissedExpectedRace(race); - } - race->prev->next = race->next; - race->next->prev = race->prev; - internal_free(race); - } } void INTERFACE_ATTRIBUTE AnnotateEnableRaceDetection( char *f, int l, int enable) { - SCOPED_ANNOTATION(AnnotateEnableRaceDetection); - // FIXME: Reconsider this functionality later. It may be irrelevant. } void INTERFACE_ATTRIBUTE AnnotateMutexIsUsedAsCondVar( char *f, int l, uptr mu) { - SCOPED_ANNOTATION(AnnotateMutexIsUsedAsCondVar); } void INTERFACE_ATTRIBUTE AnnotatePCQGet( char *f, int l, uptr pcq) { - SCOPED_ANNOTATION(AnnotatePCQGet); } void INTERFACE_ATTRIBUTE AnnotatePCQPut( char *f, int l, uptr pcq) { - SCOPED_ANNOTATION(AnnotatePCQPut); } void INTERFACE_ATTRIBUTE AnnotatePCQDestroy( char *f, int l, uptr pcq) { - SCOPED_ANNOTATION(AnnotatePCQDestroy); } void INTERFACE_ATTRIBUTE AnnotatePCQCreate( char *f, int l, uptr pcq) { - SCOPED_ANNOTATION(AnnotatePCQCreate); } void INTERFACE_ATTRIBUTE AnnotateExpectRace( char *f, int l, uptr mem, char *desc) { - SCOPED_ANNOTATION(AnnotateExpectRace); - Lock lock(&dyn_ann_ctx->mtx); - AddExpectRace(&dyn_ann_ctx->expect, - f, l, mem, 1, desc); - DPrintf("Add expected race: %s addr=%zx %s:%d\n", desc, mem, f, l); } -static void BenignRaceImpl( - char *f, int l, uptr mem, uptr size, char *desc) { +static void BenignRaceImpl(char *f, int l, uptr mem, uptr size, char *desc) { Lock lock(&dyn_ann_ctx->mtx); AddExpectRace(&dyn_ann_ctx->benign, f, l, mem, size, desc); DPrintf("Add benign race: %s addr=%zx %s:%d\n", desc, mem, f, l); } -// FIXME: Turn it off later. WTF is benign race?1?? Go talk to Hans Boehm. 
void INTERFACE_ATTRIBUTE AnnotateBenignRaceSized( char *f, int l, uptr mem, uptr size, char *desc) { SCOPED_ANNOTATION(AnnotateBenignRaceSized); @@ -379,7 +273,7 @@ void INTERFACE_ATTRIBUTE AnnotateIgnoreReadsBegin(char *f, int l) { void INTERFACE_ATTRIBUTE AnnotateIgnoreReadsEnd(char *f, int l) { SCOPED_ANNOTATION(AnnotateIgnoreReadsEnd); - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreEnd(thr); } void INTERFACE_ATTRIBUTE AnnotateIgnoreWritesBegin(char *f, int l) { @@ -389,7 +283,7 @@ void INTERFACE_ATTRIBUTE AnnotateIgnoreWritesBegin(char *f, int l) { void INTERFACE_ATTRIBUTE AnnotateIgnoreWritesEnd(char *f, int l) { SCOPED_ANNOTATION(AnnotateIgnoreWritesEnd); - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreEnd(thr); } void INTERFACE_ATTRIBUTE AnnotateIgnoreSyncBegin(char *f, int l) { @@ -399,17 +293,15 @@ void INTERFACE_ATTRIBUTE AnnotateIgnoreSyncBegin(char *f, int l) { void INTERFACE_ATTRIBUTE AnnotateIgnoreSyncEnd(char *f, int l) { SCOPED_ANNOTATION(AnnotateIgnoreSyncEnd); - ThreadIgnoreSyncEnd(thr, pc); + ThreadIgnoreSyncEnd(thr); } void INTERFACE_ATTRIBUTE AnnotatePublishMemoryRange( char *f, int l, uptr addr, uptr size) { - SCOPED_ANNOTATION(AnnotatePublishMemoryRange); } void INTERFACE_ATTRIBUTE AnnotateUnpublishMemoryRange( char *f, int l, uptr addr, uptr size) { - SCOPED_ANNOTATION(AnnotateUnpublishMemoryRange); } void INTERFACE_ATTRIBUTE AnnotateThreadName( @@ -422,11 +314,9 @@ void INTERFACE_ATTRIBUTE AnnotateThreadName( // WTFAnnotateHappensAfter(). Those are being used by Webkit to annotate // atomic operations, which should be handled by ThreadSanitizer correctly. void INTERFACE_ATTRIBUTE WTFAnnotateHappensBefore(char *f, int l, uptr addr) { - SCOPED_ANNOTATION(AnnotateHappensBefore); } void INTERFACE_ATTRIBUTE WTFAnnotateHappensAfter(char *f, int l, uptr addr) { - SCOPED_ANNOTATION(AnnotateHappensAfter); } void INTERFACE_ATTRIBUTE WTFAnnotateBenignRaceSized( @@ -478,15 +368,15 @@ void __tsan_mutex_pre_lock(void *m, unsigned flagz) { else MutexPreLock(thr, pc, (uptr)m); } - ThreadIgnoreBegin(thr, pc, /*save_stack=*/false); - ThreadIgnoreSyncBegin(thr, pc, /*save_stack=*/false); + ThreadIgnoreBegin(thr, 0); + ThreadIgnoreSyncBegin(thr, 0); } INTERFACE_ATTRIBUTE void __tsan_mutex_post_lock(void *m, unsigned flagz, int rec) { SCOPED_ANNOTATION(__tsan_mutex_post_lock); - ThreadIgnoreSyncEnd(thr, pc); - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreSyncEnd(thr); + ThreadIgnoreEnd(thr); if (!(flagz & MutexFlagTryLockFailed)) { if (flagz & MutexFlagReadLock) MutexPostReadLock(thr, pc, (uptr)m, flagz); @@ -505,44 +395,44 @@ int __tsan_mutex_pre_unlock(void *m, unsigned flagz) { } else { ret = MutexUnlock(thr, pc, (uptr)m, flagz); } - ThreadIgnoreBegin(thr, pc, /*save_stack=*/false); - ThreadIgnoreSyncBegin(thr, pc, /*save_stack=*/false); + ThreadIgnoreBegin(thr, 0); + ThreadIgnoreSyncBegin(thr, 0); return ret; } INTERFACE_ATTRIBUTE void __tsan_mutex_post_unlock(void *m, unsigned flagz) { SCOPED_ANNOTATION(__tsan_mutex_post_unlock); - ThreadIgnoreSyncEnd(thr, pc); - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreSyncEnd(thr); + ThreadIgnoreEnd(thr); } INTERFACE_ATTRIBUTE void __tsan_mutex_pre_signal(void *addr, unsigned flagz) { SCOPED_ANNOTATION(__tsan_mutex_pre_signal); - ThreadIgnoreBegin(thr, pc, /*save_stack=*/false); - ThreadIgnoreSyncBegin(thr, pc, /*save_stack=*/false); + ThreadIgnoreBegin(thr, 0); + ThreadIgnoreSyncBegin(thr, 0); } INTERFACE_ATTRIBUTE void __tsan_mutex_post_signal(void *addr, unsigned flagz) { SCOPED_ANNOTATION(__tsan_mutex_post_signal); - ThreadIgnoreSyncEnd(thr, pc); - 
ThreadIgnoreEnd(thr, pc); + ThreadIgnoreSyncEnd(thr); + ThreadIgnoreEnd(thr); } INTERFACE_ATTRIBUTE void __tsan_mutex_pre_divert(void *addr, unsigned flagz) { SCOPED_ANNOTATION(__tsan_mutex_pre_divert); // Exit from ignore region started in __tsan_mutex_pre_lock/unlock/signal. - ThreadIgnoreSyncEnd(thr, pc); - ThreadIgnoreEnd(thr, pc); + ThreadIgnoreSyncEnd(thr); + ThreadIgnoreEnd(thr); } INTERFACE_ATTRIBUTE void __tsan_mutex_post_divert(void *addr, unsigned flagz) { SCOPED_ANNOTATION(__tsan_mutex_post_divert); - ThreadIgnoreBegin(thr, pc, /*save_stack=*/false); - ThreadIgnoreSyncBegin(thr, pc, /*save_stack=*/false); + ThreadIgnoreBegin(thr, 0); + ThreadIgnoreSyncBegin(thr, 0); } } // extern "C" diff --git a/libsanitizer/tsan/tsan_interface_atomic.cpp b/libsanitizer/tsan/tsan_interface_atomic.cpp index 21fe4a1..24ba3bb 100644 --- a/libsanitizer/tsan/tsan_interface_atomic.cpp +++ b/libsanitizer/tsan/tsan_interface_atomic.cpp @@ -32,6 +32,7 @@ using namespace __tsan; static StaticSpinMutex mutex128; #endif +#if SANITIZER_DEBUG static bool IsLoadOrder(morder mo) { return mo == mo_relaxed || mo == mo_consume || mo == mo_acquire || mo == mo_seq_cst; @@ -40,6 +41,7 @@ static bool IsLoadOrder(morder mo) { static bool IsStoreOrder(morder mo) { return mo == mo_relaxed || mo == mo_release || mo == mo_seq_cst; } +#endif static bool IsReleaseOrder(morder mo) { return mo == mo_release || mo == mo_acq_rel || mo == mo_seq_cst; @@ -161,16 +163,16 @@ a128 func_cas(volatile a128 *v, a128 cmp, a128 xch) { } #endif -template<typename T> -static int SizeLog() { +template <typename T> +static int AccessSize() { if (sizeof(T) <= 1) - return kSizeLog1; + return 1; else if (sizeof(T) <= 2) - return kSizeLog2; + return 2; else if (sizeof(T) <= 4) - return kSizeLog4; + return 4; else - return kSizeLog8; + return 8; // For 16-byte atomics we also use 8-byte memory access, // this leads to false negatives only in very obscure cases. } @@ -202,7 +204,7 @@ static memory_order to_mo(morder mo) { case mo_acq_rel: return memory_order_acq_rel; case mo_seq_cst: return memory_order_seq_cst; } - CHECK(0); + DCHECK(0); return memory_order_seq_cst; } @@ -218,27 +220,28 @@ static a128 NoTsanAtomicLoad(const volatile a128 *a, morder mo) { } #endif -template<typename T> +template <typename T> static T AtomicLoad(ThreadState *thr, uptr pc, const volatile T *a, morder mo) { - CHECK(IsLoadOrder(mo)); + DCHECK(IsLoadOrder(mo)); // This fast-path is critical for performance. // Assume the access is atomic. if (!IsAcquireOrder(mo)) { - MemoryReadAtomic(thr, pc, (uptr)a, SizeLog<T>()); + MemoryAccess(thr, pc, (uptr)a, AccessSize<T>(), + kAccessRead | kAccessAtomic); return NoTsanAtomicLoad(a, mo); } // Don't create sync object if it does not exist yet. For example, an atomic // pointer is initialized to nullptr and then periodically acquire-loaded. T v = NoTsanAtomicLoad(a, mo); - SyncVar *s = ctx->metamap.GetIfExistsAndLock((uptr)a, false); + SyncVar *s = ctx->metamap.GetSyncIfExists((uptr)a); if (s) { + ReadLock l(&s->mtx); AcquireImpl(thr, pc, &s->clock); // Re-read under sync mutex because we need a consistent snapshot // of the value and the clock we acquire. 
v = NoTsanAtomicLoad(a, mo); - s->mtx.ReadUnlock(); } - MemoryReadAtomic(thr, pc, (uptr)a, SizeLog<T>()); + MemoryAccess(thr, pc, (uptr)a, AccessSize<T>(), kAccessRead | kAccessAtomic); return v; } @@ -254,11 +257,11 @@ static void NoTsanAtomicStore(volatile a128 *a, a128 v, morder mo) { } #endif -template<typename T> +template <typename T> static void AtomicStore(ThreadState *thr, uptr pc, volatile T *a, T v, - morder mo) { - CHECK(IsStoreOrder(mo)); - MemoryWriteAtomic(thr, pc, (uptr)a, SizeLog<T>()); + morder mo) { + DCHECK(IsStoreOrder(mo)); + MemoryAccess(thr, pc, (uptr)a, AccessSize<T>(), kAccessWrite | kAccessAtomic); // This fast-path is critical for performance. // Assume the access is atomic. // Strictly saying even relaxed store cuts off release sequence, @@ -268,35 +271,32 @@ static void AtomicStore(ThreadState *thr, uptr pc, volatile T *a, T v, return; } __sync_synchronize(); - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true); + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false); + Lock l(&s->mtx); thr->fast_state.IncrementEpoch(); // Can't increment epoch w/o writing to the trace as well. TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); ReleaseStoreImpl(thr, pc, &s->clock); NoTsanAtomicStore(a, v, mo); - s->mtx.Unlock(); } -template<typename T, T (*F)(volatile T *v, T op)> +template <typename T, T (*F)(volatile T *v, T op)> static T AtomicRMW(ThreadState *thr, uptr pc, volatile T *a, T v, morder mo) { - MemoryWriteAtomic(thr, pc, (uptr)a, SizeLog<T>()); - SyncVar *s = 0; - if (mo != mo_relaxed) { - s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true); - thr->fast_state.IncrementEpoch(); - // Can't increment epoch w/o writing to the trace as well. - TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - if (IsAcqRelOrder(mo)) - AcquireReleaseImpl(thr, pc, &s->clock); - else if (IsReleaseOrder(mo)) - ReleaseImpl(thr, pc, &s->clock); - else if (IsAcquireOrder(mo)) - AcquireImpl(thr, pc, &s->clock); - } - v = F(a, v); - if (s) - s->mtx.Unlock(); - return v; + MemoryAccess(thr, pc, (uptr)a, AccessSize<T>(), kAccessWrite | kAccessAtomic); + if (LIKELY(mo == mo_relaxed)) + return F(a, v); + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false); + Lock l(&s->mtx); + thr->fast_state.IncrementEpoch(); + // Can't increment epoch w/o writing to the trace as well. + TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); + if (IsAcqRelOrder(mo)) + AcquireReleaseImpl(thr, pc, &s->clock); + else if (IsReleaseOrder(mo)) + ReleaseImpl(thr, pc, &s->clock); + else if (IsAcquireOrder(mo)) + AcquireImpl(thr, pc, &s->clock); + return F(a, v); } template<typename T> @@ -399,21 +399,27 @@ static T NoTsanAtomicCAS(volatile T *a, T c, T v, morder mo, morder fmo) { return c; } -template<typename T> -static bool AtomicCAS(ThreadState *thr, uptr pc, - volatile T *a, T *c, T v, morder mo, morder fmo) { +template <typename T> +static bool AtomicCAS(ThreadState *thr, uptr pc, volatile T *a, T *c, T v, + morder mo, morder fmo) { // 31.7.2.18: "The failure argument shall not be memory_order_release // nor memory_order_acq_rel". LLVM (2021-05) fallbacks to Monotonic // (mo_relaxed) when those are used. 
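The AtomicLoad/AtomicStore/AtomicRMW rewrites above share one shape: relaxed (non-synchronizing) orders return through a fast path that never looks up per-address sync metadata, and only ordering accesses pay for the sync-object lookup and clock update. The following standalone sketch shows that shape only; LookupClock/AcquireClock/ReleaseClock are hypothetical stand-ins for the runtime's metamap and vector-clock machinery, stubbed out so the example compiles.

#include <atomic>
#include <cstdint>

struct SyncClock {};                    // opaque stand-in for SyncVar/clock
static SyncClock g_sync_stub;
static SyncClock *LookupClock(uintptr_t) { return &g_sync_stub; }  // stub
static void AcquireClock(SyncClock *) {}                           // stub
static void ReleaseClock(SyncClock *) {}                           // stub

uint64_t FetchAddSketch(std::atomic<uint64_t> *a, uint64_t v,
                        std::memory_order mo) {
  if (mo == std::memory_order_relaxed)
    return a->fetch_add(v, mo);         // fast path: no metadata work at all
  SyncClock *s = LookupClock(reinterpret_cast<uintptr_t>(a));
  bool acquire = mo == std::memory_order_acquire ||
                 mo == std::memory_order_acq_rel ||
                 mo == std::memory_order_seq_cst;
  bool release = mo == std::memory_order_release ||
                 mo == std::memory_order_acq_rel ||
                 mo == std::memory_order_seq_cst;
  if (acquire) AcquireClock(s);         // record happens-before edges in the
  if (release) ReleaseClock(s);         // directions the order asks for
  return a->fetch_add(v, mo);
}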
- CHECK(IsLoadOrder(fmo)); - - MemoryWriteAtomic(thr, pc, (uptr)a, SizeLog<T>()); - SyncVar *s = 0; - bool write_lock = IsReleaseOrder(mo); - - if (mo != mo_relaxed || fmo != mo_relaxed) - s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, write_lock); + DCHECK(IsLoadOrder(fmo)); + + MemoryAccess(thr, pc, (uptr)a, AccessSize<T>(), kAccessWrite | kAccessAtomic); + if (LIKELY(mo == mo_relaxed && fmo == mo_relaxed)) { + T cc = *c; + T pr = func_cas(a, cc, v); + if (pr == cc) + return true; + *c = pr; + return false; + } + bool release = IsReleaseOrder(mo); + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false); + RWLock l(&s->mtx, release); T cc = *c; T pr = func_cas(a, cc, v); bool success = pr == cc; @@ -421,25 +427,16 @@ static bool AtomicCAS(ThreadState *thr, uptr pc, *c = pr; mo = fmo; } + thr->fast_state.IncrementEpoch(); + // Can't increment epoch w/o writing to the trace as well. + TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - if (s) { - thr->fast_state.IncrementEpoch(); - // Can't increment epoch w/o writing to the trace as well. - TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - - if (success && IsAcqRelOrder(mo)) - AcquireReleaseImpl(thr, pc, &s->clock); - else if (success && IsReleaseOrder(mo)) - ReleaseImpl(thr, pc, &s->clock); - else if (IsAcquireOrder(mo)) - AcquireImpl(thr, pc, &s->clock); - - if (write_lock) - s->mtx.Unlock(); - else - s->mtx.ReadUnlock(); - } - + if (success && IsAcqRelOrder(mo)) + AcquireReleaseImpl(thr, pc, &s->clock); + else if (success && IsReleaseOrder(mo)) + ReleaseImpl(thr, pc, &s->clock); + else if (IsAcquireOrder(mo)) + AcquireImpl(thr, pc, &s->clock); return success; } @@ -483,380 +480,356 @@ static morder convert_morder(morder mo) { return (morder)(mo & 0x7fff); } -#define SCOPED_ATOMIC(func, ...) \ - ThreadState *const thr = cur_thread(); \ - if (UNLIKELY(thr->ignore_sync || thr->ignore_interceptors)) { \ - ProcessPendingSignals(thr); \ - return NoTsanAtomic##func(__VA_ARGS__); \ - } \ - const uptr callpc = (uptr)__builtin_return_address(0); \ - uptr pc = StackTrace::GetCurrentPc(); \ - mo = convert_morder(mo); \ - ScopedAtomic sa(thr, callpc, a, mo, __func__); \ - return Atomic##func(thr, pc, __VA_ARGS__); \ -/**/ - -class ScopedAtomic { - public: - ScopedAtomic(ThreadState *thr, uptr pc, const volatile void *a, - morder mo, const char *func) - : thr_(thr) { - FuncEntry(thr_, pc); - DPrintf("#%d: %s(%p, %d)\n", thr_->tid, func, a, mo); - } - ~ScopedAtomic() { - ProcessPendingSignals(thr_); - FuncExit(thr_); - } - private: - ThreadState *thr_; -}; +# define ATOMIC_IMPL(func, ...) 
\ + ThreadState *const thr = cur_thread(); \ + ProcessPendingSignals(thr); \ + if (UNLIKELY(thr->ignore_sync || thr->ignore_interceptors)) \ + return NoTsanAtomic##func(__VA_ARGS__); \ + mo = convert_morder(mo); \ + return Atomic##func(thr, GET_CALLER_PC(), __VA_ARGS__); extern "C" { SANITIZER_INTERFACE_ATTRIBUTE a8 __tsan_atomic8_load(const volatile a8 *a, morder mo) { - SCOPED_ATOMIC(Load, a, mo); + ATOMIC_IMPL(Load, a, mo); } SANITIZER_INTERFACE_ATTRIBUTE a16 __tsan_atomic16_load(const volatile a16 *a, morder mo) { - SCOPED_ATOMIC(Load, a, mo); + ATOMIC_IMPL(Load, a, mo); } SANITIZER_INTERFACE_ATTRIBUTE a32 __tsan_atomic32_load(const volatile a32 *a, morder mo) { - SCOPED_ATOMIC(Load, a, mo); + ATOMIC_IMPL(Load, a, mo); } SANITIZER_INTERFACE_ATTRIBUTE a64 __tsan_atomic64_load(const volatile a64 *a, morder mo) { - SCOPED_ATOMIC(Load, a, mo); + ATOMIC_IMPL(Load, a, mo); } #if __TSAN_HAS_INT128 SANITIZER_INTERFACE_ATTRIBUTE a128 __tsan_atomic128_load(const volatile a128 *a, morder mo) { - SCOPED_ATOMIC(Load, a, mo); + ATOMIC_IMPL(Load, a, mo); } #endif SANITIZER_INTERFACE_ATTRIBUTE void __tsan_atomic8_store(volatile a8 *a, a8 v, morder mo) { - SCOPED_ATOMIC(Store, a, v, mo); + ATOMIC_IMPL(Store, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE void __tsan_atomic16_store(volatile a16 *a, a16 v, morder mo) { - SCOPED_ATOMIC(Store, a, v, mo); + ATOMIC_IMPL(Store, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE void __tsan_atomic32_store(volatile a32 *a, a32 v, morder mo) { - SCOPED_ATOMIC(Store, a, v, mo); + ATOMIC_IMPL(Store, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE void __tsan_atomic64_store(volatile a64 *a, a64 v, morder mo) { - SCOPED_ATOMIC(Store, a, v, mo); + ATOMIC_IMPL(Store, a, v, mo); } #if __TSAN_HAS_INT128 SANITIZER_INTERFACE_ATTRIBUTE void __tsan_atomic128_store(volatile a128 *a, a128 v, morder mo) { - SCOPED_ATOMIC(Store, a, v, mo); + ATOMIC_IMPL(Store, a, v, mo); } #endif SANITIZER_INTERFACE_ATTRIBUTE a8 __tsan_atomic8_exchange(volatile a8 *a, a8 v, morder mo) { - SCOPED_ATOMIC(Exchange, a, v, mo); + ATOMIC_IMPL(Exchange, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a16 __tsan_atomic16_exchange(volatile a16 *a, a16 v, morder mo) { - SCOPED_ATOMIC(Exchange, a, v, mo); + ATOMIC_IMPL(Exchange, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a32 __tsan_atomic32_exchange(volatile a32 *a, a32 v, morder mo) { - SCOPED_ATOMIC(Exchange, a, v, mo); + ATOMIC_IMPL(Exchange, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a64 __tsan_atomic64_exchange(volatile a64 *a, a64 v, morder mo) { - SCOPED_ATOMIC(Exchange, a, v, mo); + ATOMIC_IMPL(Exchange, a, v, mo); } #if __TSAN_HAS_INT128 SANITIZER_INTERFACE_ATTRIBUTE a128 __tsan_atomic128_exchange(volatile a128 *a, a128 v, morder mo) { - SCOPED_ATOMIC(Exchange, a, v, mo); + ATOMIC_IMPL(Exchange, a, v, mo); } #endif SANITIZER_INTERFACE_ATTRIBUTE a8 __tsan_atomic8_fetch_add(volatile a8 *a, a8 v, morder mo) { - SCOPED_ATOMIC(FetchAdd, a, v, mo); + ATOMIC_IMPL(FetchAdd, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a16 __tsan_atomic16_fetch_add(volatile a16 *a, a16 v, morder mo) { - SCOPED_ATOMIC(FetchAdd, a, v, mo); + ATOMIC_IMPL(FetchAdd, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a32 __tsan_atomic32_fetch_add(volatile a32 *a, a32 v, morder mo) { - SCOPED_ATOMIC(FetchAdd, a, v, mo); + ATOMIC_IMPL(FetchAdd, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a64 __tsan_atomic64_fetch_add(volatile a64 *a, a64 v, morder mo) { - SCOPED_ATOMIC(FetchAdd, a, v, mo); + ATOMIC_IMPL(FetchAdd, a, v, mo); } #if __TSAN_HAS_INT128 SANITIZER_INTERFACE_ATTRIBUTE a128 
__tsan_atomic128_fetch_add(volatile a128 *a, a128 v, morder mo) { - SCOPED_ATOMIC(FetchAdd, a, v, mo); + ATOMIC_IMPL(FetchAdd, a, v, mo); } #endif SANITIZER_INTERFACE_ATTRIBUTE a8 __tsan_atomic8_fetch_sub(volatile a8 *a, a8 v, morder mo) { - SCOPED_ATOMIC(FetchSub, a, v, mo); + ATOMIC_IMPL(FetchSub, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a16 __tsan_atomic16_fetch_sub(volatile a16 *a, a16 v, morder mo) { - SCOPED_ATOMIC(FetchSub, a, v, mo); + ATOMIC_IMPL(FetchSub, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a32 __tsan_atomic32_fetch_sub(volatile a32 *a, a32 v, morder mo) { - SCOPED_ATOMIC(FetchSub, a, v, mo); + ATOMIC_IMPL(FetchSub, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a64 __tsan_atomic64_fetch_sub(volatile a64 *a, a64 v, morder mo) { - SCOPED_ATOMIC(FetchSub, a, v, mo); + ATOMIC_IMPL(FetchSub, a, v, mo); } #if __TSAN_HAS_INT128 SANITIZER_INTERFACE_ATTRIBUTE a128 __tsan_atomic128_fetch_sub(volatile a128 *a, a128 v, morder mo) { - SCOPED_ATOMIC(FetchSub, a, v, mo); + ATOMIC_IMPL(FetchSub, a, v, mo); } #endif SANITIZER_INTERFACE_ATTRIBUTE a8 __tsan_atomic8_fetch_and(volatile a8 *a, a8 v, morder mo) { - SCOPED_ATOMIC(FetchAnd, a, v, mo); + ATOMIC_IMPL(FetchAnd, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a16 __tsan_atomic16_fetch_and(volatile a16 *a, a16 v, morder mo) { - SCOPED_ATOMIC(FetchAnd, a, v, mo); + ATOMIC_IMPL(FetchAnd, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a32 __tsan_atomic32_fetch_and(volatile a32 *a, a32 v, morder mo) { - SCOPED_ATOMIC(FetchAnd, a, v, mo); + ATOMIC_IMPL(FetchAnd, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a64 __tsan_atomic64_fetch_and(volatile a64 *a, a64 v, morder mo) { - SCOPED_ATOMIC(FetchAnd, a, v, mo); + ATOMIC_IMPL(FetchAnd, a, v, mo); } #if __TSAN_HAS_INT128 SANITIZER_INTERFACE_ATTRIBUTE a128 __tsan_atomic128_fetch_and(volatile a128 *a, a128 v, morder mo) { - SCOPED_ATOMIC(FetchAnd, a, v, mo); + ATOMIC_IMPL(FetchAnd, a, v, mo); } #endif SANITIZER_INTERFACE_ATTRIBUTE a8 __tsan_atomic8_fetch_or(volatile a8 *a, a8 v, morder mo) { - SCOPED_ATOMIC(FetchOr, a, v, mo); + ATOMIC_IMPL(FetchOr, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a16 __tsan_atomic16_fetch_or(volatile a16 *a, a16 v, morder mo) { - SCOPED_ATOMIC(FetchOr, a, v, mo); + ATOMIC_IMPL(FetchOr, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a32 __tsan_atomic32_fetch_or(volatile a32 *a, a32 v, morder mo) { - SCOPED_ATOMIC(FetchOr, a, v, mo); + ATOMIC_IMPL(FetchOr, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a64 __tsan_atomic64_fetch_or(volatile a64 *a, a64 v, morder mo) { - SCOPED_ATOMIC(FetchOr, a, v, mo); + ATOMIC_IMPL(FetchOr, a, v, mo); } #if __TSAN_HAS_INT128 SANITIZER_INTERFACE_ATTRIBUTE a128 __tsan_atomic128_fetch_or(volatile a128 *a, a128 v, morder mo) { - SCOPED_ATOMIC(FetchOr, a, v, mo); + ATOMIC_IMPL(FetchOr, a, v, mo); } #endif SANITIZER_INTERFACE_ATTRIBUTE a8 __tsan_atomic8_fetch_xor(volatile a8 *a, a8 v, morder mo) { - SCOPED_ATOMIC(FetchXor, a, v, mo); + ATOMIC_IMPL(FetchXor, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a16 __tsan_atomic16_fetch_xor(volatile a16 *a, a16 v, morder mo) { - SCOPED_ATOMIC(FetchXor, a, v, mo); + ATOMIC_IMPL(FetchXor, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a32 __tsan_atomic32_fetch_xor(volatile a32 *a, a32 v, morder mo) { - SCOPED_ATOMIC(FetchXor, a, v, mo); + ATOMIC_IMPL(FetchXor, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a64 __tsan_atomic64_fetch_xor(volatile a64 *a, a64 v, morder mo) { - SCOPED_ATOMIC(FetchXor, a, v, mo); + ATOMIC_IMPL(FetchXor, a, v, mo); } #if __TSAN_HAS_INT128 SANITIZER_INTERFACE_ATTRIBUTE a128 
__tsan_atomic128_fetch_xor(volatile a128 *a, a128 v, morder mo) { - SCOPED_ATOMIC(FetchXor, a, v, mo); + ATOMIC_IMPL(FetchXor, a, v, mo); } #endif SANITIZER_INTERFACE_ATTRIBUTE a8 __tsan_atomic8_fetch_nand(volatile a8 *a, a8 v, morder mo) { - SCOPED_ATOMIC(FetchNand, a, v, mo); + ATOMIC_IMPL(FetchNand, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a16 __tsan_atomic16_fetch_nand(volatile a16 *a, a16 v, morder mo) { - SCOPED_ATOMIC(FetchNand, a, v, mo); + ATOMIC_IMPL(FetchNand, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a32 __tsan_atomic32_fetch_nand(volatile a32 *a, a32 v, morder mo) { - SCOPED_ATOMIC(FetchNand, a, v, mo); + ATOMIC_IMPL(FetchNand, a, v, mo); } SANITIZER_INTERFACE_ATTRIBUTE a64 __tsan_atomic64_fetch_nand(volatile a64 *a, a64 v, morder mo) { - SCOPED_ATOMIC(FetchNand, a, v, mo); + ATOMIC_IMPL(FetchNand, a, v, mo); } #if __TSAN_HAS_INT128 SANITIZER_INTERFACE_ATTRIBUTE a128 __tsan_atomic128_fetch_nand(volatile a128 *a, a128 v, morder mo) { - SCOPED_ATOMIC(FetchNand, a, v, mo); + ATOMIC_IMPL(FetchNand, a, v, mo); } #endif SANITIZER_INTERFACE_ATTRIBUTE int __tsan_atomic8_compare_exchange_strong(volatile a8 *a, a8 *c, a8 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } SANITIZER_INTERFACE_ATTRIBUTE int __tsan_atomic16_compare_exchange_strong(volatile a16 *a, a16 *c, a16 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } SANITIZER_INTERFACE_ATTRIBUTE int __tsan_atomic32_compare_exchange_strong(volatile a32 *a, a32 *c, a32 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } SANITIZER_INTERFACE_ATTRIBUTE int __tsan_atomic64_compare_exchange_strong(volatile a64 *a, a64 *c, a64 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } #if __TSAN_HAS_INT128 SANITIZER_INTERFACE_ATTRIBUTE int __tsan_atomic128_compare_exchange_strong(volatile a128 *a, a128 *c, a128 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } #endif SANITIZER_INTERFACE_ATTRIBUTE int __tsan_atomic8_compare_exchange_weak(volatile a8 *a, a8 *c, a8 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } SANITIZER_INTERFACE_ATTRIBUTE int __tsan_atomic16_compare_exchange_weak(volatile a16 *a, a16 *c, a16 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } SANITIZER_INTERFACE_ATTRIBUTE int __tsan_atomic32_compare_exchange_weak(volatile a32 *a, a32 *c, a32 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } SANITIZER_INTERFACE_ATTRIBUTE int __tsan_atomic64_compare_exchange_weak(volatile a64 *a, a64 *c, a64 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } #if __TSAN_HAS_INT128 SANITIZER_INTERFACE_ATTRIBUTE int __tsan_atomic128_compare_exchange_weak(volatile a128 *a, a128 *c, a128 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } #endif SANITIZER_INTERFACE_ATTRIBUTE a8 __tsan_atomic8_compare_exchange_val(volatile a8 *a, a8 c, a8 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } SANITIZER_INTERFACE_ATTRIBUTE a16 __tsan_atomic16_compare_exchange_val(volatile a16 *a, a16 c, a16 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, 
a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } SANITIZER_INTERFACE_ATTRIBUTE a32 __tsan_atomic32_compare_exchange_val(volatile a32 *a, a32 c, a32 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } SANITIZER_INTERFACE_ATTRIBUTE a64 __tsan_atomic64_compare_exchange_val(volatile a64 *a, a64 c, a64 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } #if __TSAN_HAS_INT128 SANITIZER_INTERFACE_ATTRIBUTE a128 __tsan_atomic128_compare_exchange_val(volatile a128 *a, a128 c, a128 v, morder mo, morder fmo) { - SCOPED_ATOMIC(CAS, a, c, v, mo, fmo); + ATOMIC_IMPL(CAS, a, c, v, mo, fmo); } #endif SANITIZER_INTERFACE_ATTRIBUTE -void __tsan_atomic_thread_fence(morder mo) { - char* a = 0; - SCOPED_ATOMIC(Fence, mo); -} +void __tsan_atomic_thread_fence(morder mo) { ATOMIC_IMPL(Fence, mo); } SANITIZER_INTERFACE_ATTRIBUTE void __tsan_atomic_signal_fence(morder mo) { @@ -867,25 +840,23 @@ void __tsan_atomic_signal_fence(morder mo) { // Go -#define ATOMIC(func, ...) \ - if (thr->ignore_sync) { \ - NoTsanAtomic##func(__VA_ARGS__); \ - } else { \ - FuncEntry(thr, cpc); \ +# define ATOMIC(func, ...) \ + if (thr->ignore_sync) { \ + NoTsanAtomic##func(__VA_ARGS__); \ + } else { \ + FuncEntry(thr, cpc); \ Atomic##func(thr, pc, __VA_ARGS__); \ - FuncExit(thr); \ - } \ -/**/ - -#define ATOMIC_RET(func, ret, ...) \ - if (thr->ignore_sync) { \ - (ret) = NoTsanAtomic##func(__VA_ARGS__); \ - } else { \ - FuncEntry(thr, cpc); \ + FuncExit(thr); \ + } + +# define ATOMIC_RET(func, ret, ...) \ + if (thr->ignore_sync) { \ + (ret) = NoTsanAtomic##func(__VA_ARGS__); \ + } else { \ + FuncEntry(thr, cpc); \ (ret) = Atomic##func(thr, pc, __VA_ARGS__); \ - FuncExit(thr); \ - } \ -/**/ + FuncExit(thr); \ + } extern "C" { SANITIZER_INTERFACE_ATTRIBUTE diff --git a/libsanitizer/tsan/tsan_interface_inl.h b/libsanitizer/tsan/tsan_interface_inl.h deleted file mode 100644 index 5e77d4d..0000000 --- a/libsanitizer/tsan/tsan_interface_inl.h +++ /dev/null @@ -1,133 +0,0 @@ -//===-- tsan_interface_inl.h ------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of ThreadSanitizer (TSan), a race detector. 
-// -//===----------------------------------------------------------------------===// - -#include "tsan_interface.h" -#include "tsan_rtl.h" -#include "sanitizer_common/sanitizer_ptrauth.h" - -#define CALLERPC ((uptr)__builtin_return_address(0)) - -using namespace __tsan; - -void __tsan_read1(void *addr) { - MemoryRead(cur_thread(), CALLERPC, (uptr)addr, kSizeLog1); -} - -void __tsan_read2(void *addr) { - MemoryRead(cur_thread(), CALLERPC, (uptr)addr, kSizeLog2); -} - -void __tsan_read4(void *addr) { - MemoryRead(cur_thread(), CALLERPC, (uptr)addr, kSizeLog4); -} - -void __tsan_read8(void *addr) { - MemoryRead(cur_thread(), CALLERPC, (uptr)addr, kSizeLog8); -} - -void __tsan_write1(void *addr) { - MemoryWrite(cur_thread(), CALLERPC, (uptr)addr, kSizeLog1); -} - -void __tsan_write2(void *addr) { - MemoryWrite(cur_thread(), CALLERPC, (uptr)addr, kSizeLog2); -} - -void __tsan_write4(void *addr) { - MemoryWrite(cur_thread(), CALLERPC, (uptr)addr, kSizeLog4); -} - -void __tsan_write8(void *addr) { - MemoryWrite(cur_thread(), CALLERPC, (uptr)addr, kSizeLog8); -} - -void __tsan_read1_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog1); -} - -void __tsan_read2_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog2); -} - -void __tsan_read4_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog4); -} - -void __tsan_read8_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog8); -} - -void __tsan_write1_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog1); -} - -void __tsan_write2_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog2); -} - -void __tsan_write4_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog4); -} - -void __tsan_write8_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog8); -} - -void __tsan_vptr_update(void **vptr_p, void *new_val) { - CHECK_EQ(sizeof(vptr_p), 8); - if (*vptr_p != new_val) { - ThreadState *thr = cur_thread(); - thr->is_vptr_access = true; - MemoryWrite(thr, CALLERPC, (uptr)vptr_p, kSizeLog8); - thr->is_vptr_access = false; - } -} - -void __tsan_vptr_read(void **vptr_p) { - CHECK_EQ(sizeof(vptr_p), 8); - ThreadState *thr = cur_thread(); - thr->is_vptr_access = true; - MemoryRead(thr, CALLERPC, (uptr)vptr_p, kSizeLog8); - thr->is_vptr_access = false; -} - -void __tsan_func_entry(void *pc) { - FuncEntry(cur_thread(), STRIP_PAC_PC(pc)); -} - -void __tsan_func_exit() { - FuncExit(cur_thread()); -} - -void __tsan_ignore_thread_begin() { - ThreadIgnoreBegin(cur_thread(), CALLERPC); -} - -void __tsan_ignore_thread_end() { - ThreadIgnoreEnd(cur_thread(), CALLERPC); -} - -void __tsan_read_range(void *addr, uptr size) { - MemoryAccessRange(cur_thread(), CALLERPC, (uptr)addr, size, false); -} - -void __tsan_write_range(void *addr, uptr size) { - MemoryAccessRange(cur_thread(), CALLERPC, (uptr)addr, size, true); -} - -void __tsan_read_range_pc(void *addr, uptr size, void *pc) { - MemoryAccessRange(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, size, false); -} - -void __tsan_write_range_pc(void *addr, uptr size, void *pc) { - MemoryAccessRange(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, size, true); -} diff --git a/libsanitizer/tsan/tsan_interface_java.cpp b/libsanitizer/tsan/tsan_interface_java.cpp index 081c6ff..c090c1f 100644 --- 
a/libsanitizer/tsan/tsan_interface_java.cpp +++ b/libsanitizer/tsan/tsan_interface_java.cpp @@ -12,7 +12,6 @@ #include "tsan_interface_java.h" #include "tsan_rtl.h" -#include "tsan_mutex.h" #include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_placement_new.h" @@ -35,52 +34,49 @@ struct JavaContext { } }; -class ScopedJavaFunc { - public: - ScopedJavaFunc(ThreadState *thr, uptr pc) - : thr_(thr) { - Initialize(thr_); - FuncEntry(thr, pc); - } - - ~ScopedJavaFunc() { - FuncExit(thr_); - // FIXME(dvyukov): process pending signals. - } - - private: - ThreadState *thr_; -}; - static u64 jctx_buf[sizeof(JavaContext) / sizeof(u64) + 1]; static JavaContext *jctx; +MBlock *JavaHeapBlock(uptr addr, uptr *start) { + if (!jctx || addr < jctx->heap_begin || + addr >= jctx->heap_begin + jctx->heap_size) + return nullptr; + for (uptr p = RoundDown(addr, kMetaShadowCell); p >= jctx->heap_begin; + p -= kMetaShadowCell) { + MBlock *b = ctx->metamap.GetBlock(p); + if (!b) + continue; + if (p + b->siz <= addr) + return nullptr; + *start = p; + return b; + } + return nullptr; +} + } // namespace __tsan -#define SCOPED_JAVA_FUNC(func) \ +#define JAVA_FUNC_ENTER(func) \ ThreadState *thr = cur_thread(); \ - const uptr caller_pc = GET_CALLER_PC(); \ - const uptr pc = StackTrace::GetCurrentPc(); \ - (void)pc; \ - ScopedJavaFunc scoped(thr, caller_pc); \ -/**/ + (void)thr; void __tsan_java_init(jptr heap_begin, jptr heap_size) { - SCOPED_JAVA_FUNC(__tsan_java_init); - DPrintf("#%d: java_init(%p, %p)\n", thr->tid, heap_begin, heap_size); - CHECK_EQ(jctx, 0); - CHECK_GT(heap_begin, 0); - CHECK_GT(heap_size, 0); - CHECK_EQ(heap_begin % kHeapAlignment, 0); - CHECK_EQ(heap_size % kHeapAlignment, 0); - CHECK_LT(heap_begin, heap_begin + heap_size); + JAVA_FUNC_ENTER(__tsan_java_init); + Initialize(thr); + DPrintf("#%d: java_init(0x%zx, 0x%zx)\n", thr->tid, heap_begin, heap_size); + DCHECK_EQ(jctx, 0); + DCHECK_GT(heap_begin, 0); + DCHECK_GT(heap_size, 0); + DCHECK_EQ(heap_begin % kHeapAlignment, 0); + DCHECK_EQ(heap_size % kHeapAlignment, 0); + DCHECK_LT(heap_begin, heap_begin + heap_size); jctx = new(jctx_buf) JavaContext(heap_begin, heap_size); } int __tsan_java_fini() { - SCOPED_JAVA_FUNC(__tsan_java_fini); + JAVA_FUNC_ENTER(__tsan_java_fini); DPrintf("#%d: java_fini()\n", thr->tid); - CHECK_NE(jctx, 0); + DCHECK_NE(jctx, 0); // FIXME(dvyukov): this does not call atexit() callbacks. 
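The JavaHeapBlock() helper above answers "which allocated Java block, if any, contains this address?" by walking meta-shadow cells backwards from the address until it finds a block start or steps past the candidate block. A standalone illustration of the same query over an ordered map of block starts follows; the map is this sketch's own data structure, not the runtime's metamap.

#include <cstdint>
#include <cstdio>
#include <map>

using uptr = uintptr_t;

// start address -> size of live blocks.
std::map<uptr, uptr> g_blocks;

bool FindBlock(uptr addr, uptr *start, uptr *size) {
  auto it = g_blocks.upper_bound(addr);  // first block starting after addr
  if (it == g_blocks.begin())
    return false;
  --it;                                  // block starting at or before addr
  if (addr >= it->first + it->second)
    return false;                        // addr is past the end of that block
  *start = it->first;
  *size = it->second;
  return true;
}

int main() {
  g_blocks[0x1000] = 0x100;
  uptr start, size;
  std::printf("%d\n", FindBlock(0x1080, &start, &size));  // 1: inside block
  std::printf("%d\n", FindBlock(0x1200, &start, &size));  // 0: outside
}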
int status = Finalize(thr); DPrintf("#%d: java_fini() = %d\n", thr->tid, status); @@ -88,74 +84,65 @@ int __tsan_java_fini() { } void __tsan_java_alloc(jptr ptr, jptr size) { - SCOPED_JAVA_FUNC(__tsan_java_alloc); - DPrintf("#%d: java_alloc(%p, %p)\n", thr->tid, ptr, size); - CHECK_NE(jctx, 0); - CHECK_NE(size, 0); - CHECK_EQ(ptr % kHeapAlignment, 0); - CHECK_EQ(size % kHeapAlignment, 0); - CHECK_GE(ptr, jctx->heap_begin); - CHECK_LE(ptr + size, jctx->heap_begin + jctx->heap_size); - - OnUserAlloc(thr, pc, ptr, size, false); + JAVA_FUNC_ENTER(__tsan_java_alloc); + DPrintf("#%d: java_alloc(0x%zx, 0x%zx)\n", thr->tid, ptr, size); + DCHECK_NE(jctx, 0); + DCHECK_NE(size, 0); + DCHECK_EQ(ptr % kHeapAlignment, 0); + DCHECK_EQ(size % kHeapAlignment, 0); + DCHECK_GE(ptr, jctx->heap_begin); + DCHECK_LE(ptr + size, jctx->heap_begin + jctx->heap_size); + + OnUserAlloc(thr, 0, ptr, size, false); } void __tsan_java_free(jptr ptr, jptr size) { - SCOPED_JAVA_FUNC(__tsan_java_free); - DPrintf("#%d: java_free(%p, %p)\n", thr->tid, ptr, size); - CHECK_NE(jctx, 0); - CHECK_NE(size, 0); - CHECK_EQ(ptr % kHeapAlignment, 0); - CHECK_EQ(size % kHeapAlignment, 0); - CHECK_GE(ptr, jctx->heap_begin); - CHECK_LE(ptr + size, jctx->heap_begin + jctx->heap_size); + JAVA_FUNC_ENTER(__tsan_java_free); + DPrintf("#%d: java_free(0x%zx, 0x%zx)\n", thr->tid, ptr, size); + DCHECK_NE(jctx, 0); + DCHECK_NE(size, 0); + DCHECK_EQ(ptr % kHeapAlignment, 0); + DCHECK_EQ(size % kHeapAlignment, 0); + DCHECK_GE(ptr, jctx->heap_begin); + DCHECK_LE(ptr + size, jctx->heap_begin + jctx->heap_size); ctx->metamap.FreeRange(thr->proc(), ptr, size); } void __tsan_java_move(jptr src, jptr dst, jptr size) { - SCOPED_JAVA_FUNC(__tsan_java_move); - DPrintf("#%d: java_move(%p, %p, %p)\n", thr->tid, src, dst, size); - CHECK_NE(jctx, 0); - CHECK_NE(size, 0); - CHECK_EQ(src % kHeapAlignment, 0); - CHECK_EQ(dst % kHeapAlignment, 0); - CHECK_EQ(size % kHeapAlignment, 0); - CHECK_GE(src, jctx->heap_begin); - CHECK_LE(src + size, jctx->heap_begin + jctx->heap_size); - CHECK_GE(dst, jctx->heap_begin); - CHECK_LE(dst + size, jctx->heap_begin + jctx->heap_size); - CHECK_NE(dst, src); - CHECK_NE(size, 0); + JAVA_FUNC_ENTER(__tsan_java_move); + DPrintf("#%d: java_move(0x%zx, 0x%zx, 0x%zx)\n", thr->tid, src, dst, size); + DCHECK_NE(jctx, 0); + DCHECK_NE(size, 0); + DCHECK_EQ(src % kHeapAlignment, 0); + DCHECK_EQ(dst % kHeapAlignment, 0); + DCHECK_EQ(size % kHeapAlignment, 0); + DCHECK_GE(src, jctx->heap_begin); + DCHECK_LE(src + size, jctx->heap_begin + jctx->heap_size); + DCHECK_GE(dst, jctx->heap_begin); + DCHECK_LE(dst + size, jctx->heap_begin + jctx->heap_size); + DCHECK_NE(dst, src); + DCHECK_NE(size, 0); // Assuming it's not running concurrently with threads that do // memory accesses and mutex operations (stop-the-world phase). ctx->metamap.MoveMemory(src, dst, size); - // Move shadow. - u64 *s = (u64*)MemToShadow(src); - u64 *d = (u64*)MemToShadow(dst); - u64 *send = (u64*)MemToShadow(src + size); - uptr inc = 1; - if (dst > src) { - s = (u64*)MemToShadow(src + size) - 1; - d = (u64*)MemToShadow(dst + size) - 1; - send = (u64*)MemToShadow(src) - 1; - inc = -1; - } - for (; s != send; s += inc, d += inc) { - *d = *s; - *s = 0; - } + // Clear the destination shadow range. + // We used to move shadow from src to dst, but the trace format does not + // support that anymore as it contains addresses of accesses. 
+ RawShadow *d = MemToShadow(dst); + RawShadow *dend = MemToShadow(dst + size); + internal_memset(d, 0, (dend - d) * sizeof(*d)); } jptr __tsan_java_find(jptr *from_ptr, jptr to) { - SCOPED_JAVA_FUNC(__tsan_java_find); - DPrintf("#%d: java_find(&%p, %p)\n", *from_ptr, to); - CHECK_EQ((*from_ptr) % kHeapAlignment, 0); - CHECK_EQ(to % kHeapAlignment, 0); - CHECK_GE(*from_ptr, jctx->heap_begin); - CHECK_LE(to, jctx->heap_begin + jctx->heap_size); + JAVA_FUNC_ENTER(__tsan_java_find); + DPrintf("#%d: java_find(&0x%zx, 0x%zx)\n", thr->tid, *from_ptr, to); + DCHECK_EQ((*from_ptr) % kHeapAlignment, 0); + DCHECK_EQ(to % kHeapAlignment, 0); + DCHECK_GE(*from_ptr, jctx->heap_begin); + DCHECK_LE(to, jctx->heap_begin + jctx->heap_size); for (uptr from = *from_ptr; from < to; from += kHeapAlignment) { MBlock *b = ctx->metamap.GetBlock(from); if (b) { @@ -167,101 +154,105 @@ jptr __tsan_java_find(jptr *from_ptr, jptr to) { } void __tsan_java_finalize() { - SCOPED_JAVA_FUNC(__tsan_java_finalize); - DPrintf("#%d: java_mutex_finalize()\n", thr->tid); - AcquireGlobal(thr, 0); + JAVA_FUNC_ENTER(__tsan_java_finalize); + DPrintf("#%d: java_finalize()\n", thr->tid); + AcquireGlobal(thr); } void __tsan_java_mutex_lock(jptr addr) { - SCOPED_JAVA_FUNC(__tsan_java_mutex_lock); - DPrintf("#%d: java_mutex_lock(%p)\n", thr->tid, addr); - CHECK_NE(jctx, 0); - CHECK_GE(addr, jctx->heap_begin); - CHECK_LT(addr, jctx->heap_begin + jctx->heap_size); - - MutexPostLock(thr, pc, addr, MutexFlagLinkerInit | MutexFlagWriteReentrant | - MutexFlagDoPreLockOnPostLock); + JAVA_FUNC_ENTER(__tsan_java_mutex_lock); + DPrintf("#%d: java_mutex_lock(0x%zx)\n", thr->tid, addr); + DCHECK_NE(jctx, 0); + DCHECK_GE(addr, jctx->heap_begin); + DCHECK_LT(addr, jctx->heap_begin + jctx->heap_size); + + MutexPostLock(thr, 0, addr, + MutexFlagLinkerInit | MutexFlagWriteReentrant | + MutexFlagDoPreLockOnPostLock); } void __tsan_java_mutex_unlock(jptr addr) { - SCOPED_JAVA_FUNC(__tsan_java_mutex_unlock); - DPrintf("#%d: java_mutex_unlock(%p)\n", thr->tid, addr); - CHECK_NE(jctx, 0); - CHECK_GE(addr, jctx->heap_begin); - CHECK_LT(addr, jctx->heap_begin + jctx->heap_size); + JAVA_FUNC_ENTER(__tsan_java_mutex_unlock); + DPrintf("#%d: java_mutex_unlock(0x%zx)\n", thr->tid, addr); + DCHECK_NE(jctx, 0); + DCHECK_GE(addr, jctx->heap_begin); + DCHECK_LT(addr, jctx->heap_begin + jctx->heap_size); - MutexUnlock(thr, pc, addr); + MutexUnlock(thr, 0, addr); } void __tsan_java_mutex_read_lock(jptr addr) { - SCOPED_JAVA_FUNC(__tsan_java_mutex_read_lock); - DPrintf("#%d: java_mutex_read_lock(%p)\n", thr->tid, addr); - CHECK_NE(jctx, 0); - CHECK_GE(addr, jctx->heap_begin); - CHECK_LT(addr, jctx->heap_begin + jctx->heap_size); - - MutexPostReadLock(thr, pc, addr, MutexFlagLinkerInit | - MutexFlagWriteReentrant | MutexFlagDoPreLockOnPostLock); + JAVA_FUNC_ENTER(__tsan_java_mutex_read_lock); + DPrintf("#%d: java_mutex_read_lock(0x%zx)\n", thr->tid, addr); + DCHECK_NE(jctx, 0); + DCHECK_GE(addr, jctx->heap_begin); + DCHECK_LT(addr, jctx->heap_begin + jctx->heap_size); + + MutexPostReadLock(thr, 0, addr, + MutexFlagLinkerInit | MutexFlagWriteReentrant | + MutexFlagDoPreLockOnPostLock); } void __tsan_java_mutex_read_unlock(jptr addr) { - SCOPED_JAVA_FUNC(__tsan_java_mutex_read_unlock); - DPrintf("#%d: java_mutex_read_unlock(%p)\n", thr->tid, addr); - CHECK_NE(jctx, 0); - CHECK_GE(addr, jctx->heap_begin); - CHECK_LT(addr, jctx->heap_begin + jctx->heap_size); + JAVA_FUNC_ENTER(__tsan_java_mutex_read_unlock); + DPrintf("#%d: java_mutex_read_unlock(0x%zx)\n", thr->tid, addr); + 
DCHECK_NE(jctx, 0); + DCHECK_GE(addr, jctx->heap_begin); + DCHECK_LT(addr, jctx->heap_begin + jctx->heap_size); - MutexReadUnlock(thr, pc, addr); + MutexReadUnlock(thr, 0, addr); } void __tsan_java_mutex_lock_rec(jptr addr, int rec) { - SCOPED_JAVA_FUNC(__tsan_java_mutex_lock_rec); - DPrintf("#%d: java_mutex_lock_rec(%p, %d)\n", thr->tid, addr, rec); - CHECK_NE(jctx, 0); - CHECK_GE(addr, jctx->heap_begin); - CHECK_LT(addr, jctx->heap_begin + jctx->heap_size); - CHECK_GT(rec, 0); - - MutexPostLock(thr, pc, addr, MutexFlagLinkerInit | MutexFlagWriteReentrant | - MutexFlagDoPreLockOnPostLock | MutexFlagRecursiveLock, rec); + JAVA_FUNC_ENTER(__tsan_java_mutex_lock_rec); + DPrintf("#%d: java_mutex_lock_rec(0x%zx, %d)\n", thr->tid, addr, rec); + DCHECK_NE(jctx, 0); + DCHECK_GE(addr, jctx->heap_begin); + DCHECK_LT(addr, jctx->heap_begin + jctx->heap_size); + DCHECK_GT(rec, 0); + + MutexPostLock(thr, 0, addr, + MutexFlagLinkerInit | MutexFlagWriteReentrant | + MutexFlagDoPreLockOnPostLock | MutexFlagRecursiveLock, + rec); } int __tsan_java_mutex_unlock_rec(jptr addr) { - SCOPED_JAVA_FUNC(__tsan_java_mutex_unlock_rec); - DPrintf("#%d: java_mutex_unlock_rec(%p)\n", thr->tid, addr); - CHECK_NE(jctx, 0); - CHECK_GE(addr, jctx->heap_begin); - CHECK_LT(addr, jctx->heap_begin + jctx->heap_size); + JAVA_FUNC_ENTER(__tsan_java_mutex_unlock_rec); + DPrintf("#%d: java_mutex_unlock_rec(0x%zx)\n", thr->tid, addr); + DCHECK_NE(jctx, 0); + DCHECK_GE(addr, jctx->heap_begin); + DCHECK_LT(addr, jctx->heap_begin + jctx->heap_size); - return MutexUnlock(thr, pc, addr, MutexFlagRecursiveUnlock); + return MutexUnlock(thr, 0, addr, MutexFlagRecursiveUnlock); } void __tsan_java_acquire(jptr addr) { - SCOPED_JAVA_FUNC(__tsan_java_acquire); - DPrintf("#%d: java_acquire(%p)\n", thr->tid, addr); - CHECK_NE(jctx, 0); - CHECK_GE(addr, jctx->heap_begin); - CHECK_LT(addr, jctx->heap_begin + jctx->heap_size); + JAVA_FUNC_ENTER(__tsan_java_acquire); + DPrintf("#%d: java_acquire(0x%zx)\n", thr->tid, addr); + DCHECK_NE(jctx, 0); + DCHECK_GE(addr, jctx->heap_begin); + DCHECK_LT(addr, jctx->heap_begin + jctx->heap_size); - Acquire(thr, caller_pc, addr); + Acquire(thr, 0, addr); } void __tsan_java_release(jptr addr) { - SCOPED_JAVA_FUNC(__tsan_java_release); - DPrintf("#%d: java_release(%p)\n", thr->tid, addr); - CHECK_NE(jctx, 0); - CHECK_GE(addr, jctx->heap_begin); - CHECK_LT(addr, jctx->heap_begin + jctx->heap_size); + JAVA_FUNC_ENTER(__tsan_java_release); + DPrintf("#%d: java_release(0x%zx)\n", thr->tid, addr); + DCHECK_NE(jctx, 0); + DCHECK_GE(addr, jctx->heap_begin); + DCHECK_LT(addr, jctx->heap_begin + jctx->heap_size); - Release(thr, caller_pc, addr); + Release(thr, 0, addr); } void __tsan_java_release_store(jptr addr) { - SCOPED_JAVA_FUNC(__tsan_java_release); - DPrintf("#%d: java_release_store(%p)\n", thr->tid, addr); - CHECK_NE(jctx, 0); - CHECK_GE(addr, jctx->heap_begin); - CHECK_LT(addr, jctx->heap_begin + jctx->heap_size); + JAVA_FUNC_ENTER(__tsan_java_release); + DPrintf("#%d: java_release_store(0x%zx)\n", thr->tid, addr); + DCHECK_NE(jctx, 0); + DCHECK_GE(addr, jctx->heap_begin); + DCHECK_LT(addr, jctx->heap_begin + jctx->heap_size); - ReleaseStore(thr, caller_pc, addr); + ReleaseStore(thr, 0, addr); } diff --git a/libsanitizer/tsan/tsan_mman.cpp b/libsanitizer/tsan/tsan_mman.cpp index 7765bc0..f1b6768 100644 --- a/libsanitizer/tsan/tsan_mman.cpp +++ b/libsanitizer/tsan/tsan_mman.cpp @@ -148,7 +148,7 @@ static void SignalUnsafeCall(ThreadState *thr, uptr pc) { ObtainCurrentStack(thr, pc, &stack); if 
(IsFiredSuppression(ctx, ReportTypeSignalUnsafe, stack)) return; - ThreadRegistryLock l(ctx->thread_registry); + ThreadRegistryLock l(&ctx->thread_registry); ScopedReport rep(ReportTypeSignalUnsafe); rep.AddStack(stack, true); OutputReport(thr, rep); @@ -218,7 +218,7 @@ void *user_reallocarray(ThreadState *thr, uptr pc, void *p, uptr size, uptr n) { } void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write) { - DPrintf("#%d: alloc(%zu) = %p\n", thr->tid, sz, p); + DPrintf("#%d: alloc(%zu) = 0x%zx\n", thr->tid, sz, p); ctx->metamap.AllocBlock(thr, pc, p, sz); if (write && thr->ignore_reads_and_writes == 0) MemoryRangeImitateWrite(thr, pc, (uptr)p, sz); @@ -229,7 +229,7 @@ void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write) { void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write) { CHECK_NE(p, (void*)0); uptr sz = ctx->metamap.FreeBlock(thr->proc(), p); - DPrintf("#%d: free(%p, %zu)\n", thr->tid, p, sz); + DPrintf("#%d: free(0x%zx, %zu)\n", thr->tid, p, sz); if (write && thr->ignore_reads_and_writes == 0) MemoryRangeFreed(thr, pc, (uptr)p, sz); } @@ -336,7 +336,7 @@ void invoke_free_hook(void *ptr) { RunFreeHooks(ptr); } -void *internal_alloc(MBlockType typ, uptr sz) { +void *Alloc(uptr sz) { ThreadState *thr = cur_thread(); if (thr->nomalloc) { thr->nomalloc = 0; // CHECK calls internal_malloc(). @@ -345,7 +345,7 @@ void *internal_alloc(MBlockType typ, uptr sz) { return InternalAlloc(sz, &thr->proc()->internal_alloc_cache); } -void internal_free(void *p) { +void FreeImpl(void *p) { ThreadState *thr = cur_thread(); if (thr->nomalloc) { thr->nomalloc = 0; // CHECK calls internal_malloc(). diff --git a/libsanitizer/tsan/tsan_mman.h b/libsanitizer/tsan/tsan_mman.h index a5280d4..efea5e5 100644 --- a/libsanitizer/tsan/tsan_mman.h +++ b/libsanitizer/tsan/tsan_mman.h @@ -47,42 +47,29 @@ uptr user_alloc_usable_size(const void *p); void invoke_malloc_hook(void *ptr, uptr size); void invoke_free_hook(void *ptr); -enum MBlockType { - MBlockScopedBuf, - MBlockString, - MBlockStackTrace, - MBlockShadowStack, - MBlockSync, - MBlockClock, - MBlockThreadContex, - MBlockDeadInfo, - MBlockRacyStacks, - MBlockRacyAddresses, - MBlockAtExit, - MBlockFlag, - MBlockReport, - MBlockReportMop, - MBlockReportThread, - MBlockReportMutex, - MBlockReportLoc, - MBlockReportStack, - MBlockSuppression, - MBlockExpectRace, - MBlockSignal, - MBlockJmpBuf, +// For internal data structures. +void *Alloc(uptr sz); +void FreeImpl(void *p); - // This must be the last. - MBlockTypeCount -}; +template <typename T, typename... Args> +T *New(Args &&...args) { + return new (Alloc(sizeof(T))) T(static_cast<Args &&>(args)...); +} -// For internal data structures. -void *internal_alloc(MBlockType typ, uptr sz); -void internal_free(void *p); +template <typename T> +void Free(T *&p) { + if (p == nullptr) + return; + FreeImpl(p); + p = nullptr; +} template <typename T> -void DestroyAndFree(T *p) { +void DestroyAndFree(T *&p) { + if (p == nullptr) + return; p->~T(); - internal_free(p); + Free(p); } } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_mutex.cpp b/libsanitizer/tsan/tsan_mutex.cpp deleted file mode 100644 index d8b1826..0000000 --- a/libsanitizer/tsan/tsan_mutex.cpp +++ /dev/null @@ -1,280 +0,0 @@ -//===-- tsan_mutex.cpp ----------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
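In the tsan_mman.h hunk just above, the MBlockType-tagged internal_alloc/internal_free pair is replaced by untyped Alloc/FreeImpl plus the New<T>(), Free(p) and DestroyAndFree(p) templates, and the latter two now take the pointer by reference so they can null it after releasing the memory. A standalone re-creation of that pattern follows (not the runtime's code: malloc/free stand in for the internal allocator, and std::forward replaces the static_cast used above).

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>
    #include <new>
    #include <utility>

    // Stand-ins for the runtime's internal-allocator-backed helpers.
    void *Alloc(std::size_t sz) { return std::malloc(sz); }
    void FreeImpl(void *p) { std::free(p); }

    template <typename T, typename... Args>
    T *New(Args &&...args) {
      return new (Alloc(sizeof(T))) T(std::forward<Args>(args)...);
    }

    template <typename T>
    void Free(T *&p) {            // by reference: clears the caller's pointer
      if (p == nullptr) return;
      FreeImpl(p);
      p = nullptr;
    }

    template <typename T>
    void DestroyAndFree(T *&p) {  // run the destructor, then free and clear
      if (p == nullptr) return;
      p->~T();
      Free(p);
    }

    struct Report {
      int n;
      explicit Report(int n) : n(n) { std::printf("new Report(%d)\n", n); }
      ~Report() { std::printf("destroy Report(%d)\n", n); }
    };

    int main() {
      Report *r = New<Report>(3);  // placement-new into Alloc'd storage
      DestroyAndFree(r);           // dtor runs, memory freed, r == nullptr
      return r == nullptr ? 0 : 1;
    }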
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of ThreadSanitizer (TSan), a race detector. -// -//===----------------------------------------------------------------------===// -#include "sanitizer_common/sanitizer_libc.h" -#include "tsan_mutex.h" -#include "tsan_platform.h" -#include "tsan_rtl.h" - -namespace __tsan { - -// Simple reader-writer spin-mutex. Optimized for not-so-contended case. -// Readers have preference, can possibly starvate writers. - -// The table fixes what mutexes can be locked under what mutexes. -// E.g. if the row for MutexTypeThreads contains MutexTypeReport, -// then Report mutex can be locked while under Threads mutex. -// The leaf mutexes can be locked under any other mutexes. -// Recursive locking is not supported. -#if SANITIZER_DEBUG && !SANITIZER_GO -const MutexType MutexTypeLeaf = (MutexType)-1; -static MutexType CanLockTab[MutexTypeCount][MutexTypeCount] = { - /*0 MutexTypeInvalid*/ {}, - /*1 MutexTypeTrace*/ {MutexTypeLeaf}, - /*2 MutexTypeThreads*/ {MutexTypeReport}, - /*3 MutexTypeReport*/ {MutexTypeSyncVar, - MutexTypeMBlock, MutexTypeJavaMBlock}, - /*4 MutexTypeSyncVar*/ {MutexTypeDDetector}, - /*5 MutexTypeSyncTab*/ {}, // unused - /*6 MutexTypeSlab*/ {MutexTypeLeaf}, - /*7 MutexTypeAnnotations*/ {}, - /*8 MutexTypeAtExit*/ {MutexTypeSyncVar}, - /*9 MutexTypeMBlock*/ {MutexTypeSyncVar}, - /*10 MutexTypeJavaMBlock*/ {MutexTypeSyncVar}, - /*11 MutexTypeDDetector*/ {}, - /*12 MutexTypeFired*/ {MutexTypeLeaf}, - /*13 MutexTypeRacy*/ {MutexTypeLeaf}, - /*14 MutexTypeGlobalProc*/ {}, -}; - -static bool CanLockAdj[MutexTypeCount][MutexTypeCount]; -#endif - -void InitializeMutex() { -#if SANITIZER_DEBUG && !SANITIZER_GO - // Build the "can lock" adjacency matrix. - // If [i][j]==true, then one can lock mutex j while under mutex i. - const int N = MutexTypeCount; - int cnt[N] = {}; - bool leaf[N] = {}; - for (int i = 1; i < N; i++) { - for (int j = 0; j < N; j++) { - MutexType z = CanLockTab[i][j]; - if (z == MutexTypeInvalid) - continue; - if (z == MutexTypeLeaf) { - CHECK(!leaf[i]); - leaf[i] = true; - continue; - } - CHECK(!CanLockAdj[i][(int)z]); - CanLockAdj[i][(int)z] = true; - cnt[i]++; - } - } - for (int i = 0; i < N; i++) { - CHECK(!leaf[i] || cnt[i] == 0); - } - // Add leaf mutexes. - for (int i = 0; i < N; i++) { - if (!leaf[i]) - continue; - for (int j = 0; j < N; j++) { - if (i == j || leaf[j] || j == MutexTypeInvalid) - continue; - CHECK(!CanLockAdj[j][i]); - CanLockAdj[j][i] = true; - } - } - // Build the transitive closure. - bool CanLockAdj2[MutexTypeCount][MutexTypeCount]; - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - CanLockAdj2[i][j] = CanLockAdj[i][j]; - } - } - for (int k = 0; k < N; k++) { - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - if (CanLockAdj2[i][k] && CanLockAdj2[k][j]) { - CanLockAdj2[i][j] = true; - } - } - } - } -#if 0 - Printf("Can lock graph:\n"); - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - Printf("%d ", CanLockAdj[i][j]); - } - Printf("\n"); - } - Printf("Can lock graph closure:\n"); - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - Printf("%d ", CanLockAdj2[i][j]); - } - Printf("\n"); - } -#endif - // Verify that the graph is acyclic. 
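The deleted tsan_mutex.cpp above shipped its own lock-order checker: CanLockTab records which runtime mutex types may be acquired while holding which, InitializeMutex expands the table into an adjacency matrix (plus edges for leaf mutexes), computes the transitive closure, and aborts if any type can transitively be locked under itself. The standalone sketch below reproduces just that closure-and-self-loop check with three made-up mutex types; it is an illustration, not the deleted code.

    #include <cstdio>

    enum { kA, kB, kC, kCount };  // made-up mutex types

    int main() {
      bool can_lock[kCount][kCount] = {};
      can_lock[kA][kB] = true;  // B may be taken while holding A
      can_lock[kB][kC] = true;  // C may be taken while holding B
      // can_lock[kC][kA] = true;  // uncomment to create the cycle A->B->C->A

      // Transitive closure, Floyd-Warshall style: i->k and k->j imply i->j.
      for (int k = 0; k < kCount; k++)
        for (int i = 0; i < kCount; i++)
          for (int j = 0; j < kCount; j++)
            if (can_lock[i][k] && can_lock[k][j]) can_lock[i][j] = true;

      // A self-loop after closure means some lock order forms a cycle.
      for (int i = 0; i < kCount; i++)
        if (can_lock[i][i]) {
          std::printf("mutex type %d participates in a cycle\n", i);
          return 1;
        }
      std::printf("lock-order graph is acyclic\n");
      return 0;
    }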
- for (int i = 0; i < N; i++) { - if (CanLockAdj2[i][i]) { - Printf("Mutex %d participates in a cycle\n", i); - Die(); - } - } -#endif -} - -InternalDeadlockDetector::InternalDeadlockDetector() { - // Rely on zero initialization because some mutexes can be locked before ctor. -} - -#if SANITIZER_DEBUG && !SANITIZER_GO -void InternalDeadlockDetector::Lock(MutexType t) { - // Printf("LOCK %d @%zu\n", t, seq_ + 1); - CHECK_GT(t, MutexTypeInvalid); - CHECK_LT(t, MutexTypeCount); - u64 max_seq = 0; - u64 max_idx = MutexTypeInvalid; - for (int i = 0; i != MutexTypeCount; i++) { - if (locked_[i] == 0) - continue; - CHECK_NE(locked_[i], max_seq); - if (max_seq < locked_[i]) { - max_seq = locked_[i]; - max_idx = i; - } - } - locked_[t] = ++seq_; - if (max_idx == MutexTypeInvalid) - return; - // Printf(" last %d @%zu\n", max_idx, max_seq); - if (!CanLockAdj[max_idx][t]) { - Printf("ThreadSanitizer: internal deadlock detected\n"); - Printf("ThreadSanitizer: can't lock %d while under %zu\n", - t, (uptr)max_idx); - CHECK(0); - } -} - -void InternalDeadlockDetector::Unlock(MutexType t) { - // Printf("UNLO %d @%zu #%zu\n", t, seq_, locked_[t]); - CHECK(locked_[t]); - locked_[t] = 0; -} - -void InternalDeadlockDetector::CheckNoLocks() { - for (int i = 0; i != MutexTypeCount; i++) { - CHECK_EQ(locked_[i], 0); - } -} -#endif - -void CheckNoLocks(ThreadState *thr) { -#if SANITIZER_DEBUG && !SANITIZER_GO - thr->internal_deadlock_detector.CheckNoLocks(); -#endif -} - -const uptr kUnlocked = 0; -const uptr kWriteLock = 1; -const uptr kReadLock = 2; - -class Backoff { - public: - Backoff() - : iter_() { - } - - bool Do() { - if (iter_++ < kActiveSpinIters) - proc_yield(kActiveSpinCnt); - else - internal_sched_yield(); - return true; - } - - u64 Contention() const { - u64 active = iter_ % kActiveSpinIters; - u64 passive = iter_ - active; - return active + 10 * passive; - } - - private: - int iter_; - static const int kActiveSpinIters = 10; - static const int kActiveSpinCnt = 20; -}; - -Mutex::Mutex(MutexType type) { - CHECK_GT(type, MutexTypeInvalid); - CHECK_LT(type, MutexTypeCount); -#if SANITIZER_DEBUG - type_ = type; -#endif - atomic_store(&state_, kUnlocked, memory_order_relaxed); -} - -Mutex::~Mutex() { - CHECK_EQ(atomic_load(&state_, memory_order_relaxed), kUnlocked); -} - -void Mutex::Lock() { -#if SANITIZER_DEBUG && !SANITIZER_GO - cur_thread()->internal_deadlock_detector.Lock(type_); -#endif - uptr cmp = kUnlocked; - if (atomic_compare_exchange_strong(&state_, &cmp, kWriteLock, - memory_order_acquire)) - return; - for (Backoff backoff; backoff.Do();) { - if (atomic_load(&state_, memory_order_relaxed) == kUnlocked) { - cmp = kUnlocked; - if (atomic_compare_exchange_weak(&state_, &cmp, kWriteLock, - memory_order_acquire)) { - return; - } - } - } -} - -void Mutex::Unlock() { - uptr prev = atomic_fetch_sub(&state_, kWriteLock, memory_order_release); - (void)prev; - DCHECK_NE(prev & kWriteLock, 0); -#if SANITIZER_DEBUG && !SANITIZER_GO - cur_thread()->internal_deadlock_detector.Unlock(type_); -#endif -} - -void Mutex::ReadLock() { -#if SANITIZER_DEBUG && !SANITIZER_GO - cur_thread()->internal_deadlock_detector.Lock(type_); -#endif - uptr prev = atomic_fetch_add(&state_, kReadLock, memory_order_acquire); - if ((prev & kWriteLock) == 0) - return; - for (Backoff backoff; backoff.Do();) { - prev = atomic_load(&state_, memory_order_acquire); - if ((prev & kWriteLock) == 0) { - return; - } - } -} - -void Mutex::ReadUnlock() { - uptr prev = atomic_fetch_sub(&state_, kReadLock, memory_order_release); - (void)prev; - 
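The Mutex being deleted here encodes the whole reader-writer lock in one word: kWriteLock (bit 0) is claimed by a writer's compare-and-swap from kUnlocked, and every reader adds kReadLock (2), so the upper bits count readers while bit 0 marks a writer. Below is a trimmed standalone version of that encoding, with std::atomic in place of the sanitizer atomics and the Backoff loop reduced to a plain spin; it mirrors the deleted protocol rather than proposing a new one.

    #include <atomic>
    #include <cstdint>

    class SpinRWMutex {
      static constexpr uintptr_t kUnlocked = 0;
      static constexpr uintptr_t kWriteLock = 1;   // bit 0: writer present
      static constexpr uintptr_t kReadLock = 2;    // each reader adds 2
      std::atomic<uintptr_t> state_{kUnlocked};

     public:
      void Lock() {                 // writer: CAS from fully unlocked
        uintptr_t cmp = kUnlocked;
        while (!state_.compare_exchange_weak(cmp, kWriteLock,
                                             std::memory_order_acquire))
          cmp = kUnlocked;          // retry until no readers and no writer
      }
      void Unlock() { state_.fetch_sub(kWriteLock, std::memory_order_release); }

      void ReadLock() {             // reader: announce first, then wait out a writer
        uintptr_t prev = state_.fetch_add(kReadLock, std::memory_order_acquire);
        while (prev & kWriteLock)
          prev = state_.load(std::memory_order_acquire);
      }
      void ReadUnlock() { state_.fetch_sub(kReadLock, std::memory_order_release); }
    };

    int main() {
      SpinRWMutex mu;
      mu.ReadLock();  mu.ReadUnlock();
      mu.Lock();      mu.Unlock();
      return 0;
    }

Readers announcing themselves before checking for a writer is what gives the original its reader preference, and the possible writer starvation its header comment warned about.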
DCHECK_EQ(prev & kWriteLock, 0); - DCHECK_GT(prev & ~kWriteLock, 0); -#if SANITIZER_DEBUG && !SANITIZER_GO - cur_thread()->internal_deadlock_detector.Unlock(type_); -#endif -} - -void Mutex::CheckLocked() { - CHECK_NE(atomic_load(&state_, memory_order_relaxed), 0); -} - -} // namespace __tsan diff --git a/libsanitizer/tsan/tsan_mutex.h b/libsanitizer/tsan/tsan_mutex.h deleted file mode 100644 index 9a579ea..0000000 --- a/libsanitizer/tsan/tsan_mutex.h +++ /dev/null @@ -1,87 +0,0 @@ -//===-- tsan_mutex.h --------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of ThreadSanitizer (TSan), a race detector. -// -//===----------------------------------------------------------------------===// -#ifndef TSAN_MUTEX_H -#define TSAN_MUTEX_H - -#include "sanitizer_common/sanitizer_atomic.h" -#include "sanitizer_common/sanitizer_mutex.h" -#include "tsan_defs.h" - -namespace __tsan { - -enum MutexType { - MutexTypeInvalid, - MutexTypeTrace, - MutexTypeThreads, - MutexTypeReport, - MutexTypeSyncVar, - MutexTypeSyncTab, - MutexTypeSlab, - MutexTypeAnnotations, - MutexTypeAtExit, - MutexTypeMBlock, - MutexTypeJavaMBlock, - MutexTypeDDetector, - MutexTypeFired, - MutexTypeRacy, - MutexTypeGlobalProc, - - // This must be the last. - MutexTypeCount -}; - -class Mutex { - public: - explicit Mutex(MutexType type); - ~Mutex(); - - void Lock(); - void Unlock(); - - void ReadLock(); - void ReadUnlock(); - - void CheckLocked(); - - private: - atomic_uintptr_t state_; -#if SANITIZER_DEBUG - MutexType type_; -#endif - - Mutex(const Mutex&); - void operator = (const Mutex&); -}; - -typedef GenericScopedLock<Mutex> Lock; -typedef GenericScopedReadLock<Mutex> ReadLock; - -class InternalDeadlockDetector { - public: - InternalDeadlockDetector(); - void Lock(MutexType t); - void Unlock(MutexType t); - void CheckNoLocks(); - private: - u64 seq_; - u64 locked_[MutexTypeCount]; -}; - -void InitializeMutex(); - -// Checks that the current thread does not hold any runtime locks -// (e.g. when returning from an interceptor). -void CheckNoLocks(ThreadState *thr); - -} // namespace __tsan - -#endif // TSAN_MUTEX_H diff --git a/libsanitizer/tsan/tsan_mutexset.cpp b/libsanitizer/tsan/tsan_mutexset.cpp index 813fa3b..efc0e41 100644 --- a/libsanitizer/tsan/tsan_mutexset.cpp +++ b/libsanitizer/tsan/tsan_mutexset.cpp @@ -14,11 +14,7 @@ namespace __tsan { -const uptr MutexSet::kMaxSize; - MutexSet::MutexSet() { - size_ = 0; - internal_memset(&descs_, 0, sizeof(descs_)); } void MutexSet::Add(u64 id, bool write, u64 epoch) { @@ -44,9 +40,12 @@ void MutexSet::Add(u64 id, bool write, u64 epoch) { CHECK_EQ(size_, kMaxSize - 1); } // Add new mutex descriptor. + descs_[size_].addr = 0; + descs_[size_].stack_id = kInvalidStackID; descs_[size_].id = id; descs_[size_].write = write; descs_[size_].epoch = epoch; + descs_[size_].seq = seq_++; descs_[size_].count = 1; size_++; } @@ -70,6 +69,46 @@ void MutexSet::Remove(u64 id) { } } +void MutexSet::AddAddr(uptr addr, StackID stack_id, bool write) { + // Look up existing mutex with the same id. + for (uptr i = 0; i < size_; i++) { + if (descs_[i].addr == addr) { + descs_[i].count++; + descs_[i].seq = seq_++; + return; + } + } + // On overflow, find the oldest mutex and drop it. 
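MutexSet::AddAddr above now records each held mutex by address together with the stack id of the acquisition site, stamps every entry with a monotonically increasing seq, and, per the comment right above, evicts the entry with the smallest seq once all kMaxSize = 16 slots are taken. The toy below isolates just that bookkeeping (fixed array, seq stamps, evict-oldest, swap-with-last removal) with invented types and a capacity of 4 so the eviction is easy to see; it is not the runtime's class.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    struct BoundedMutexSet {
      static constexpr size_t kMaxSize = 4;   // 16 in the real MutexSet
      struct Desc {
        uintptr_t addr;
        uint32_t seq;
        uint32_t count;
      };
      Desc descs[kMaxSize] = {};
      size_t size = 0;
      uint32_t seq = 0;

      void Add(uintptr_t addr) {
        for (size_t i = 0; i < size; i++)
          if (descs[i].addr == addr) {        // already held: refresh it
            descs[i].count++;
            descs[i].seq = seq++;
            return;
          }
        if (size == kMaxSize) {               // full: drop the oldest entry
          size_t oldest = 0;
          for (size_t i = 1; i < size; i++)
            if (descs[i].seq < descs[oldest].seq) oldest = i;
          descs[oldest] = descs[--size];      // same swap-with-last as RemovePos
        }
        descs[size++] = Desc{addr, seq++, 1};
      }
    };

    int main() {
      BoundedMutexSet ms;
      for (uintptr_t a = 0x1000; a <= 0x5000; a += 0x1000)
        ms.Add(a);                            // the fifth Add evicts 0x1000
      for (size_t i = 0; i < ms.size; i++)
        std::printf("held mutex 0x%lx (seq %u)\n",
                    (unsigned long)ms.descs[i].addr, ms.descs[i].seq);
      return 0;
    }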
+ if (size_ == kMaxSize) { + uptr min = 0; + for (uptr i = 0; i < size_; i++) { + if (descs_[i].seq < descs_[min].seq) + min = i; + } + RemovePos(min); + CHECK_EQ(size_, kMaxSize - 1); + } + // Add new mutex descriptor. + descs_[size_].addr = addr; + descs_[size_].stack_id = stack_id; + descs_[size_].id = 0; + descs_[size_].write = write; + descs_[size_].epoch = 0; + descs_[size_].seq = seq_++; + descs_[size_].count = 1; + size_++; +} + +void MutexSet::DelAddr(uptr addr, bool destroy) { + for (uptr i = 0; i < size_; i++) { + if (descs_[i].addr == addr) { + if (destroy || --descs_[i].count == 0) + RemovePos(i); + return; + } + } +} + void MutexSet::RemovePos(uptr i) { CHECK_LT(i, size_); descs_[i] = descs_[size_ - 1]; diff --git a/libsanitizer/tsan/tsan_mutexset.h b/libsanitizer/tsan/tsan_mutexset.h index d63881f..a448cee 100644 --- a/libsanitizer/tsan/tsan_mutexset.h +++ b/libsanitizer/tsan/tsan_mutexset.h @@ -21,12 +21,22 @@ class MutexSet { public: // Holds limited number of mutexes. // The oldest mutexes are discarded on overflow. - static const uptr kMaxSize = 16; + static constexpr uptr kMaxSize = 16; struct Desc { + uptr addr; + StackID stack_id; u64 id; u64 epoch; - int count; + u32 seq; + u32 count; bool write; + + Desc() { internal_memset(this, 0, sizeof(*this)); } + Desc(const Desc& other) { *this = other; } + Desc& operator=(const MutexSet::Desc& other) { + internal_memcpy(this, &other, sizeof(*this)); + return *this; + } }; MutexSet(); @@ -34,21 +44,19 @@ class MutexSet { void Add(u64 id, bool write, u64 epoch); void Del(u64 id, bool write); void Remove(u64 id); // Removes the mutex completely (if it's destroyed). + void AddAddr(uptr addr, StackID stack_id, bool write); + void DelAddr(uptr addr, bool destroy = false); uptr Size() const; Desc Get(uptr i) const; - void operator=(const MutexSet &other) { - internal_memcpy(this, &other, sizeof(*this)); - } - private: #if !SANITIZER_GO - uptr size_; + u32 seq_ = 0; + uptr size_ = 0; Desc descs_[kMaxSize]; -#endif void RemovePos(uptr i); - MutexSet(const MutexSet&); +#endif }; // Go does not have mutexes, so do not spend memory and time. @@ -59,7 +67,8 @@ MutexSet::MutexSet() {} void MutexSet::Add(u64 id, bool write, u64 epoch) {} void MutexSet::Del(u64 id, bool write) {} void MutexSet::Remove(u64 id) {} -void MutexSet::RemovePos(uptr i) {} +void MutexSet::AddAddr(uptr addr, StackID stack_id, bool write) {} +void MutexSet::DelAddr(uptr addr, bool destroy) {} uptr MutexSet::Size() const { return 0; } MutexSet::Desc MutexSet::Get(uptr i) const { return Desc(); } #endif diff --git a/libsanitizer/tsan/tsan_platform.h b/libsanitizer/tsan/tsan_platform.h index 8bd218e..fc27a56 100644 --- a/libsanitizer/tsan/tsan_platform.h +++ b/libsanitizer/tsan/tsan_platform.h @@ -23,21 +23,19 @@ namespace __tsan { -#if defined(__x86_64__) -#define HAS_48_BIT_ADDRESS_SPACE 1 -#elif SANITIZER_IOSSIM // arm64 iOS simulators (order of #if matters) -#define HAS_48_BIT_ADDRESS_SPACE 1 -#elif SANITIZER_IOS // arm64 iOS devices (order of #if matters) -#define HAS_48_BIT_ADDRESS_SPACE 0 -#elif SANITIZER_MAC // arm64 macOS (order of #if matters) -#define HAS_48_BIT_ADDRESS_SPACE 1 -#else -#define HAS_48_BIT_ADDRESS_SPACE 0 -#endif - -#if !SANITIZER_GO +enum { + // App memory is not mapped onto shadow memory range. + kBrokenMapping = 1 << 0, + // Mapping app memory and back does not produce the same address, + // this can lead to wrong addresses in reports and potentially + // other bad consequences. 
+ kBrokenReverseMapping = 1 << 1, + // Mapping is non-linear for linear user range. + // This is bad and can lead to unpredictable memory corruptions, etc + // because range access functions assume linearity. + kBrokenLinearity = 1 << 2, +}; -#if HAS_48_BIT_ADDRESS_SPACE /* C/C++ on linux/x86_64 and freebsd/x86_64 0000 0000 1000 - 0080 0000 0000: main binary and/or MAP_32BIT mappings (512GB) @@ -65,9 +63,8 @@ C/C++ on netbsd/amd64 can reuse the same mapping: * Stack on NetBSD/amd64 has prereserved 128MB. * Heap grows downwards (top-down). * ASLR must be disabled per-process or globally. - */ -struct Mapping { +struct Mapping48AddressSpace { static const uptr kMetaShadowBeg = 0x300000000000ull; static const uptr kMetaShadowEnd = 0x340000000000ull; static const uptr kTraceMemBeg = 0x600000000000ull; @@ -82,13 +79,12 @@ struct Mapping { static const uptr kMidAppMemEnd = 0x568000000000ull; static const uptr kHiAppMemBeg = 0x7e8000000000ull; static const uptr kHiAppMemEnd = 0x800000000000ull; - static const uptr kAppMemMsk = 0x780000000000ull; - static const uptr kAppMemXor = 0x040000000000ull; + static const uptr kShadowMsk = 0x780000000000ull; + static const uptr kShadowXor = 0x040000000000ull; + static const uptr kShadowAdd = 0x000000000000ull; static const uptr kVdsoBeg = 0xf000000000000000ull; }; -#define TSAN_MID_APP_RANGE 1 -#elif defined(__mips64) /* C/C++ on linux/mips64 (40-bit VMA) 0000 0000 00 - 0100 0000 00: - (4 GB) @@ -105,7 +101,7 @@ fe00 0000 00 - ff00 0000 00: heap (4 GB) ff00 0000 00 - ff80 0000 00: - (2 GB) ff80 0000 00 - ffff ffff ff: modules and main thread stack (<2 GB) */ -struct Mapping40 { +struct MappingMips64_40 { static const uptr kMetaShadowBeg = 0x4000000000ull; static const uptr kMetaShadowEnd = 0x5000000000ull; static const uptr kTraceMemBeg = 0xb000000000ull; @@ -120,14 +116,12 @@ struct Mapping40 { static const uptr kMidAppMemEnd = 0xab00000000ull; static const uptr kHiAppMemBeg = 0xff80000000ull; static const uptr kHiAppMemEnd = 0xffffffffffull; - static const uptr kAppMemMsk = 0xf800000000ull; - static const uptr kAppMemXor = 0x0800000000ull; + static const uptr kShadowMsk = 0xf800000000ull; + static const uptr kShadowXor = 0x0800000000ull; + static const uptr kShadowAdd = 0x0000000000ull; static const uptr kVdsoBeg = 0xfffff00000ull; }; -#define TSAN_MID_APP_RANGE 1 -#define TSAN_RUNTIME_VMA 1 -#elif defined(__aarch64__) && defined(__APPLE__) /* C/C++ on Darwin/iOS/ARM64 (36-bit VMA, 64 GB VM) 0000 0000 00 - 0100 0000 00: - (4 GB) @@ -141,7 +135,7 @@ C/C++ on Darwin/iOS/ARM64 (36-bit VMA, 64 GB VM) 0f00 0000 00 - 0fc0 0000 00: traces (3 GB) 0fc0 0000 00 - 1000 0000 00: - */ -struct Mapping { +struct MappingAppleAarch64 { static const uptr kLoAppMemBeg = 0x0100000000ull; static const uptr kLoAppMemEnd = 0x0200000000ull; static const uptr kHeapMemBeg = 0x0200000000ull; @@ -154,18 +148,14 @@ struct Mapping { static const uptr kTraceMemEnd = 0x0fc0000000ull; static const uptr kHiAppMemBeg = 0x0fc0000000ull; static const uptr kHiAppMemEnd = 0x0fc0000000ull; - static const uptr kAppMemMsk = 0x0ull; - static const uptr kAppMemXor = 0x0ull; + static const uptr kShadowMsk = 0x0ull; + static const uptr kShadowXor = 0x0ull; + static const uptr kShadowAdd = 0x0ull; static const uptr kVdsoBeg = 0x7000000000000000ull; + static const uptr kMidAppMemBeg = 0; + static const uptr kMidAppMemEnd = 0; }; -#elif defined(__aarch64__) && !defined(__APPLE__) -// AArch64 supports multiple VMA which leads to multiple address transformation -// functions. 
To support these multiple VMAS transformations and mappings TSAN -// runtime for AArch64 uses an external memory read (vmaSize) to select which -// mapping to use. Although slower, it make a same instrumented binary run on -// multiple kernels. - /* C/C++ on linux/aarch64 (39-bit VMA) 0000 0010 00 - 0100 0000 00: main binary @@ -181,7 +171,7 @@ C/C++ on linux/aarch64 (39-bit VMA) 7c00 0000 00 - 7d00 0000 00: heap 7d00 0000 00 - 7fff ffff ff: modules and main thread stack */ -struct Mapping39 { +struct MappingAarch64_39 { static const uptr kLoAppMemBeg = 0x0000001000ull; static const uptr kLoAppMemEnd = 0x0100000000ull; static const uptr kShadowBeg = 0x0800000000ull; @@ -196,8 +186,9 @@ struct Mapping39 { static const uptr kHeapMemEnd = 0x7d00000000ull; static const uptr kHiAppMemBeg = 0x7e00000000ull; static const uptr kHiAppMemEnd = 0x7fffffffffull; - static const uptr kAppMemMsk = 0x7800000000ull; - static const uptr kAppMemXor = 0x0200000000ull; + static const uptr kShadowMsk = 0x7800000000ull; + static const uptr kShadowXor = 0x0200000000ull; + static const uptr kShadowAdd = 0x0000000000ull; static const uptr kVdsoBeg = 0x7f00000000ull; }; @@ -216,7 +207,8 @@ C/C++ on linux/aarch64 (42-bit VMA) 3e000 0000 00 - 3f000 0000 00: heap 3f000 0000 00 - 3ffff ffff ff: modules and main thread stack */ -struct Mapping42 { +struct MappingAarch64_42 { + static const uptr kBroken = kBrokenReverseMapping; static const uptr kLoAppMemBeg = 0x00000001000ull; static const uptr kLoAppMemEnd = 0x01000000000ull; static const uptr kShadowBeg = 0x10000000000ull; @@ -231,12 +223,13 @@ struct Mapping42 { static const uptr kHeapMemEnd = 0x3f000000000ull; static const uptr kHiAppMemBeg = 0x3f000000000ull; static const uptr kHiAppMemEnd = 0x3ffffffffffull; - static const uptr kAppMemMsk = 0x3c000000000ull; - static const uptr kAppMemXor = 0x04000000000ull; + static const uptr kShadowMsk = 0x3c000000000ull; + static const uptr kShadowXor = 0x04000000000ull; + static const uptr kShadowAdd = 0x00000000000ull; static const uptr kVdsoBeg = 0x37f00000000ull; }; -struct Mapping48 { +struct MappingAarch64_48 { static const uptr kLoAppMemBeg = 0x0000000001000ull; static const uptr kLoAppMemEnd = 0x0000200000000ull; static const uptr kShadowBeg = 0x0002000000000ull; @@ -251,22 +244,12 @@ struct Mapping48 { static const uptr kHeapMemEnd = 0x0ffff00000000ull; static const uptr kHiAppMemBeg = 0x0ffff00000000ull; static const uptr kHiAppMemEnd = 0x1000000000000ull; - static const uptr kAppMemMsk = 0x0fff800000000ull; - static const uptr kAppMemXor = 0x0000800000000ull; + static const uptr kShadowMsk = 0x0fff800000000ull; + static const uptr kShadowXor = 0x0000800000000ull; + static const uptr kShadowAdd = 0x0000000000000ull; static const uptr kVdsoBeg = 0xffff000000000ull; }; -// Indicates the runtime will define the memory regions at runtime. -#define TSAN_RUNTIME_VMA 1 -// Indicates that mapping defines a mid range memory segment. -#define TSAN_MID_APP_RANGE 1 -#elif defined(__powerpc64__) -// PPC64 supports multiple VMA which leads to multiple address transformation -// functions. To support these multiple VMAS transformations and mappings TSAN -// runtime for PPC64 uses an external memory read (vmaSize) to select which -// mapping to use. Although slower, it make a same instrumented binary run on -// multiple kernels. 
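The comments being deleted here (for AArch64 and PPC64 alike) explain the underlying constraint: the virtual address width is only known at run time, so one instrumented binary has to choose among several Mapping layouts via vmaSize. Further down in this header the per-function #if/switch ladders that used to do that are collapsed into a single SelectMapping, which forwards to Func::template Apply<Mapping>(arg) on an operation object. A reduced model of that dispatch shape, with two fake mappings and a fake vmaSize, just to show the pattern:

    #include <cstdio>

    using uptr = unsigned long long;

    // Two fake mappings; only the constants this example touches.
    struct Mapping39 {
      static const uptr kShadowBeg = 0x0800000000ull;
      static const uptr kShadowEnd = 0x2000000000ull;
    };
    struct Mapping48 {
      static const uptr kShadowBeg = 0x200000000000ull;
      static const uptr kShadowEnd = 0x400000000000ull;
    };

    int vmaSize = 48;  // probed at startup in the real runtime

    // One switch on vmaSize instead of one per operation.
    template <typename Func, typename Arg>
    auto SelectMapping(Arg arg) {
      switch (vmaSize) {
        case 39: return Func::template Apply<Mapping39>(arg);
        default: return Func::template Apply<Mapping48>(arg);
      }
    }

    // An "operation" is a struct with a static Apply<Mapping>() member.
    struct IsShadowMemImpl {
      template <typename Mapping>
      static bool Apply(uptr mem) {
        return mem >= Mapping::kShadowBeg && mem <= Mapping::kShadowEnd;
      }
    };

    int main() {
      uptr p = 0x300000000000ull;
      std::printf("IsShadowMem(0x%llx) = %d\n", p,
                  (int)SelectMapping<IsShadowMemImpl>(p));
      return 0;
    }

This is also why every Mapping struct in the new code, including the Go ones, defines the full set of constants even when a range is empty: Func::Apply must instantiate against any of them.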
- /* C/C++ on linux/powerpc64 (44-bit VMA) 0000 0000 0100 - 0001 0000 0000: main binary @@ -281,7 +264,9 @@ C/C++ on linux/powerpc64 (44-bit VMA) 0f50 0000 0000 - 0f60 0000 0000: - 0f60 0000 0000 - 1000 0000 0000: modules and main thread stack */ -struct Mapping44 { +struct MappingPPC64_44 { + static const uptr kBroken = + kBrokenMapping | kBrokenReverseMapping | kBrokenLinearity; static const uptr kMetaShadowBeg = 0x0b0000000000ull; static const uptr kMetaShadowEnd = 0x0d0000000000ull; static const uptr kTraceMemBeg = 0x0d0000000000ull; @@ -294,9 +279,12 @@ struct Mapping44 { static const uptr kHeapMemEnd = 0x0f5000000000ull; static const uptr kHiAppMemBeg = 0x0f6000000000ull; static const uptr kHiAppMemEnd = 0x100000000000ull; // 44 bits - static const uptr kAppMemMsk = 0x0f0000000000ull; - static const uptr kAppMemXor = 0x002100000000ull; + static const uptr kShadowMsk = 0x0f0000000000ull; + static const uptr kShadowXor = 0x002100000000ull; + static const uptr kShadowAdd = 0x000000000000ull; static const uptr kVdsoBeg = 0x3c0000000000000ull; + static const uptr kMidAppMemBeg = 0; + static const uptr kMidAppMemEnd = 0; }; /* @@ -313,7 +301,7 @@ C/C++ on linux/powerpc64 (46-bit VMA) 3e00 0000 0000 - 3e80 0000 0000: - 3e80 0000 0000 - 4000 0000 0000: modules and main thread stack */ -struct Mapping46 { +struct MappingPPC64_46 { static const uptr kMetaShadowBeg = 0x100000000000ull; static const uptr kMetaShadowEnd = 0x200000000000ull; static const uptr kTraceMemBeg = 0x200000000000ull; @@ -326,9 +314,12 @@ struct Mapping46 { static const uptr kLoAppMemEnd = 0x010000000000ull; static const uptr kHiAppMemBeg = 0x3e8000000000ull; static const uptr kHiAppMemEnd = 0x400000000000ull; // 46 bits - static const uptr kAppMemMsk = 0x3c0000000000ull; - static const uptr kAppMemXor = 0x020000000000ull; + static const uptr kShadowMsk = 0x3c0000000000ull; + static const uptr kShadowXor = 0x020000000000ull; + static const uptr kShadowAdd = 0x000000000000ull; static const uptr kVdsoBeg = 0x7800000000000000ull; + static const uptr kMidAppMemBeg = 0; + static const uptr kMidAppMemEnd = 0; }; /* @@ -345,7 +336,7 @@ C/C++ on linux/powerpc64 (47-bit VMA) 7e00 0000 0000 - 7e80 0000 0000: - 7e80 0000 0000 - 8000 0000 0000: modules and main thread stack */ -struct Mapping47 { +struct MappingPPC64_47 { static const uptr kMetaShadowBeg = 0x100000000000ull; static const uptr kMetaShadowEnd = 0x200000000000ull; static const uptr kTraceMemBeg = 0x200000000000ull; @@ -358,14 +349,14 @@ struct Mapping47 { static const uptr kLoAppMemEnd = 0x010000000000ull; static const uptr kHiAppMemBeg = 0x7e8000000000ull; static const uptr kHiAppMemEnd = 0x800000000000ull; // 47 bits - static const uptr kAppMemMsk = 0x7c0000000000ull; - static const uptr kAppMemXor = 0x020000000000ull; + static const uptr kShadowMsk = 0x7c0000000000ull; + static const uptr kShadowXor = 0x020000000000ull; + static const uptr kShadowAdd = 0x000000000000ull; static const uptr kVdsoBeg = 0x7800000000000000ull; + static const uptr kMidAppMemBeg = 0; + static const uptr kMidAppMemEnd = 0; }; -// Indicates the runtime will define the memory regions at runtime. 
-#define TSAN_RUNTIME_VMA 1 -#elif defined(__s390x__) /* C/C++ on linux/s390x While the kernel provides a 64-bit address space, we have to restrict ourselves @@ -380,7 +371,7 @@ a000 0000 0000 - b000 0000 0000: traces - 16TiB (max history * 128k threads) b000 0000 0000 - be00 0000 0000: - be00 0000 0000 - c000 0000 0000: heap - 2TiB (max supported by the allocator) */ -struct Mapping { +struct MappingS390x { static const uptr kMetaShadowBeg = 0x900000000000ull; static const uptr kMetaShadowEnd = 0x980000000000ull; static const uptr kTraceMemBeg = 0xa00000000000ull; @@ -393,13 +384,13 @@ struct Mapping { static const uptr kLoAppMemEnd = 0x0e0000000000ull; static const uptr kHiAppMemBeg = 0xc00000004000ull; static const uptr kHiAppMemEnd = 0xc00000004000ull; - static const uptr kAppMemMsk = 0xb00000000000ull; - static const uptr kAppMemXor = 0x100000000000ull; + static const uptr kShadowMsk = 0xb00000000000ull; + static const uptr kShadowXor = 0x100000000000ull; + static const uptr kShadowAdd = 0x000000000000ull; static const uptr kVdsoBeg = 0xfffffffff000ull; + static const uptr kMidAppMemBeg = 0; + static const uptr kMidAppMemEnd = 0; }; -#endif - -#elif SANITIZER_GO && !SANITIZER_WINDOWS && HAS_48_BIT_ADDRESS_SPACE /* Go on linux, darwin and freebsd on x86_64 0000 0000 1000 - 0000 1000 0000: executable @@ -414,46 +405,59 @@ struct Mapping { 6200 0000 0000 - 8000 0000 0000: - */ -struct Mapping { +struct MappingGo48 { static const uptr kMetaShadowBeg = 0x300000000000ull; static const uptr kMetaShadowEnd = 0x400000000000ull; static const uptr kTraceMemBeg = 0x600000000000ull; static const uptr kTraceMemEnd = 0x620000000000ull; static const uptr kShadowBeg = 0x200000000000ull; static const uptr kShadowEnd = 0x238000000000ull; - static const uptr kAppMemBeg = 0x000000001000ull; - static const uptr kAppMemEnd = 0x00e000000000ull; + static const uptr kLoAppMemBeg = 0x000000001000ull; + static const uptr kLoAppMemEnd = 0x00e000000000ull; + static const uptr kMidAppMemBeg = 0; + static const uptr kMidAppMemEnd = 0; + static const uptr kHiAppMemBeg = 0; + static const uptr kHiAppMemEnd = 0; + static const uptr kHeapMemBeg = 0; + static const uptr kHeapMemEnd = 0; + static const uptr kVdsoBeg = 0; + static const uptr kShadowMsk = 0; + static const uptr kShadowXor = 0; + static const uptr kShadowAdd = 0x200000000000ull; }; -#elif SANITIZER_GO && SANITIZER_WINDOWS - /* Go on windows 0000 0000 1000 - 0000 1000 0000: executable 0000 1000 0000 - 00f8 0000 0000: - 00c0 0000 0000 - 00e0 0000 0000: heap 00e0 0000 0000 - 0100 0000 0000: - 0100 0000 0000 - 0500 0000 0000: shadow -0500 0000 0000 - 0560 0000 0000: - -0560 0000 0000 - 0760 0000 0000: traces -0760 0000 0000 - 07d0 0000 0000: metainfo (memory blocks and sync objects) +0500 0000 0000 - 0700 0000 0000: traces +0700 0000 0000 - 0770 0000 0000: metainfo (memory blocks and sync objects) 07d0 0000 0000 - 8000 0000 0000: - */ -struct Mapping { - static const uptr kMetaShadowBeg = 0x076000000000ull; - static const uptr kMetaShadowEnd = 0x07d000000000ull; - static const uptr kTraceMemBeg = 0x056000000000ull; - static const uptr kTraceMemEnd = 0x076000000000ull; +struct MappingGoWindows { + static const uptr kMetaShadowBeg = 0x070000000000ull; + static const uptr kMetaShadowEnd = 0x077000000000ull; + static const uptr kTraceMemBeg = 0x050000000000ull; + static const uptr kTraceMemEnd = 0x070000000000ull; static const uptr kShadowBeg = 0x010000000000ull; static const uptr kShadowEnd = 0x050000000000ull; - static const uptr kAppMemBeg = 0x000000001000ull; - 
static const uptr kAppMemEnd = 0x00e000000000ull; + static const uptr kLoAppMemBeg = 0x000000001000ull; + static const uptr kLoAppMemEnd = 0x00e000000000ull; + static const uptr kMidAppMemBeg = 0; + static const uptr kMidAppMemEnd = 0; + static const uptr kHiAppMemBeg = 0; + static const uptr kHiAppMemEnd = 0; + static const uptr kHeapMemBeg = 0; + static const uptr kHeapMemEnd = 0; + static const uptr kVdsoBeg = 0; + static const uptr kShadowMsk = 0; + static const uptr kShadowXor = 0; + static const uptr kShadowAdd = 0x010000000000ull; }; -#elif SANITIZER_GO && defined(__powerpc64__) - -/* Only Mapping46 and Mapping47 are currently supported for powercp64 on Go. */ - /* Go on linux/powerpc64 (46-bit VMA) 0000 0000 1000 - 0000 1000 0000: executable 0000 1000 0000 - 00c0 0000 0000: - @@ -467,15 +471,25 @@ struct Mapping { 3800 0000 0000 - 4000 0000 0000: - */ -struct Mapping46 { +struct MappingGoPPC64_46 { static const uptr kMetaShadowBeg = 0x240000000000ull; static const uptr kMetaShadowEnd = 0x340000000000ull; static const uptr kTraceMemBeg = 0x360000000000ull; static const uptr kTraceMemEnd = 0x380000000000ull; static const uptr kShadowBeg = 0x200000000000ull; static const uptr kShadowEnd = 0x238000000000ull; - static const uptr kAppMemBeg = 0x000000001000ull; - static const uptr kAppMemEnd = 0x00e000000000ull; + static const uptr kLoAppMemBeg = 0x000000001000ull; + static const uptr kLoAppMemEnd = 0x00e000000000ull; + static const uptr kMidAppMemBeg = 0; + static const uptr kMidAppMemEnd = 0; + static const uptr kHiAppMemBeg = 0; + static const uptr kHiAppMemEnd = 0; + static const uptr kHeapMemBeg = 0; + static const uptr kHeapMemEnd = 0; + static const uptr kVdsoBeg = 0; + static const uptr kShadowMsk = 0; + static const uptr kShadowXor = 0; + static const uptr kShadowAdd = 0x200000000000ull; }; /* Go on linux/powerpc64 (47-bit VMA) @@ -491,21 +505,27 @@ struct Mapping46 { 6200 0000 0000 - 8000 0000 0000: - */ -struct Mapping47 { +struct MappingGoPPC64_47 { static const uptr kMetaShadowBeg = 0x300000000000ull; static const uptr kMetaShadowEnd = 0x400000000000ull; static const uptr kTraceMemBeg = 0x600000000000ull; static const uptr kTraceMemEnd = 0x620000000000ull; static const uptr kShadowBeg = 0x200000000000ull; static const uptr kShadowEnd = 0x300000000000ull; - static const uptr kAppMemBeg = 0x000000001000ull; - static const uptr kAppMemEnd = 0x00e000000000ull; + static const uptr kLoAppMemBeg = 0x000000001000ull; + static const uptr kLoAppMemEnd = 0x00e000000000ull; + static const uptr kMidAppMemBeg = 0; + static const uptr kMidAppMemEnd = 0; + static const uptr kHiAppMemBeg = 0; + static const uptr kHiAppMemEnd = 0; + static const uptr kHeapMemBeg = 0; + static const uptr kHeapMemEnd = 0; + static const uptr kVdsoBeg = 0; + static const uptr kShadowMsk = 0; + static const uptr kShadowXor = 0; + static const uptr kShadowAdd = 0x200000000000ull; }; -#define TSAN_RUNTIME_VMA 1 - -#elif SANITIZER_GO && defined(__aarch64__) - /* Go on linux/aarch64 (48-bit VMA) and darwin/aarch64 (47-bit VMA) 0000 0000 1000 - 0000 1000 0000: executable 0000 1000 0000 - 00c0 0000 0000: - @@ -518,22 +538,27 @@ struct Mapping47 { 6000 0000 0000 - 6200 0000 0000: traces 6200 0000 0000 - 8000 0000 0000: - */ - -struct Mapping { +struct MappingGoAarch64 { static const uptr kMetaShadowBeg = 0x300000000000ull; static const uptr kMetaShadowEnd = 0x400000000000ull; static const uptr kTraceMemBeg = 0x600000000000ull; static const uptr kTraceMemEnd = 0x620000000000ull; static const uptr kShadowBeg = 
0x200000000000ull; static const uptr kShadowEnd = 0x300000000000ull; - static const uptr kAppMemBeg = 0x000000001000ull; - static const uptr kAppMemEnd = 0x00e000000000ull; + static const uptr kLoAppMemBeg = 0x000000001000ull; + static const uptr kLoAppMemEnd = 0x00e000000000ull; + static const uptr kMidAppMemBeg = 0; + static const uptr kMidAppMemEnd = 0; + static const uptr kHiAppMemBeg = 0; + static const uptr kHiAppMemEnd = 0; + static const uptr kHeapMemBeg = 0; + static const uptr kHeapMemEnd = 0; + static const uptr kVdsoBeg = 0; + static const uptr kShadowMsk = 0; + static const uptr kShadowXor = 0; + static const uptr kShadowAdd = 0x200000000000ull; }; -// Indicates the runtime will define the memory regions at runtime. -#define TSAN_RUNTIME_VMA 1 - -#elif SANITIZER_GO && defined(__mips64) /* Go on linux/mips64 (47-bit VMA) 0000 0000 1000 - 0000 1000 0000: executable @@ -547,20 +572,27 @@ Go on linux/mips64 (47-bit VMA) 6000 0000 0000 - 6200 0000 0000: traces 6200 0000 0000 - 8000 0000 0000: - */ -struct Mapping47 { +struct MappingGoMips64_47 { static const uptr kMetaShadowBeg = 0x300000000000ull; static const uptr kMetaShadowEnd = 0x400000000000ull; static const uptr kTraceMemBeg = 0x600000000000ull; static const uptr kTraceMemEnd = 0x620000000000ull; static const uptr kShadowBeg = 0x200000000000ull; static const uptr kShadowEnd = 0x300000000000ull; - static const uptr kAppMemBeg = 0x000000001000ull; - static const uptr kAppMemEnd = 0x00e000000000ull; + static const uptr kLoAppMemBeg = 0x000000001000ull; + static const uptr kLoAppMemEnd = 0x00e000000000ull; + static const uptr kMidAppMemBeg = 0; + static const uptr kMidAppMemEnd = 0; + static const uptr kHiAppMemBeg = 0; + static const uptr kHiAppMemEnd = 0; + static const uptr kHeapMemBeg = 0; + static const uptr kHeapMemEnd = 0; + static const uptr kVdsoBeg = 0; + static const uptr kShadowMsk = 0; + static const uptr kShadowXor = 0; + static const uptr kShadowAdd = 0x200000000000ull; }; -#define TSAN_RUNTIME_VMA 1 - -#elif SANITIZER_GO && defined(__s390x__) /* Go on linux/s390x 0000 0000 1000 - 1000 0000 0000: executable and heap - 16 TiB @@ -571,622 +603,367 @@ Go on linux/s390x 9800 0000 0000 - a000 0000 0000: - a000 0000 0000 - b000 0000 0000: traces - 16TiB (max history * 128k threads) */ -struct Mapping { +struct MappingGoS390x { static const uptr kMetaShadowBeg = 0x900000000000ull; static const uptr kMetaShadowEnd = 0x980000000000ull; static const uptr kTraceMemBeg = 0xa00000000000ull; static const uptr kTraceMemEnd = 0xb00000000000ull; static const uptr kShadowBeg = 0x400000000000ull; static const uptr kShadowEnd = 0x800000000000ull; - static const uptr kAppMemBeg = 0x000000001000ull; - static const uptr kAppMemEnd = 0x100000000000ull; + static const uptr kLoAppMemBeg = 0x000000001000ull; + static const uptr kLoAppMemEnd = 0x100000000000ull; + static const uptr kMidAppMemBeg = 0; + static const uptr kMidAppMemEnd = 0; + static const uptr kHiAppMemBeg = 0; + static const uptr kHiAppMemEnd = 0; + static const uptr kHeapMemBeg = 0; + static const uptr kHeapMemEnd = 0; + static const uptr kVdsoBeg = 0; + static const uptr kShadowMsk = 0; + static const uptr kShadowXor = 0; + static const uptr kShadowAdd = 0x400000000000ull; }; -#else -# error "Unknown platform" -#endif - - -#ifdef TSAN_RUNTIME_VMA extern uptr vmaSize; -#endif - - -enum MappingType { - MAPPING_LO_APP_BEG, - MAPPING_LO_APP_END, - MAPPING_HI_APP_BEG, - MAPPING_HI_APP_END, -#ifdef TSAN_MID_APP_RANGE - MAPPING_MID_APP_BEG, - MAPPING_MID_APP_END, -#endif - 
MAPPING_HEAP_BEG, - MAPPING_HEAP_END, - MAPPING_APP_BEG, - MAPPING_APP_END, - MAPPING_SHADOW_BEG, - MAPPING_SHADOW_END, - MAPPING_META_SHADOW_BEG, - MAPPING_META_SHADOW_END, - MAPPING_TRACE_BEG, - MAPPING_TRACE_END, - MAPPING_VDSO_BEG, -}; - -template<typename Mapping, int Type> -uptr MappingImpl(void) { - switch (Type) { -#if !SANITIZER_GO - case MAPPING_LO_APP_BEG: return Mapping::kLoAppMemBeg; - case MAPPING_LO_APP_END: return Mapping::kLoAppMemEnd; -# ifdef TSAN_MID_APP_RANGE - case MAPPING_MID_APP_BEG: return Mapping::kMidAppMemBeg; - case MAPPING_MID_APP_END: return Mapping::kMidAppMemEnd; -# endif - case MAPPING_HI_APP_BEG: return Mapping::kHiAppMemBeg; - case MAPPING_HI_APP_END: return Mapping::kHiAppMemEnd; - case MAPPING_HEAP_BEG: return Mapping::kHeapMemBeg; - case MAPPING_HEAP_END: return Mapping::kHeapMemEnd; - case MAPPING_VDSO_BEG: return Mapping::kVdsoBeg; -#else - case MAPPING_APP_BEG: return Mapping::kAppMemBeg; - case MAPPING_APP_END: return Mapping::kAppMemEnd; -#endif - case MAPPING_SHADOW_BEG: return Mapping::kShadowBeg; - case MAPPING_SHADOW_END: return Mapping::kShadowEnd; - case MAPPING_META_SHADOW_BEG: return Mapping::kMetaShadowBeg; - case MAPPING_META_SHADOW_END: return Mapping::kMetaShadowEnd; - case MAPPING_TRACE_BEG: return Mapping::kTraceMemBeg; - case MAPPING_TRACE_END: return Mapping::kTraceMemEnd; - } -} -template<int Type> -uptr MappingArchImpl(void) { -#if defined(__aarch64__) && !defined(__APPLE__) && !SANITIZER_GO +template <typename Func, typename Arg> +ALWAYS_INLINE auto SelectMapping(Arg arg) { +#if SANITIZER_GO +# if defined(__powerpc64__) switch (vmaSize) { - case 39: return MappingImpl<Mapping39, Type>(); - case 42: return MappingImpl<Mapping42, Type>(); - case 48: return MappingImpl<Mapping48, Type>(); + case 46: + return Func::template Apply<MappingGoPPC64_46>(arg); + case 47: + return Func::template Apply<MappingGoPPC64_47>(arg); } - DCHECK(0); - return 0; -#elif defined(__powerpc64__) +# elif defined(__mips64) + return Func::template Apply<MappingGoMips64_47>(arg); +# elif defined(__s390x__) + return Func::template Apply<MappingGoS390x>(arg); +# elif defined(__aarch64__) + return Func::template Apply<MappingGoAarch64>(arg); +# elif SANITIZER_WINDOWS + return Func::template Apply<MappingGoWindows>(arg); +# else + return Func::template Apply<MappingGo48>(arg); +# endif +#else // SANITIZER_GO +# if defined(__x86_64__) || SANITIZER_IOSSIM || SANITIZER_MAC && !SANITIZER_IOS + return Func::template Apply<Mapping48AddressSpace>(arg); +# elif defined(__aarch64__) && defined(__APPLE__) + return Func::template Apply<MappingAppleAarch64>(arg); +# elif defined(__aarch64__) && !defined(__APPLE__) switch (vmaSize) { -#if !SANITIZER_GO - case 44: return MappingImpl<Mapping44, Type>(); -#endif - case 46: return MappingImpl<Mapping46, Type>(); - case 47: return MappingImpl<Mapping47, Type>(); + case 39: + return Func::template Apply<MappingAarch64_39>(arg); + case 42: + return Func::template Apply<MappingAarch64_42>(arg); + case 48: + return Func::template Apply<MappingAarch64_48>(arg); } - DCHECK(0); - return 0; -#elif defined(__mips64) +# elif defined(__powerpc64__) switch (vmaSize) { -#if !SANITIZER_GO - case 40: return MappingImpl<Mapping40, Type>(); -#else - case 47: return MappingImpl<Mapping47, Type>(); -#endif + case 44: + return Func::template Apply<MappingPPC64_44>(arg); + case 46: + return Func::template Apply<MappingPPC64_46>(arg); + case 47: + return Func::template Apply<MappingPPC64_47>(arg); } - DCHECK(0); - return 0; -#else - return 
MappingImpl<Mapping, Type>(); -#endif +# elif defined(__mips64) + return Func::template Apply<MappingMips64_40>(arg); +# elif defined(__s390x__) + return Func::template Apply<MappingS390x>(arg); +# else +# error "unsupported platform" +# endif +#endif + Die(); +} + +template <typename Func> +void ForEachMapping() { + Func::template Apply<Mapping48AddressSpace>(); + Func::template Apply<MappingMips64_40>(); + Func::template Apply<MappingAppleAarch64>(); + Func::template Apply<MappingAarch64_39>(); + Func::template Apply<MappingAarch64_42>(); + Func::template Apply<MappingAarch64_48>(); + Func::template Apply<MappingPPC64_44>(); + Func::template Apply<MappingPPC64_46>(); + Func::template Apply<MappingPPC64_47>(); + Func::template Apply<MappingS390x>(); + Func::template Apply<MappingGo48>(); + Func::template Apply<MappingGoWindows>(); + Func::template Apply<MappingGoPPC64_46>(); + Func::template Apply<MappingGoPPC64_47>(); + Func::template Apply<MappingGoAarch64>(); + Func::template Apply<MappingGoMips64_47>(); + Func::template Apply<MappingGoS390x>(); } -#if !SANITIZER_GO -ALWAYS_INLINE -uptr LoAppMemBeg(void) { - return MappingArchImpl<MAPPING_LO_APP_BEG>(); -} -ALWAYS_INLINE -uptr LoAppMemEnd(void) { - return MappingArchImpl<MAPPING_LO_APP_END>(); -} +enum MappingType { + kLoAppMemBeg, + kLoAppMemEnd, + kHiAppMemBeg, + kHiAppMemEnd, + kMidAppMemBeg, + kMidAppMemEnd, + kHeapMemBeg, + kHeapMemEnd, + kShadowBeg, + kShadowEnd, + kMetaShadowBeg, + kMetaShadowEnd, + kTraceMemBeg, + kTraceMemEnd, + kVdsoBeg, +}; -#ifdef TSAN_MID_APP_RANGE -ALWAYS_INLINE -uptr MidAppMemBeg(void) { - return MappingArchImpl<MAPPING_MID_APP_BEG>(); -} -ALWAYS_INLINE -uptr MidAppMemEnd(void) { - return MappingArchImpl<MAPPING_MID_APP_END>(); -} -#endif +struct MappingField { + template <typename Mapping> + static uptr Apply(MappingType type) { + switch (type) { + case kLoAppMemBeg: + return Mapping::kLoAppMemBeg; + case kLoAppMemEnd: + return Mapping::kLoAppMemEnd; + case kMidAppMemBeg: + return Mapping::kMidAppMemBeg; + case kMidAppMemEnd: + return Mapping::kMidAppMemEnd; + case kHiAppMemBeg: + return Mapping::kHiAppMemBeg; + case kHiAppMemEnd: + return Mapping::kHiAppMemEnd; + case kHeapMemBeg: + return Mapping::kHeapMemBeg; + case kHeapMemEnd: + return Mapping::kHeapMemEnd; + case kVdsoBeg: + return Mapping::kVdsoBeg; + case kShadowBeg: + return Mapping::kShadowBeg; + case kShadowEnd: + return Mapping::kShadowEnd; + case kMetaShadowBeg: + return Mapping::kMetaShadowBeg; + case kMetaShadowEnd: + return Mapping::kMetaShadowEnd; + case kTraceMemBeg: + return Mapping::kTraceMemBeg; + case kTraceMemEnd: + return Mapping::kTraceMemEnd; + } + Die(); + } +}; ALWAYS_INLINE -uptr HeapMemBeg(void) { - return MappingArchImpl<MAPPING_HEAP_BEG>(); -} +uptr LoAppMemBeg(void) { return SelectMapping<MappingField>(kLoAppMemBeg); } ALWAYS_INLINE -uptr HeapMemEnd(void) { - return MappingArchImpl<MAPPING_HEAP_END>(); -} +uptr LoAppMemEnd(void) { return SelectMapping<MappingField>(kLoAppMemEnd); } ALWAYS_INLINE -uptr HiAppMemBeg(void) { - return MappingArchImpl<MAPPING_HI_APP_BEG>(); -} +uptr MidAppMemBeg(void) { return SelectMapping<MappingField>(kMidAppMemBeg); } ALWAYS_INLINE -uptr HiAppMemEnd(void) { - return MappingArchImpl<MAPPING_HI_APP_END>(); -} +uptr MidAppMemEnd(void) { return SelectMapping<MappingField>(kMidAppMemEnd); } ALWAYS_INLINE -uptr VdsoBeg(void) { - return MappingArchImpl<MAPPING_VDSO_BEG>(); -} - -#else +uptr HeapMemBeg(void) { return SelectMapping<MappingField>(kHeapMemBeg); } +ALWAYS_INLINE +uptr HeapMemEnd(void) 
{ return SelectMapping<MappingField>(kHeapMemEnd); } ALWAYS_INLINE -uptr AppMemBeg(void) { - return MappingArchImpl<MAPPING_APP_BEG>(); -} +uptr HiAppMemBeg(void) { return SelectMapping<MappingField>(kHiAppMemBeg); } ALWAYS_INLINE -uptr AppMemEnd(void) { - return MappingArchImpl<MAPPING_APP_END>(); -} - -#endif +uptr HiAppMemEnd(void) { return SelectMapping<MappingField>(kHiAppMemEnd); } -static inline -bool GetUserRegion(int i, uptr *start, uptr *end) { - switch (i) { - default: - return false; -#if !SANITIZER_GO - case 0: - *start = LoAppMemBeg(); - *end = LoAppMemEnd(); - return true; - case 1: - *start = HiAppMemBeg(); - *end = HiAppMemEnd(); - return true; - case 2: - *start = HeapMemBeg(); - *end = HeapMemEnd(); - return true; -# ifdef TSAN_MID_APP_RANGE - case 3: - *start = MidAppMemBeg(); - *end = MidAppMemEnd(); - return true; -# endif -#else - case 0: - *start = AppMemBeg(); - *end = AppMemEnd(); - return true; -#endif - } -} +ALWAYS_INLINE +uptr VdsoBeg(void) { return SelectMapping<MappingField>(kVdsoBeg); } ALWAYS_INLINE -uptr ShadowBeg(void) { - return MappingArchImpl<MAPPING_SHADOW_BEG>(); -} +uptr ShadowBeg(void) { return SelectMapping<MappingField>(kShadowBeg); } ALWAYS_INLINE -uptr ShadowEnd(void) { - return MappingArchImpl<MAPPING_SHADOW_END>(); -} +uptr ShadowEnd(void) { return SelectMapping<MappingField>(kShadowEnd); } ALWAYS_INLINE -uptr MetaShadowBeg(void) { - return MappingArchImpl<MAPPING_META_SHADOW_BEG>(); -} +uptr MetaShadowBeg(void) { return SelectMapping<MappingField>(kMetaShadowBeg); } ALWAYS_INLINE -uptr MetaShadowEnd(void) { - return MappingArchImpl<MAPPING_META_SHADOW_END>(); -} +uptr MetaShadowEnd(void) { return SelectMapping<MappingField>(kMetaShadowEnd); } ALWAYS_INLINE -uptr TraceMemBeg(void) { - return MappingArchImpl<MAPPING_TRACE_BEG>(); -} +uptr TraceMemBeg(void) { return SelectMapping<MappingField>(kTraceMemBeg); } ALWAYS_INLINE -uptr TraceMemEnd(void) { - return MappingArchImpl<MAPPING_TRACE_END>(); -} - +uptr TraceMemEnd(void) { return SelectMapping<MappingField>(kTraceMemEnd); } -template<typename Mapping> -bool IsAppMemImpl(uptr mem) { -#if !SANITIZER_GO +struct IsAppMemImpl { + template <typename Mapping> + static bool Apply(uptr mem) { return (mem >= Mapping::kHeapMemBeg && mem < Mapping::kHeapMemEnd) || -# ifdef TSAN_MID_APP_RANGE (mem >= Mapping::kMidAppMemBeg && mem < Mapping::kMidAppMemEnd) || -# endif (mem >= Mapping::kLoAppMemBeg && mem < Mapping::kLoAppMemEnd) || (mem >= Mapping::kHiAppMemBeg && mem < Mapping::kHiAppMemEnd); -#else - return mem >= Mapping::kAppMemBeg && mem < Mapping::kAppMemEnd; -#endif -} - -ALWAYS_INLINE -bool IsAppMem(uptr mem) { -#if defined(__aarch64__) && !defined(__APPLE__) && !SANITIZER_GO - switch (vmaSize) { - case 39: return IsAppMemImpl<Mapping39>(mem); - case 42: return IsAppMemImpl<Mapping42>(mem); - case 48: return IsAppMemImpl<Mapping48>(mem); - } - DCHECK(0); - return false; -#elif defined(__powerpc64__) - switch (vmaSize) { -#if !SANITIZER_GO - case 44: return IsAppMemImpl<Mapping44>(mem); -#endif - case 46: return IsAppMemImpl<Mapping46>(mem); - case 47: return IsAppMemImpl<Mapping47>(mem); } - DCHECK(0); - return false; -#elif defined(__mips64) - switch (vmaSize) { -#if !SANITIZER_GO - case 40: return IsAppMemImpl<Mapping40>(mem); -#else - case 47: return IsAppMemImpl<Mapping47>(mem); -#endif - } - DCHECK(0); - return false; -#else - return IsAppMemImpl<Mapping>(mem); -#endif -} +}; +ALWAYS_INLINE +bool IsAppMem(uptr mem) { return SelectMapping<IsAppMemImpl>(mem); } -template<typename Mapping> 
-bool IsShadowMemImpl(uptr mem) { - return mem >= Mapping::kShadowBeg && mem <= Mapping::kShadowEnd; -} +struct IsShadowMemImpl { + template <typename Mapping> + static bool Apply(uptr mem) { + return mem >= Mapping::kShadowBeg && mem <= Mapping::kShadowEnd; + } +}; ALWAYS_INLINE -bool IsShadowMem(uptr mem) { -#if defined(__aarch64__) && !defined(__APPLE__) && !SANITIZER_GO - switch (vmaSize) { - case 39: return IsShadowMemImpl<Mapping39>(mem); - case 42: return IsShadowMemImpl<Mapping42>(mem); - case 48: return IsShadowMemImpl<Mapping48>(mem); - } - DCHECK(0); - return false; -#elif defined(__powerpc64__) - switch (vmaSize) { -#if !SANITIZER_GO - case 44: return IsShadowMemImpl<Mapping44>(mem); -#endif - case 46: return IsShadowMemImpl<Mapping46>(mem); - case 47: return IsShadowMemImpl<Mapping47>(mem); - } - DCHECK(0); - return false; -#elif defined(__mips64) - switch (vmaSize) { -#if !SANITIZER_GO - case 40: return IsShadowMemImpl<Mapping40>(mem); -#else - case 47: return IsShadowMemImpl<Mapping47>(mem); -#endif - } - DCHECK(0); - return false; -#else - return IsShadowMemImpl<Mapping>(mem); -#endif +bool IsShadowMem(RawShadow *p) { + return SelectMapping<IsShadowMemImpl>(reinterpret_cast<uptr>(p)); } - -template<typename Mapping> -bool IsMetaMemImpl(uptr mem) { - return mem >= Mapping::kMetaShadowBeg && mem <= Mapping::kMetaShadowEnd; -} +struct IsMetaMemImpl { + template <typename Mapping> + static bool Apply(uptr mem) { + return mem >= Mapping::kMetaShadowBeg && mem <= Mapping::kMetaShadowEnd; + } +}; ALWAYS_INLINE -bool IsMetaMem(uptr mem) { -#if defined(__aarch64__) && !defined(__APPLE__) && !SANITIZER_GO - switch (vmaSize) { - case 39: return IsMetaMemImpl<Mapping39>(mem); - case 42: return IsMetaMemImpl<Mapping42>(mem); - case 48: return IsMetaMemImpl<Mapping48>(mem); - } - DCHECK(0); - return false; -#elif defined(__powerpc64__) - switch (vmaSize) { -#if !SANITIZER_GO - case 44: return IsMetaMemImpl<Mapping44>(mem); -#endif - case 46: return IsMetaMemImpl<Mapping46>(mem); - case 47: return IsMetaMemImpl<Mapping47>(mem); +bool IsMetaMem(const u32 *p) { + return SelectMapping<IsMetaMemImpl>(reinterpret_cast<uptr>(p)); +} + +struct MemToShadowImpl { + template <typename Mapping> + static uptr Apply(uptr x) { + DCHECK(IsAppMemImpl::Apply<Mapping>(x)); + return (((x) & ~(Mapping::kShadowMsk | (kShadowCell - 1))) ^ + Mapping::kShadowXor) * + kShadowMultiplier + + Mapping::kShadowAdd; } - DCHECK(0); - return false; -#elif defined(__mips64) - switch (vmaSize) { -#if !SANITIZER_GO - case 40: return IsMetaMemImpl<Mapping40>(mem); -#else - case 47: return IsMetaMemImpl<Mapping47>(mem); -#endif - } - DCHECK(0); - return false; -#else - return IsMetaMemImpl<Mapping>(mem); -#endif -} - - -template<typename Mapping> -uptr MemToShadowImpl(uptr x) { - DCHECK(IsAppMem(x)); -#if !SANITIZER_GO - return (((x) & ~(Mapping::kAppMemMsk | (kShadowCell - 1))) - ^ Mapping::kAppMemXor) * kShadowCnt; -#else -# ifndef SANITIZER_WINDOWS - return ((x & ~(kShadowCell - 1)) * kShadowCnt) | Mapping::kShadowBeg; -# else - return ((x & ~(kShadowCell - 1)) * kShadowCnt) + Mapping::kShadowBeg; -# endif -#endif -} +}; ALWAYS_INLINE -uptr MemToShadow(uptr x) { -#if defined(__aarch64__) && !defined(__APPLE__) && !SANITIZER_GO - switch (vmaSize) { - case 39: return MemToShadowImpl<Mapping39>(x); - case 42: return MemToShadowImpl<Mapping42>(x); - case 48: return MemToShadowImpl<Mapping48>(x); - } - DCHECK(0); - return 0; -#elif defined(__powerpc64__) - switch (vmaSize) { -#if !SANITIZER_GO - case 44: return 
MemToShadowImpl<Mapping44>(x); -#endif - case 46: return MemToShadowImpl<Mapping46>(x); - case 47: return MemToShadowImpl<Mapping47>(x); - } - DCHECK(0); - return 0; -#elif defined(__mips64) - switch (vmaSize) { -#if !SANITIZER_GO - case 40: return MemToShadowImpl<Mapping40>(x); -#else - case 47: return MemToShadowImpl<Mapping47>(x); -#endif - } - DCHECK(0); - return 0; -#else - return MemToShadowImpl<Mapping>(x); -#endif +RawShadow *MemToShadow(uptr x) { + return reinterpret_cast<RawShadow *>(SelectMapping<MemToShadowImpl>(x)); } - -template<typename Mapping> -u32 *MemToMetaImpl(uptr x) { - DCHECK(IsAppMem(x)); -#if !SANITIZER_GO - return (u32*)(((((x) & ~(Mapping::kAppMemMsk | (kMetaShadowCell - 1)))) / - kMetaShadowCell * kMetaShadowSize) | Mapping::kMetaShadowBeg); -#else -# ifndef SANITIZER_WINDOWS - return (u32*)(((x & ~(kMetaShadowCell - 1)) / \ - kMetaShadowCell * kMetaShadowSize) | Mapping::kMetaShadowBeg); -# else - return (u32*)(((x & ~(kMetaShadowCell - 1)) / \ - kMetaShadowCell * kMetaShadowSize) + Mapping::kMetaShadowBeg); -# endif -#endif -} +struct MemToMetaImpl { + template <typename Mapping> + static u32 *Apply(uptr x) { + DCHECK(IsAppMemImpl::Apply<Mapping>(x)); + return (u32 *)(((((x) & ~(Mapping::kShadowMsk | (kMetaShadowCell - 1)))) / + kMetaShadowCell * kMetaShadowSize) | + Mapping::kMetaShadowBeg); + } +}; ALWAYS_INLINE -u32 *MemToMeta(uptr x) { -#if defined(__aarch64__) && !defined(__APPLE__) && !SANITIZER_GO - switch (vmaSize) { - case 39: return MemToMetaImpl<Mapping39>(x); - case 42: return MemToMetaImpl<Mapping42>(x); - case 48: return MemToMetaImpl<Mapping48>(x); - } - DCHECK(0); - return 0; -#elif defined(__powerpc64__) - switch (vmaSize) { -#if !SANITIZER_GO - case 44: return MemToMetaImpl<Mapping44>(x); -#endif - case 46: return MemToMetaImpl<Mapping46>(x); - case 47: return MemToMetaImpl<Mapping47>(x); +u32 *MemToMeta(uptr x) { return SelectMapping<MemToMetaImpl>(x); } + +struct ShadowToMemImpl { + template <typename Mapping> + static uptr Apply(uptr sp) { + if (!IsShadowMemImpl::Apply<Mapping>(sp)) + return 0; + // The shadow mapping is non-linear and we've lost some bits, so we don't + // have an easy way to restore the original app address. But the mapping is + // a bijection, so we try to restore the address as belonging to + // low/mid/high range consecutively and see if shadow->app->shadow mapping + // gives us the same address. + uptr p = + ((sp - Mapping::kShadowAdd) / kShadowMultiplier) ^ Mapping::kShadowXor; + if (p >= Mapping::kLoAppMemBeg && p < Mapping::kLoAppMemEnd && + MemToShadowImpl::Apply<Mapping>(p) == sp) + return p; + if (Mapping::kMidAppMemBeg) { + uptr p_mid = p + (Mapping::kMidAppMemBeg & Mapping::kShadowMsk); + if (p_mid >= Mapping::kMidAppMemBeg && p_mid < Mapping::kMidAppMemEnd && + MemToShadowImpl::Apply<Mapping>(p_mid) == sp) + return p_mid; + } + return p | Mapping::kShadowMsk; } - DCHECK(0); - return 0; -#elif defined(__mips64) - switch (vmaSize) { -#if !SANITIZER_GO - case 40: return MemToMetaImpl<Mapping40>(x); -#else - case 47: return MemToMetaImpl<Mapping47>(x); -#endif - } - DCHECK(0); - return 0; -#else - return MemToMetaImpl<Mapping>(x); -#endif -} - - -template<typename Mapping> -uptr ShadowToMemImpl(uptr s) { - DCHECK(IsShadowMem(s)); -#if !SANITIZER_GO - // The shadow mapping is non-linear and we've lost some bits, so we don't have - // an easy way to restore the original app address. 
But the mapping is a - // bijection, so we try to restore the address as belonging to low/mid/high - // range consecutively and see if shadow->app->shadow mapping gives us the - // same address. - uptr p = (s / kShadowCnt) ^ Mapping::kAppMemXor; - if (p >= Mapping::kLoAppMemBeg && p < Mapping::kLoAppMemEnd && - MemToShadow(p) == s) - return p; -# ifdef TSAN_MID_APP_RANGE - p = ((s / kShadowCnt) ^ Mapping::kAppMemXor) + - (Mapping::kMidAppMemBeg & Mapping::kAppMemMsk); - if (p >= Mapping::kMidAppMemBeg && p < Mapping::kMidAppMemEnd && - MemToShadow(p) == s) - return p; -# endif - return ((s / kShadowCnt) ^ Mapping::kAppMemXor) | Mapping::kAppMemMsk; -#else // #if !SANITIZER_GO -# ifndef SANITIZER_WINDOWS - return (s & ~Mapping::kShadowBeg) / kShadowCnt; -# else - return (s - Mapping::kShadowBeg) / kShadowCnt; -# endif // SANITIZER_WINDOWS -#endif -} +}; ALWAYS_INLINE -uptr ShadowToMem(uptr s) { -#if defined(__aarch64__) && !defined(__APPLE__) && !SANITIZER_GO - switch (vmaSize) { - case 39: return ShadowToMemImpl<Mapping39>(s); - case 42: return ShadowToMemImpl<Mapping42>(s); - case 48: return ShadowToMemImpl<Mapping48>(s); - } - DCHECK(0); - return 0; -#elif defined(__powerpc64__) - switch (vmaSize) { -#if !SANITIZER_GO - case 44: return ShadowToMemImpl<Mapping44>(s); -#endif - case 46: return ShadowToMemImpl<Mapping46>(s); - case 47: return ShadowToMemImpl<Mapping47>(s); +uptr ShadowToMem(RawShadow *s) { + return SelectMapping<ShadowToMemImpl>(reinterpret_cast<uptr>(s)); +} + +// Compresses addr to kCompressedAddrBits stored in least significant bits. +ALWAYS_INLINE uptr CompressAddr(uptr addr) { + return addr & ((1ull << kCompressedAddrBits) - 1); +} + +struct RestoreAddrImpl { + typedef uptr Result; + template <typename Mapping> + static Result Apply(uptr addr) { + // To restore the address we go over all app memory ranges and check if top + // 3 bits of the compressed addr match that of the app range. If yes, we + // assume that the compressed address come from that range and restore the + // missing top bits to match the app range address. + static constexpr uptr ranges[] = { + Mapping::kLoAppMemBeg, Mapping::kLoAppMemEnd, Mapping::kMidAppMemBeg, + Mapping::kMidAppMemEnd, Mapping::kHiAppMemBeg, Mapping::kHiAppMemEnd, + Mapping::kHeapMemBeg, Mapping::kHeapMemEnd, + }; + const uptr indicator = 0x0e0000000000ull; + const uptr ind_lsb = 1ull << LeastSignificantSetBitIndex(indicator); + for (uptr i = 0; i < ARRAY_SIZE(ranges); i += 2) { + uptr beg = ranges[i]; + uptr end = ranges[i + 1]; + if (beg == end) + continue; + for (uptr p = beg; p < end; p = RoundDown(p + ind_lsb, ind_lsb)) { + if ((addr & indicator) == (p & indicator)) + return addr | (p & ~(ind_lsb - 1)); + } + } + Printf("ThreadSanitizer: failed to restore address 0x%zx\n", addr); + Die(); } - DCHECK(0); - return 0; -#elif defined(__mips64) - switch (vmaSize) { -#if !SANITIZER_GO - case 40: return ShadowToMemImpl<Mapping40>(s); -#else - case 47: return ShadowToMemImpl<Mapping47>(s); -#endif - } - DCHECK(0); - return 0; -#else - return ShadowToMemImpl<Mapping>(s); -#endif -} - +}; +// Restores compressed addr from kCompressedAddrBits to full representation. +// This is called only during reporting and is not performance-critical. +inline uptr RestoreAddr(uptr addr) { + return SelectMapping<RestoreAddrImpl>(addr); +} // The additional page is to catch shadow stack overflow as paging fault. // Windows wants 64K alignment for mmaps. 
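The hunks above replace the per-architecture #if/switch ladders with one small functor per operation (IsShadowMemImpl, MemToShadowImpl, RestoreAddrImpl, ...) dispatched through SelectMapping, which is defined elsewhere in the platform header and not shown in this diff. A minimal self-contained sketch of the pattern, with made-up mapping constants and a two-case dispatcher standing in for the real one:

    #include <cstdint>
    using uptr = uintptr_t;

    // Hypothetical, simplified mapping descriptors (illustrative constants only).
    struct Mapping39 {
      static constexpr uptr kShadowBeg = 0x010000000000ull;
      static constexpr uptr kShadowEnd = 0x080000000000ull;
    };
    struct Mapping48 {
      static constexpr uptr kShadowBeg = 0x010000000000ull;
      static constexpr uptr kShadowEnd = 0x400000000000ull;
    };

    static int vmaSize = 48;  // detected at startup in the real runtime

    // One struct per operation, with a single templated Apply: the shape used above.
    struct IsShadowMemSketch {
      template <typename Mapping>
      static bool Apply(uptr mem) {
        return mem >= Mapping::kShadowBeg && mem <= Mapping::kShadowEnd;
      }
    };

    // Cut-down stand-in for SelectMapping: picks the Mapping once, at runtime,
    // so each operation no longer needs its own switch over vmaSize.
    template <typename Func>
    bool SelectMappingSketch(uptr arg) {
      switch (vmaSize) {
        case 39: return Func::template Apply<Mapping39>(arg);
        default: return Func::template Apply<Mapping48>(arg);
      }
    }

    bool IsShadowMemSketched(uptr mem) {
      return SelectMappingSketch<IsShadowMemSketch>(mem);
    }

The real dispatcher presumably still distinguishes the Go build and the per-architecture vmaSize values that the removed ladders enumerated; only the call sites become uniform.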
const uptr kTotalTraceSize = (kTraceSize * sizeof(Event) + sizeof(Trace) + (64 << 10) + (64 << 10) - 1) & ~((64 << 10) - 1); -template<typename Mapping> -uptr GetThreadTraceImpl(int tid) { - uptr p = Mapping::kTraceMemBeg + (uptr)tid * kTotalTraceSize; - DCHECK_LT(p, Mapping::kTraceMemEnd); - return p; -} +struct GetThreadTraceImpl { + template <typename Mapping> + static uptr Apply(uptr tid) { + uptr p = Mapping::kTraceMemBeg + tid * kTotalTraceSize; + DCHECK_LT(p, Mapping::kTraceMemEnd); + return p; + } +}; ALWAYS_INLINE -uptr GetThreadTrace(int tid) { -#if defined(__aarch64__) && !defined(__APPLE__) && !SANITIZER_GO - switch (vmaSize) { - case 39: return GetThreadTraceImpl<Mapping39>(tid); - case 42: return GetThreadTraceImpl<Mapping42>(tid); - case 48: return GetThreadTraceImpl<Mapping48>(tid); - } - DCHECK(0); - return 0; -#elif defined(__powerpc64__) - switch (vmaSize) { -#if !SANITIZER_GO - case 44: return GetThreadTraceImpl<Mapping44>(tid); -#endif - case 46: return GetThreadTraceImpl<Mapping46>(tid); - case 47: return GetThreadTraceImpl<Mapping47>(tid); - } - DCHECK(0); - return 0; -#elif defined(__mips64) - switch (vmaSize) { -#if !SANITIZER_GO - case 40: return GetThreadTraceImpl<Mapping40>(tid); -#else - case 47: return GetThreadTraceImpl<Mapping47>(tid); -#endif +uptr GetThreadTrace(int tid) { return SelectMapping<GetThreadTraceImpl>(tid); } + +struct GetThreadTraceHeaderImpl { + template <typename Mapping> + static uptr Apply(uptr tid) { + uptr p = Mapping::kTraceMemBeg + tid * kTotalTraceSize + + kTraceSize * sizeof(Event); + DCHECK_LT(p, Mapping::kTraceMemEnd); + return p; } - DCHECK(0); - return 0; -#else - return GetThreadTraceImpl<Mapping>(tid); -#endif -} - - -template<typename Mapping> -uptr GetThreadTraceHeaderImpl(int tid) { - uptr p = Mapping::kTraceMemBeg + (uptr)tid * kTotalTraceSize - + kTraceSize * sizeof(Event); - DCHECK_LT(p, Mapping::kTraceMemEnd); - return p; -} +}; ALWAYS_INLINE uptr GetThreadTraceHeader(int tid) { -#if defined(__aarch64__) && !defined(__APPLE__) && !SANITIZER_GO - switch (vmaSize) { - case 39: return GetThreadTraceHeaderImpl<Mapping39>(tid); - case 42: return GetThreadTraceHeaderImpl<Mapping42>(tid); - case 48: return GetThreadTraceHeaderImpl<Mapping48>(tid); - } - DCHECK(0); - return 0; -#elif defined(__powerpc64__) - switch (vmaSize) { -#if !SANITIZER_GO - case 44: return GetThreadTraceHeaderImpl<Mapping44>(tid); -#endif - case 46: return GetThreadTraceHeaderImpl<Mapping46>(tid); - case 47: return GetThreadTraceHeaderImpl<Mapping47>(tid); - } - DCHECK(0); - return 0; -#elif defined(__mips64) - switch (vmaSize) { -#if !SANITIZER_GO - case 40: return GetThreadTraceHeaderImpl<Mapping40>(tid); -#else - case 47: return GetThreadTraceHeaderImpl<Mapping47>(tid); -#endif - } - DCHECK(0); - return 0; -#else - return GetThreadTraceHeaderImpl<Mapping>(tid); -#endif + return SelectMapping<GetThreadTraceHeaderImpl>(tid); } void InitializePlatform(); @@ -1194,7 +971,7 @@ void InitializePlatformEarly(); void CheckAndProtect(); void InitializeShadowMemoryPlatform(); void FlushShadowMemory(); -void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive); +void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns); int ExtractResolvFDs(void *state, int *fds, int nfd); int ExtractRecvmsgFDs(void *msg, int *fds, int nfd); uptr ExtractLongJmpSp(uptr *env); diff --git a/libsanitizer/tsan/tsan_platform_linux.cpp b/libsanitizer/tsan/tsan_platform_linux.cpp index cfe597e..6134a1b 100644 --- a/libsanitizer/tsan/tsan_platform_linux.cpp +++ 
b/libsanitizer/tsan/tsan_platform_linux.cpp @@ -85,21 +85,19 @@ static void InitializeLongjmpXorKey(); static uptr longjmp_xor_key; #endif -#ifdef TSAN_RUNTIME_VMA // Runtime detected VMA size. uptr vmaSize; -#endif enum { - MemTotal = 0, - MemShadow = 1, - MemMeta = 2, - MemFile = 3, - MemMmap = 4, - MemTrace = 5, - MemHeap = 6, - MemOther = 7, - MemCount = 8, + MemTotal, + MemShadow, + MemMeta, + MemFile, + MemMmap, + MemTrace, + MemHeap, + MemOther, + MemCount, }; void FillProfileCallback(uptr p, uptr rss, bool file, @@ -109,39 +107,47 @@ void FillProfileCallback(uptr p, uptr rss, bool file, mem[MemShadow] += rss; else if (p >= MetaShadowBeg() && p < MetaShadowEnd()) mem[MemMeta] += rss; -#if !SANITIZER_GO + else if ((p >= LoAppMemBeg() && p < LoAppMemEnd()) || + (p >= MidAppMemBeg() && p < MidAppMemEnd()) || + (p >= HiAppMemBeg() && p < HiAppMemEnd())) + mem[file ? MemFile : MemMmap] += rss; else if (p >= HeapMemBeg() && p < HeapMemEnd()) mem[MemHeap] += rss; - else if (p >= LoAppMemBeg() && p < LoAppMemEnd()) - mem[file ? MemFile : MemMmap] += rss; - else if (p >= HiAppMemBeg() && p < HiAppMemEnd()) - mem[file ? MemFile : MemMmap] += rss; -#else - else if (p >= AppMemBeg() && p < AppMemEnd()) - mem[file ? MemFile : MemMmap] += rss; -#endif else if (p >= TraceMemBeg() && p < TraceMemEnd()) mem[MemTrace] += rss; else mem[MemOther] += rss; } -void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) { +void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) { uptr mem[MemCount]; - internal_memset(mem, 0, sizeof(mem[0]) * MemCount); - __sanitizer::GetMemoryProfile(FillProfileCallback, mem, 7); + internal_memset(mem, 0, sizeof(mem)); + GetMemoryProfile(FillProfileCallback, mem, MemCount); + auto meta = ctx->metamap.GetMemoryStats(); StackDepotStats *stacks = StackDepotGetStats(); - internal_snprintf(buf, buf_size, - "RSS %zd MB: shadow:%zd meta:%zd file:%zd mmap:%zd" - " trace:%zd heap:%zd other:%zd stacks=%zd[%zd] nthr=%zd/%zd\n", - mem[MemTotal] >> 20, mem[MemShadow] >> 20, mem[MemMeta] >> 20, - mem[MemFile] >> 20, mem[MemMmap] >> 20, mem[MemTrace] >> 20, - mem[MemHeap] >> 20, mem[MemOther] >> 20, - stacks->allocated >> 20, stacks->n_uniq_ids, - nlive, nthread); + uptr nthread, nlive; + ctx->thread_registry.GetNumberOfThreads(&nthread, &nlive); + uptr internal_stats[AllocatorStatCount]; + internal_allocator()->GetStats(internal_stats); + // All these are allocated from the common mmap region. + mem[MemMmap] -= meta.mem_block + meta.sync_obj + stacks->allocated + + internal_stats[AllocatorStatMapped]; + if (s64(mem[MemMmap]) < 0) + mem[MemMmap] = 0; + internal_snprintf( + buf, buf_size, + "%llus: RSS %zd MB: shadow:%zd meta:%zd file:%zd mmap:%zd" + " trace:%zd heap:%zd other:%zd intalloc:%zd memblocks:%zd syncobj:%zu" + " stacks=%zd[%zd] nthr=%zd/%zd\n", + uptime_ns / (1000 * 1000 * 1000), mem[MemTotal] >> 20, + mem[MemShadow] >> 20, mem[MemMeta] >> 20, mem[MemFile] >> 20, + mem[MemMmap] >> 20, mem[MemTrace] >> 20, mem[MemHeap] >> 20, + mem[MemOther] >> 20, internal_stats[AllocatorStatMapped] >> 20, + meta.mem_block >> 20, meta.sync_obj >> 20, stacks->allocated >> 20, + stacks->n_uniq_ids, nlive, nthread); } -#if SANITIZER_LINUX +# if SANITIZER_LINUX void FlushShadowMemoryCallback( const SuspendedThreadsList &suspended_threads_list, void *argument) { @@ -178,12 +184,13 @@ static void MapRodata() { internal_unlink(name); // Unlink it now, so that we can reuse the buffer. fd_t fd = openrv; // Fill the file with kShadowRodata. 
- const uptr kMarkerSize = 512 * 1024 / sizeof(u64); - InternalMmapVector<u64> marker(kMarkerSize); + const uptr kMarkerSize = 512 * 1024 / sizeof(RawShadow); + InternalMmapVector<RawShadow> marker(kMarkerSize); // volatile to prevent insertion of memset - for (volatile u64 *p = marker.data(); p < marker.data() + kMarkerSize; p++) + for (volatile RawShadow *p = marker.data(); p < marker.data() + kMarkerSize; + p++) *p = kShadowRodata; - internal_write(fd, marker.data(), marker.size() * sizeof(u64)); + internal_write(fd, marker.data(), marker.size() * sizeof(RawShadow)); // Map the file into memory. uptr page = internal_mmap(0, GetPageSizeCached(), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, fd, 0); @@ -203,9 +210,10 @@ static void MapRodata() { char *shadow_start = (char *)MemToShadow(segment.start); char *shadow_end = (char *)MemToShadow(segment.end); for (char *p = shadow_start; p < shadow_end; - p += marker.size() * sizeof(u64)) { - internal_mmap(p, Min<uptr>(marker.size() * sizeof(u64), shadow_end - p), - PROT_READ, MAP_PRIVATE | MAP_FIXED, fd, 0); + p += marker.size() * sizeof(RawShadow)) { + internal_mmap( + p, Min<uptr>(marker.size() * sizeof(RawShadow), shadow_end - p), + PROT_READ, MAP_PRIVATE | MAP_FIXED, fd, 0); } } } @@ -219,7 +227,6 @@ void InitializeShadowMemoryPlatform() { #endif // #if !SANITIZER_GO void InitializePlatformEarly() { -#ifdef TSAN_RUNTIME_VMA vmaSize = (MostSignificantSetBitIndex(GET_CURRENT_FRAME()) + 1); #if defined(__aarch64__) @@ -265,7 +272,6 @@ void InitializePlatformEarly() { } # endif #endif -#endif } void InitializePlatform() { @@ -341,7 +347,7 @@ int ExtractResolvFDs(void *state, int *fds, int nfd) { } // Extract file descriptors passed via UNIX domain sockets. -// This is requried to properly handle "open" of these fds. +// This is required to properly handle "open" of these fds. // see 'man recvmsg' and 'man 3 cmsg'. int ExtractRecvmsgFDs(void *msgp, int *fds, int nfd) { int res = 0; @@ -447,18 +453,19 @@ static void InitializeLongjmpXorKey() { } #endif +extern "C" void __tsan_tls_initialization() {} + void ImitateTlsWrite(ThreadState *thr, uptr tls_addr, uptr tls_size) { - // Check that the thr object is in tls; const uptr thr_beg = (uptr)thr; const uptr thr_end = (uptr)thr + sizeof(*thr); - CHECK_GE(thr_beg, tls_addr); - CHECK_LE(thr_beg, tls_addr + tls_size); - CHECK_GE(thr_end, tls_addr); - CHECK_LE(thr_end, tls_addr + tls_size); - // Since the thr object is huge, skip it. - MemoryRangeImitateWrite(thr, /*pc=*/2, tls_addr, thr_beg - tls_addr); - MemoryRangeImitateWrite(thr, /*pc=*/2, thr_end, - tls_addr + tls_size - thr_end); + // ThreadState is normally allocated in TLS and is large, + // so we skip it. But unit tests allocate ThreadState outside of TLS. 
+ if (thr_beg < tls_addr || thr_end >= tls_addr + tls_size) + return; + const uptr pc = StackTrace::GetNextInstructionPc( + reinterpret_cast<uptr>(__tsan_tls_initialization)); + MemoryRangeImitateWrite(thr, pc, tls_addr, thr_beg - tls_addr); + MemoryRangeImitateWrite(thr, pc, thr_end, tls_addr + tls_size - thr_end); } // Note: this function runs with async signals enabled, diff --git a/libsanitizer/tsan/tsan_platform_mac.cpp b/libsanitizer/tsan/tsan_platform_mac.cpp index d9719a1..f2aff77 100644 --- a/libsanitizer/tsan/tsan_platform_mac.cpp +++ b/libsanitizer/tsan/tsan_platform_mac.cpp @@ -139,7 +139,7 @@ static void RegionMemUsage(uptr start, uptr end, uptr *res, uptr *dirty) { *dirty = dirty_pages * GetPageSizeCached(); } -void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) { +void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) { uptr shadow_res, shadow_dirty; uptr meta_res, meta_dirty; uptr trace_res, trace_dirty; @@ -156,10 +156,12 @@ void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) { RegionMemUsage(HeapMemBeg(), HeapMemEnd(), &heap_res, &heap_dirty); #else // !SANITIZER_GO uptr app_res, app_dirty; - RegionMemUsage(AppMemBeg(), AppMemEnd(), &app_res, &app_dirty); + RegionMemUsage(LoAppMemBeg(), LoAppMemEnd(), &app_res, &app_dirty); #endif StackDepotStats *stacks = StackDepotGetStats(); + uptr nthread, nlive; + ctx->thread_registry.GetNumberOfThreads(&nthread, &nlive); internal_snprintf(buf, buf_size, "shadow (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n" "meta (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n" @@ -169,7 +171,7 @@ void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) { "high app (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n" "heap (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n" #else // !SANITIZER_GO - "app (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n" + "app (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n" #endif "stacks: %zd unique IDs, %zd kB allocated\n" "threads: %zd total, %zd live\n" @@ -182,13 +184,13 @@ void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) { HiAppMemBeg(), HiAppMemEnd(), high_res / 1024, high_dirty / 1024, HeapMemBeg(), HeapMemEnd(), heap_res / 1024, heap_dirty / 1024, #else // !SANITIZER_GO - AppMemBeg(), AppMemEnd(), app_res / 1024, app_dirty / 1024, + LoAppMemBeg(), LoAppMemEnd(), app_res / 1024, app_dirty / 1024, #endif stacks->n_uniq_ids, stacks->allocated / 1024, nthread, nlive); } -#if !SANITIZER_GO +# if !SANITIZER_GO void InitializeShadowMemoryPlatform() { } // On OS X, GCD worker threads are created without a call to pthread_create. We @@ -215,8 +217,8 @@ static void my_pthread_introspection_hook(unsigned int event, pthread_t thread, Processor *proc = ProcCreate(); ProcWire(proc, thr); ThreadState *parent_thread_state = nullptr; // No parent. 
- int tid = ThreadCreate(parent_thread_state, 0, (uptr)thread, true); - CHECK_NE(tid, 0); + Tid tid = ThreadCreate(parent_thread_state, 0, (uptr)thread, true); + CHECK_NE(tid, kMainTid); ThreadStart(thr, tid, GetTid(), ThreadType::Worker); } } else if (event == PTHREAD_INTROSPECTION_THREAD_TERMINATE) { @@ -234,11 +236,11 @@ static void my_pthread_introspection_hook(unsigned int event, pthread_t thread, #endif void InitializePlatformEarly() { -#if !SANITIZER_GO && !HAS_48_BIT_ADDRESS_SPACE +# if !SANITIZER_GO && SANITIZER_IOS uptr max_vm = GetMaxUserVirtualAddress() + 1; - if (max_vm != Mapping::kHiAppMemEnd) { + if (max_vm != HiAppMemEnd()) { Printf("ThreadSanitizer: unsupported vm address limit %p, expected %p.\n", - max_vm, Mapping::kHiAppMemEnd); + max_vm, HiAppMemEnd()); Die(); } #endif diff --git a/libsanitizer/tsan/tsan_platform_posix.cpp b/libsanitizer/tsan/tsan_platform_posix.cpp index 1c6198c..763ac44 100644 --- a/libsanitizer/tsan/tsan_platform_posix.cpp +++ b/libsanitizer/tsan/tsan_platform_posix.cpp @@ -14,12 +14,14 @@ #include "sanitizer_common/sanitizer_platform.h" #if SANITIZER_POSIX -#include "sanitizer_common/sanitizer_common.h" -#include "sanitizer_common/sanitizer_errno.h" -#include "sanitizer_common/sanitizer_libc.h" -#include "sanitizer_common/sanitizer_procmaps.h" -#include "tsan_platform.h" -#include "tsan_rtl.h" +# include <dlfcn.h> + +# include "sanitizer_common/sanitizer_common.h" +# include "sanitizer_common/sanitizer_errno.h" +# include "sanitizer_common/sanitizer_libc.h" +# include "sanitizer_common/sanitizer_procmaps.h" +# include "tsan_platform.h" +# include "tsan_rtl.h" namespace __tsan { @@ -29,6 +31,7 @@ static const char kShadowMemoryMappingHint[] = "HINT: if %s is not supported in your environment, you may set " "TSAN_OPTIONS=%s=0\n"; +# if !SANITIZER_GO static void DontDumpShadow(uptr addr, uptr size) { if (common_flags()->use_madv_dontdump) if (!DontDumpShadowMemory(addr, size)) { @@ -39,7 +42,6 @@ static void DontDumpShadow(uptr addr, uptr size) { } } -#if !SANITIZER_GO void InitializeShadowMemory() { // Map memory shadow. 
if (!MmapFixedSuperNoReserve(ShadowBeg(), ShadowEnd() - ShadowBeg(), @@ -70,6 +72,11 @@ void InitializeShadowMemory() { meta, meta + meta_size, meta_size >> 30); InitializeShadowMemoryPlatform(); + + on_initialize = reinterpret_cast<void (*)(void)>( + dlsym(RTLD_DEFAULT, "__tsan_on_initialize")); + on_finalize = + reinterpret_cast<int (*)(int)>(dlsym(RTLD_DEFAULT, "__tsan_on_finalize")); } static bool TryProtectRange(uptr beg, uptr end) { @@ -98,24 +105,24 @@ void CheckAndProtect() { continue; if (segment.start >= VdsoBeg()) // vdso break; - Printf("FATAL: ThreadSanitizer: unexpected memory mapping %p-%p\n", + Printf("FATAL: ThreadSanitizer: unexpected memory mapping 0x%zx-0x%zx\n", segment.start, segment.end); Die(); } -#if defined(__aarch64__) && defined(__APPLE__) && !HAS_48_BIT_ADDRESS_SPACE +# if defined(__aarch64__) && defined(__APPLE__) && SANITIZER_IOS ProtectRange(HeapMemEnd(), ShadowBeg()); ProtectRange(ShadowEnd(), MetaShadowBeg()); ProtectRange(MetaShadowEnd(), TraceMemBeg()); #else ProtectRange(LoAppMemEnd(), ShadowBeg()); ProtectRange(ShadowEnd(), MetaShadowBeg()); -#ifdef TSAN_MID_APP_RANGE - ProtectRange(MetaShadowEnd(), MidAppMemBeg()); - ProtectRange(MidAppMemEnd(), TraceMemBeg()); -#else - ProtectRange(MetaShadowEnd(), TraceMemBeg()); -#endif + if (MidAppMemBeg()) { + ProtectRange(MetaShadowEnd(), MidAppMemBeg()); + ProtectRange(MidAppMemEnd(), TraceMemBeg()); + } else { + ProtectRange(MetaShadowEnd(), TraceMemBeg()); + } // Memory for traces is mapped lazily in MapThreadTrace. // Protect the whole range for now, so that user does not map something here. ProtectRange(TraceMemBeg(), TraceMemEnd()); diff --git a/libsanitizer/tsan/tsan_platform_windows.cpp b/libsanitizer/tsan/tsan_platform_windows.cpp index 1943787..fea8937 100644 --- a/libsanitizer/tsan/tsan_platform_windows.cpp +++ b/libsanitizer/tsan/tsan_platform_windows.cpp @@ -23,8 +23,7 @@ namespace __tsan { void FlushShadowMemory() { } -void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) { -} +void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) {} void InitializePlatformEarly() { } diff --git a/libsanitizer/tsan/tsan_report.cpp b/libsanitizer/tsan/tsan_report.cpp index 8ef9f0c..a926c37 100644 --- a/libsanitizer/tsan/tsan_report.cpp +++ b/libsanitizer/tsan/tsan_report.cpp @@ -19,22 +19,6 @@ namespace __tsan { -ReportStack::ReportStack() : frames(nullptr), suppressable(false) {} - -ReportStack *ReportStack::New() { - void *mem = internal_alloc(MBlockReportStack, sizeof(ReportStack)); - return new(mem) ReportStack(); -} - -ReportLocation::ReportLocation(ReportLocationType type) - : type(type), global(), heap_chunk_start(0), heap_chunk_size(0), tid(0), - fd(0), suppressable(false), stack(nullptr) {} - -ReportLocation *ReportLocation::New(ReportLocationType type) { - void *mem = internal_alloc(MBlockReportStack, sizeof(ReportLocation)); - return new(mem) ReportLocation(type); -} - class Decorator: public __sanitizer::SanitizerCommonDecorator { public: Decorator() : SanitizerCommonDecorator() { } @@ -68,7 +52,7 @@ ReportDesc::~ReportDesc() { #if !SANITIZER_GO const int kThreadBufSize = 32; -const char *thread_name(char *buf, int tid) { +const char *thread_name(char *buf, Tid tid) { if (tid == kMainTid) return "main thread"; internal_snprintf(buf, kThreadBufSize, "thread T%d", tid); @@ -189,23 +173,25 @@ static void PrintLocation(const ReportLocation *loc) { if (loc->type == ReportLocationGlobal) { const DataInfo &global = loc->global; if (global.size != 0) - Printf(" Location is global 
'%s' of size %zu at %p (%s+%p)\n\n", - global.name, global.size, global.start, + Printf(" Location is global '%s' of size %zu at %p (%s+0x%zx)\n\n", + global.name, global.size, reinterpret_cast<void *>(global.start), StripModuleName(global.module), global.module_offset); else - Printf(" Location is global '%s' at %p (%s+%p)\n\n", global.name, - global.start, StripModuleName(global.module), - global.module_offset); + Printf(" Location is global '%s' at %p (%s+0x%zx)\n\n", global.name, + reinterpret_cast<void *>(global.start), + StripModuleName(global.module), global.module_offset); } else if (loc->type == ReportLocationHeap) { char thrbuf[kThreadBufSize]; const char *object_type = GetObjectTypeFromTag(loc->external_tag); if (!object_type) { Printf(" Location is heap block of size %zu at %p allocated by %s:\n", - loc->heap_chunk_size, loc->heap_chunk_start, + loc->heap_chunk_size, + reinterpret_cast<void *>(loc->heap_chunk_start), thread_name(thrbuf, loc->tid)); } else { Printf(" Location is %s of size %zu at %p allocated by %s:\n", - object_type, loc->heap_chunk_size, loc->heap_chunk_start, + object_type, loc->heap_chunk_size, + reinterpret_cast<void *>(loc->heap_chunk_start), thread_name(thrbuf, loc->tid)); } print_stack = true; @@ -225,13 +211,14 @@ static void PrintLocation(const ReportLocation *loc) { static void PrintMutexShort(const ReportMutex *rm, const char *after) { Decorator d; - Printf("%sM%zd%s%s", d.Mutex(), rm->id, d.Default(), after); + Printf("%sM%lld%s%s", d.Mutex(), rm->id, d.Default(), after); } static void PrintMutexShortWithAddress(const ReportMutex *rm, const char *after) { Decorator d; - Printf("%sM%zd (%p)%s%s", d.Mutex(), rm->id, rm->addr, d.Default(), after); + Printf("%sM%lld (%p)%s%s", d.Mutex(), rm->id, + reinterpret_cast<void *>(rm->addr), d.Default(), after); } static void PrintMutex(const ReportMutex *rm) { @@ -242,7 +229,8 @@ static void PrintMutex(const ReportMutex *rm) { Printf("%s", d.Default()); } else { Printf("%s", d.Mutex()); - Printf(" Mutex M%llu (%p) created at:\n", rm->id, rm->addr); + Printf(" Mutex M%llu (%p) created at:\n", rm->id, + reinterpret_cast<void *>(rm->addr)); Printf("%s", d.Default()); PrintStack(rm->stack); } @@ -259,12 +247,13 @@ static void PrintThread(const ReportThread *rt) { char thrbuf[kThreadBufSize]; const char *thread_status = rt->running ? 
"running" : "finished"; if (rt->thread_type == ThreadType::Worker) { - Printf(" (tid=%zu, %s) is a GCD worker thread\n", rt->os_id, thread_status); + Printf(" (tid=%llu, %s) is a GCD worker thread\n", rt->os_id, + thread_status); Printf("\n"); Printf("%s", d.Default()); return; } - Printf(" (tid=%zu, %s) created by %s", rt->os_id, thread_status, + Printf(" (tid=%llu, %s) created by %s", rt->os_id, thread_status, thread_name(thrbuf, rt->parent_tid)); if (rt->stack) Printf(" at:"); @@ -394,7 +383,7 @@ void PrintReport(const ReportDesc *rep) { #else // #if !SANITIZER_GO -const u32 kMainGoroutineId = 1; +const Tid kMainGoroutineId = 1; void PrintStack(const ReportStack *ent) { if (ent == 0 || ent->frames == 0) { @@ -405,16 +394,17 @@ void PrintStack(const ReportStack *ent) { for (int i = 0; frame; frame = frame->next, i++) { const AddressInfo &info = frame->info; Printf(" %s()\n %s:%d +0x%zx\n", info.function, - StripPathPrefix(info.file, common_flags()->strip_path_prefix), - info.line, (void *)info.module_offset); + StripPathPrefix(info.file, common_flags()->strip_path_prefix), + info.line, info.module_offset); } } static void PrintMop(const ReportMop *mop, bool first) { Printf("\n"); Printf("%s at %p by ", - (first ? (mop->write ? "Write" : "Read") - : (mop->write ? "Previous write" : "Previous read")), mop->addr); + (first ? (mop->write ? "Write" : "Read") + : (mop->write ? "Previous write" : "Previous read")), + reinterpret_cast<void *>(mop->addr)); if (mop->tid == kMainGoroutineId) Printf("main goroutine:\n"); else @@ -426,8 +416,8 @@ static void PrintLocation(const ReportLocation *loc) { switch (loc->type) { case ReportLocationHeap: { Printf("\n"); - Printf("Heap block of size %zu at %p allocated by ", - loc->heap_chunk_size, loc->heap_chunk_start); + Printf("Heap block of size %zu at %p allocated by ", loc->heap_chunk_size, + reinterpret_cast<void *>(loc->heap_chunk_start)); if (loc->tid == kMainGoroutineId) Printf("main goroutine:\n"); else @@ -438,8 +428,9 @@ static void PrintLocation(const ReportLocation *loc) { case ReportLocationGlobal: { Printf("\n"); Printf("Global var %s of size %zu at %p declared at %s:%zu\n", - loc->global.name, loc->global.size, loc->global.start, - loc->global.file, loc->global.line); + loc->global.name, loc->global.size, + reinterpret_cast<void *>(loc->global.start), loc->global.file, + loc->global.line); break; } default: @@ -469,13 +460,13 @@ void PrintReport(const ReportDesc *rep) { } else if (rep->typ == ReportTypeDeadlock) { Printf("WARNING: DEADLOCK\n"); for (uptr i = 0; i < rep->mutexes.Size(); i++) { - Printf("Goroutine %d lock mutex %d while holding mutex %d:\n", - 999, rep->mutexes[i]->id, - rep->mutexes[(i+1) % rep->mutexes.Size()]->id); + Printf("Goroutine %d lock mutex %llu while holding mutex %llu:\n", 999, + rep->mutexes[i]->id, + rep->mutexes[(i + 1) % rep->mutexes.Size()]->id); PrintStack(rep->stacks[2*i]); Printf("\n"); - Printf("Mutex %d was previously locked here:\n", - rep->mutexes[(i+1) % rep->mutexes.Size()]->id); + Printf("Mutex %llu was previously locked here:\n", + rep->mutexes[(i + 1) % rep->mutexes.Size()]->id); PrintStack(rep->stacks[2*i + 1]); Printf("\n"); } diff --git a/libsanitizer/tsan/tsan_report.h b/libsanitizer/tsan/tsan_report.h index b4e4d89..d68c2db 100644 --- a/libsanitizer/tsan/tsan_report.h +++ b/libsanitizer/tsan/tsan_report.h @@ -38,12 +38,8 @@ enum ReportType { }; struct ReportStack { - SymbolizedStack *frames; - bool suppressable; - static ReportStack *New(); - - private: - ReportStack(); + SymbolizedStack 
*frames = nullptr; + bool suppressable = false; }; struct ReportMopMutex { @@ -73,28 +69,24 @@ enum ReportLocationType { }; struct ReportLocation { - ReportLocationType type; - DataInfo global; - uptr heap_chunk_start; - uptr heap_chunk_size; - uptr external_tag; - int tid; - int fd; - bool suppressable; - ReportStack *stack; - - static ReportLocation *New(ReportLocationType type); - private: - explicit ReportLocation(ReportLocationType type); + ReportLocationType type = ReportLocationGlobal; + DataInfo global = {}; + uptr heap_chunk_start = 0; + uptr heap_chunk_size = 0; + uptr external_tag = 0; + Tid tid = kInvalidTid; + int fd = 0; + bool suppressable = false; + ReportStack *stack = nullptr; }; struct ReportThread { - int id; + Tid id; tid_t os_id; bool running; ThreadType thread_type; char *name; - u32 parent_tid; + Tid parent_tid; ReportStack *stack; }; @@ -114,7 +106,7 @@ class ReportDesc { Vector<ReportLocation*> locs; Vector<ReportMutex*> mutexes; Vector<ReportThread*> threads; - Vector<int> unique_tids; + Vector<Tid> unique_tids; ReportStack *sleep; int count; diff --git a/libsanitizer/tsan/tsan_rtl.cpp b/libsanitizer/tsan/tsan_rtl.cpp index bcf489a..d679282 100644 --- a/libsanitizer/tsan/tsan_rtl.cpp +++ b/libsanitizer/tsan/tsan_rtl.cpp @@ -28,16 +28,6 @@ #include "tsan_symbolize.h" #include "ubsan/ubsan_init.h" -#ifdef __SSE3__ -// <emmintrin.h> transitively includes <stdlib.h>, -// and it's prohibited to include std headers into tsan runtime. -// So we do this dirty trick. -#define _MM_MALLOC_H_INCLUDED -#define __MM_MALLOC_H -#include <emmintrin.h> -typedef __m128i m128; -#endif - volatile int __tsan_resumed = 0; extern "C" void __tsan_resume() { @@ -46,6 +36,11 @@ extern "C" void __tsan_resume() { namespace __tsan { +#if !SANITIZER_GO +void (*on_initialize)(void); +int (*on_finalize)(int); +#endif + #if !SANITIZER_GO && !SANITIZER_MAC __attribute__((tls_model("initial-exec"))) THREADLOCAL char cur_thread_placeholder[sizeof(ThreadState)] ALIGNED(64); @@ -62,24 +57,21 @@ void OnInitialize(); SANITIZER_WEAK_CXX_DEFAULT_IMPL bool OnFinalize(bool failed) { #if !SANITIZER_GO - if (auto *ptr = dlsym(RTLD_DEFAULT, "__tsan_on_finalize")) - return reinterpret_cast<decltype(&__tsan_on_finalize)>(ptr)(failed); + if (on_finalize) + return on_finalize(failed); #endif return failed; } SANITIZER_WEAK_CXX_DEFAULT_IMPL void OnInitialize() { #if !SANITIZER_GO - if (auto *ptr = dlsym(RTLD_DEFAULT, "__tsan_on_initialize")) { - return reinterpret_cast<decltype(&__tsan_on_initialize)>(ptr)(); - } + if (on_initialize) + on_initialize(); #endif } #endif -static ALIGNED(64) char thread_registry_placeholder[sizeof(ThreadRegistry)]; - -static ThreadContextBase *CreateThreadContext(u32 tid) { +static ThreadContextBase *CreateThreadContext(Tid tid) { // Map thread trace when context is created. 
char name[50]; internal_snprintf(name, sizeof(name), "trace %u", tid); @@ -98,13 +90,12 @@ static ThreadContextBase *CreateThreadContext(u32 tid) { ReleaseMemoryPagesToOS(hdr_end, hdr + sizeof(Trace)); uptr unused = hdr + sizeof(Trace) - hdr_end; if (hdr_end != (uptr)MmapFixedNoAccess(hdr_end, unused)) { - Report("ThreadSanitizer: failed to mprotect(%p, %p)\n", - hdr_end, unused); + Report("ThreadSanitizer: failed to mprotect [0x%zx-0x%zx) \n", hdr_end, + unused); CHECK("unable to mprotect" && 0); } } - void *mem = internal_alloc(MBlockThreadContex, sizeof(ThreadContext)); - return new(mem) ThreadContext(tid); + return New<ThreadContext>(tid); } #if !SANITIZER_GO @@ -117,9 +108,8 @@ Context::Context() : initialized(), report_mtx(MutexTypeReport), nreported(), - nmissed_expected(), - thread_registry(new (thread_registry_placeholder) ThreadRegistry( - CreateThreadContext, kMaxTid, kThreadQuarantineSize, kMaxTidReuse)), + thread_registry(CreateThreadContext, kMaxTid, kThreadQuarantineSize, + kMaxTidReuse), racy_mtx(MutexTypeRacy), racy_stacks(), racy_addresses(), @@ -129,7 +119,7 @@ Context::Context() } // The objects are allocated in TLS, so one may rely on zero-initialization. -ThreadState::ThreadState(Context *ctx, u32 tid, int unique_id, u64 epoch, +ThreadState::ThreadState(Context *ctx, Tid tid, int unique_id, u64 epoch, unsigned reuse_count, uptr stk_addr, uptr stk_size, uptr tls_addr, uptr tls_size) : fast_state(tid, epoch) @@ -155,16 +145,49 @@ ThreadState::ThreadState(Context *ctx, u32 tid, int unique_id, u64 epoch, last_sleep_clock(tid) #endif { + CHECK_EQ(reinterpret_cast<uptr>(this) % SANITIZER_CACHE_LINE_SIZE, 0); +#if !SANITIZER_GO + shadow_stack_pos = shadow_stack; + shadow_stack_end = shadow_stack + kShadowStackSize; +#else + // Setup dynamic shadow stack. 
+ const int kInitStackSize = 8; + shadow_stack = (uptr *)Alloc(kInitStackSize * sizeof(uptr)); + shadow_stack_pos = shadow_stack; + shadow_stack_end = shadow_stack + kInitStackSize; +#endif } #if !SANITIZER_GO -static void MemoryProfiler(Context *ctx, fd_t fd, int i) { - uptr n_threads; - uptr n_running_threads; - ctx->thread_registry->GetNumberOfThreads(&n_threads, &n_running_threads); +void MemoryProfiler(u64 uptime) { + if (ctx->memprof_fd == kInvalidFd) + return; InternalMmapVector<char> buf(4096); - WriteMemoryProfile(buf.data(), buf.size(), n_threads, n_running_threads); - WriteToFile(fd, buf.data(), internal_strlen(buf.data())); + WriteMemoryProfile(buf.data(), buf.size(), uptime); + WriteToFile(ctx->memprof_fd, buf.data(), internal_strlen(buf.data())); +} + +void InitializeMemoryProfiler() { + ctx->memprof_fd = kInvalidFd; + const char *fname = flags()->profile_memory; + if (!fname || !fname[0]) + return; + if (internal_strcmp(fname, "stdout") == 0) { + ctx->memprof_fd = 1; + } else if (internal_strcmp(fname, "stderr") == 0) { + ctx->memprof_fd = 2; + } else { + InternalScopedString filename; + filename.append("%s.%d", fname, (int)internal_getpid()); + ctx->memprof_fd = OpenFile(filename.data(), WrOnly); + if (ctx->memprof_fd == kInvalidFd) { + Printf("ThreadSanitizer: failed to open memory profile file '%s'\n", + filename.data()); + return; + } + } + MemoryProfiler(0); + MaybeSpawnBackgroundThread(); } static void *BackgroundThread(void *arg) { @@ -175,25 +198,7 @@ static void *BackgroundThread(void *arg) { cur_thread_init(); cur_thread()->ignore_interceptors++; const u64 kMs2Ns = 1000 * 1000; - - fd_t mprof_fd = kInvalidFd; - if (flags()->profile_memory && flags()->profile_memory[0]) { - if (internal_strcmp(flags()->profile_memory, "stdout") == 0) { - mprof_fd = 1; - } else if (internal_strcmp(flags()->profile_memory, "stderr") == 0) { - mprof_fd = 2; - } else { - InternalScopedString filename; - filename.append("%s.%d", flags()->profile_memory, (int)internal_getpid()); - fd_t fd = OpenFile(filename.data(), WrOnly); - if (fd == kInvalidFd) { - Printf("ThreadSanitizer: failed to open memory profile file '%s'\n", - filename.data()); - } else { - mprof_fd = fd; - } - } - } + const u64 start = NanoTime(); u64 last_flush = NanoTime(); uptr last_rss = 0; @@ -211,7 +216,6 @@ static void *BackgroundThread(void *arg) { last_flush = NanoTime(); } } - // GetRSS can be expensive on huge programs, so don't do it every 100ms. if (flags()->memory_limit_mb > 0) { uptr rss = GetRSS(); uptr limit = uptr(flags()->memory_limit_mb) << 20; @@ -227,9 +231,7 @@ static void *BackgroundThread(void *arg) { last_rss = rss; } - // Write memory profile if requested. - if (mprof_fd != kInvalidFd) - MemoryProfiler(ctx, mprof_fd, i); + MemoryProfiler(now - start); // Flush symbolizer cache if requested. if (flags()->flush_symbolizer_ms > 0) { @@ -260,7 +262,8 @@ static void StopBackgroundThread() { #endif void DontNeedShadowFor(uptr addr, uptr size) { - ReleaseMemoryPagesToOS(MemToShadow(addr), MemToShadow(addr + size)); + ReleaseMemoryPagesToOS(reinterpret_cast<uptr>(MemToShadow(addr)), + reinterpret_cast<uptr>(MemToShadow(addr + size))); } #if !SANITIZER_GO @@ -297,7 +300,7 @@ void MapShadow(uptr addr, uptr size) { "meta shadow")) Die(); } else { - // Mapping continous heap. + // Mapping continuous heap. // Windows wants 64K alignment. 
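One practical consequence of the MemoryProfiler/InitializeMemoryProfiler rework above: the profile fd now lives in ctx->memprof_fd and is chosen once at startup from the existing profile_memory flag, so running with TSAN_OPTIONS=profile_memory=stdout (or stderr, or a file name, which gets the pid appended) should emit a profile line immediately and then periodically from the background thread. A simplified sketch of the flag-to-fd mapping, using POSIX open() in place of the runtime's internal OpenFile helper:

    #include <cstring>
    #include <string>
    #include <fcntl.h>  // open(); the runtime uses its own OpenFile() and fd_t

    int ProfileFdForFlag(const char *fname, int pid) {
      if (!fname || !fname[0]) return -1;            // profiling disabled
      if (!std::strcmp(fname, "stdout")) return 1;
      if (!std::strcmp(fname, "stderr")) return 2;
      // Otherwise open "<fname>.<pid>" for writing, as the real code does.
      std::string path = std::string(fname) + "." + std::to_string(pid);
      return open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
    }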
meta_begin = RoundDownTo(meta_begin, 64 << 10); meta_end = RoundUpTo(meta_end, 64 << 10); @@ -310,58 +313,22 @@ void MapShadow(uptr addr, uptr size) { Die(); mapped_meta_end = meta_end; } - VPrintf(2, "mapped meta shadow for (%p-%p) at (%p-%p)\n", - addr, addr+size, meta_begin, meta_end); + VPrintf(2, "mapped meta shadow for (0x%zx-0x%zx) at (0x%zx-0x%zx)\n", addr, + addr + size, meta_begin, meta_end); } void MapThreadTrace(uptr addr, uptr size, const char *name) { - DPrintf("#0: Mapping trace at %p-%p(0x%zx)\n", addr, addr + size, size); + DPrintf("#0: Mapping trace at 0x%zx-0x%zx(0x%zx)\n", addr, addr + size, size); CHECK_GE(addr, TraceMemBeg()); CHECK_LE(addr + size, TraceMemEnd()); CHECK_EQ(addr, addr & ~((64 << 10) - 1)); // windows wants 64K alignment if (!MmapFixedSuperNoReserve(addr, size, name)) { - Printf("FATAL: ThreadSanitizer can not mmap thread trace (%p/%p)\n", - addr, size); + Printf("FATAL: ThreadSanitizer can not mmap thread trace (0x%zx/0x%zx)\n", + addr, size); Die(); } } -static void CheckShadowMapping() { - uptr beg, end; - for (int i = 0; GetUserRegion(i, &beg, &end); i++) { - // Skip cases for empty regions (heap definition for architectures that - // do not use 64-bit allocator). - if (beg == end) - continue; - VPrintf(3, "checking shadow region %p-%p\n", beg, end); - uptr prev = 0; - for (uptr p0 = beg; p0 <= end; p0 += (end - beg) / 4) { - for (int x = -(int)kShadowCell; x <= (int)kShadowCell; x += kShadowCell) { - const uptr p = RoundDown(p0 + x, kShadowCell); - if (p < beg || p >= end) - continue; - const uptr s = MemToShadow(p); - const uptr m = (uptr)MemToMeta(p); - VPrintf(3, " checking pointer %p: shadow=%p meta=%p\n", p, s, m); - CHECK(IsAppMem(p)); - CHECK(IsShadowMem(s)); - CHECK_EQ(p, ShadowToMem(s)); - CHECK(IsMetaMem(m)); - if (prev) { - // Ensure that shadow and meta mappings are linear within a single - // user range. Lots of code that processes memory ranges assumes it. - const uptr prev_s = MemToShadow(prev); - const uptr prev_m = (uptr)MemToMeta(prev); - CHECK_EQ(s - prev_s, (p - prev) * kShadowMultiplier); - CHECK_EQ((m - prev_m) / kMetaShadowSize, - (p - prev) / kMetaShadowCell); - } - prev = p; - } - } - } -} - #if !SANITIZER_GO static void OnStackUnwind(const SignalContext &sig, const void *, BufferedStackTrace *stack) { @@ -386,9 +353,10 @@ void CheckUnwind() { PrintCurrentStackSlow(StackTrace::GetCurrentPc()); } +bool is_initialized; + void Initialize(ThreadState *thr) { // Thread safe because done before all threads exist. - static bool is_initialized = false; if (is_initialized) return; is_initialized = true; @@ -420,9 +388,7 @@ void Initialize(ThreadState *thr) { Processor *proc = ProcCreate(); ProcWire(proc, thr); InitializeInterceptors(); - CheckShadowMapping(); InitializePlatform(); - InitializeMutex(); InitializeDynamicAnnotations(); #if !SANITIZER_GO InitializeShadowMemory(); @@ -441,8 +407,8 @@ void Initialize(ThreadState *thr) { (int)internal_getpid()); // Initialize thread 0. 
- int tid = ThreadCreate(thr, 0, 0, true); - CHECK_EQ(tid, 0); + Tid tid = ThreadCreate(thr, 0, 0, true); + CHECK_EQ(tid, kMainTid); ThreadStart(thr, tid, GetTid(), ThreadType::Regular); #if TSAN_CONTAINS_UBSAN __ubsan::InitAsPlugin(); @@ -451,6 +417,7 @@ void Initialize(ThreadState *thr) { #if !SANITIZER_GO Symbolizer::LateInitialize(); + InitializeMemoryProfiler(); #endif if (flags()->stop_on_start) { @@ -507,18 +474,8 @@ int Finalize(ThreadState *thr) { #endif } - if (ctx->nmissed_expected) { - failed = true; - Printf("ThreadSanitizer: missed %d expected races\n", - ctx->nmissed_expected); - } - if (common_flags()->print_suppressions) PrintMatchedSuppressions(); -#if !SANITIZER_GO - if (flags()->print_benign) - PrintMatchedBenignRaces(); -#endif failed = OnFinalize(failed); @@ -527,7 +484,7 @@ int Finalize(ThreadState *thr) { #if !SANITIZER_GO void ForkBefore(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS { - ctx->thread_registry->Lock(); + ctx->thread_registry.Lock(); ctx->report_mtx.Lock(); ScopedErrorReportLock::Lock(); // Suppress all reports in the pthread_atfork callbacks. @@ -546,7 +503,7 @@ void ForkParentAfter(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS { thr->ignore_interceptors--; ScopedErrorReportLock::Unlock(); ctx->report_mtx.Unlock(); - ctx->thread_registry->Unlock(); + ctx->thread_registry.Unlock(); } void ForkChildAfter(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS { @@ -554,10 +511,10 @@ void ForkChildAfter(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS { thr->ignore_interceptors--; ScopedErrorReportLock::Unlock(); ctx->report_mtx.Unlock(); - ctx->thread_registry->Unlock(); + ctx->thread_registry.Unlock(); uptr nthread = 0; - ctx->thread_registry->GetNumberOfThreads(0, 0, &nthread /* alive threads */); + ctx->thread_registry.GetNumberOfThreads(0, 0, &nthread /* alive threads */); VPrintf(1, "ThreadSanitizer: forked new process with pid %d," " parent had %d threads\n", (int)internal_getpid(), (int)nthread); if (nthread == 1) { @@ -579,19 +536,18 @@ NOINLINE void GrowShadowStack(ThreadState *thr) { const int sz = thr->shadow_stack_end - thr->shadow_stack; const int newsz = 2 * sz; - uptr *newstack = (uptr*)internal_alloc(MBlockShadowStack, - newsz * sizeof(uptr)); + auto *newstack = (uptr *)Alloc(newsz * sizeof(uptr)); internal_memcpy(newstack, thr->shadow_stack, sz * sizeof(uptr)); - internal_free(thr->shadow_stack); + Free(thr->shadow_stack); thr->shadow_stack = newstack; thr->shadow_stack_pos = newstack + sz; thr->shadow_stack_end = newstack + newsz; } #endif -u32 CurrentStackId(ThreadState *thr, uptr pc) { +StackID CurrentStackId(ThreadState *thr, uptr pc) { if (!thr->is_inited) // May happen during bootstrap. - return 0; + return kInvalidStackID; if (pc != 0) { #if !SANITIZER_GO DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end); @@ -602,13 +558,195 @@ u32 CurrentStackId(ThreadState *thr, uptr pc) { thr->shadow_stack_pos[0] = pc; thr->shadow_stack_pos++; } - u32 id = StackDepotPut( + StackID id = StackDepotPut( StackTrace(thr->shadow_stack, thr->shadow_stack_pos - thr->shadow_stack)); if (pc != 0) thr->shadow_stack_pos--; return id; } +namespace v3 { + +ALWAYS_INLINE USED bool TryTraceMemoryAccess(ThreadState *thr, uptr pc, + uptr addr, uptr size, + AccessType typ) { + DCHECK(size == 1 || size == 2 || size == 4 || size == 8); + if (!kCollectHistory) + return true; + EventAccess *ev; + if (UNLIKELY(!TraceAcquire(thr, &ev))) + return false; + u64 size_log = size == 1 ? 0 : size == 2 ? 1 : size == 4 ? 
2 : 3; + uptr pc_delta = pc - thr->trace_prev_pc + (1 << (EventAccess::kPCBits - 1)); + thr->trace_prev_pc = pc; + if (LIKELY(pc_delta < (1 << EventAccess::kPCBits))) { + ev->is_access = 1; + ev->is_read = !!(typ & kAccessRead); + ev->is_atomic = !!(typ & kAccessAtomic); + ev->size_log = size_log; + ev->pc_delta = pc_delta; + DCHECK_EQ(ev->pc_delta, pc_delta); + ev->addr = CompressAddr(addr); + TraceRelease(thr, ev); + return true; + } + auto *evex = reinterpret_cast<EventAccessExt *>(ev); + evex->is_access = 0; + evex->is_func = 0; + evex->type = EventType::kAccessExt; + evex->is_read = !!(typ & kAccessRead); + evex->is_atomic = !!(typ & kAccessAtomic); + evex->size_log = size_log; + evex->addr = CompressAddr(addr); + evex->pc = pc; + TraceRelease(thr, evex); + return true; +} + +ALWAYS_INLINE USED bool TryTraceMemoryAccessRange(ThreadState *thr, uptr pc, + uptr addr, uptr size, + AccessType typ) { + if (!kCollectHistory) + return true; + EventAccessRange *ev; + if (UNLIKELY(!TraceAcquire(thr, &ev))) + return false; + thr->trace_prev_pc = pc; + ev->is_access = 0; + ev->is_func = 0; + ev->type = EventType::kAccessRange; + ev->is_read = !!(typ & kAccessRead); + ev->is_free = !!(typ & kAccessFree); + ev->size_lo = size; + ev->pc = CompressAddr(pc); + ev->addr = CompressAddr(addr); + ev->size_hi = size >> EventAccessRange::kSizeLoBits; + TraceRelease(thr, ev); + return true; +} + +void TraceMemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size, + AccessType typ) { + if (LIKELY(TryTraceMemoryAccessRange(thr, pc, addr, size, typ))) + return; + TraceSwitchPart(thr); + UNUSED bool res = TryTraceMemoryAccessRange(thr, pc, addr, size, typ); + DCHECK(res); +} + +void TraceFunc(ThreadState *thr, uptr pc) { + if (LIKELY(TryTraceFunc(thr, pc))) + return; + TraceSwitchPart(thr); + UNUSED bool res = TryTraceFunc(thr, pc); + DCHECK(res); +} + +void TraceMutexLock(ThreadState *thr, EventType type, uptr pc, uptr addr, + StackID stk) { + DCHECK(type == EventType::kLock || type == EventType::kRLock); + if (!kCollectHistory) + return; + EventLock ev; + ev.is_access = 0; + ev.is_func = 0; + ev.type = type; + ev.pc = CompressAddr(pc); + ev.stack_lo = stk; + ev.stack_hi = stk >> EventLock::kStackIDLoBits; + ev._ = 0; + ev.addr = CompressAddr(addr); + TraceEvent(thr, ev); +} + +void TraceMutexUnlock(ThreadState *thr, uptr addr) { + if (!kCollectHistory) + return; + EventUnlock ev; + ev.is_access = 0; + ev.is_func = 0; + ev.type = EventType::kUnlock; + ev._ = 0; + ev.addr = CompressAddr(addr); + TraceEvent(thr, ev); +} + +void TraceTime(ThreadState *thr) { + if (!kCollectHistory) + return; + EventTime ev; + ev.is_access = 0; + ev.is_func = 0; + ev.type = EventType::kTime; + ev.sid = static_cast<u64>(thr->sid); + ev.epoch = static_cast<u64>(thr->epoch); + ev._ = 0; + TraceEvent(thr, ev); +} + +NOINLINE +void TraceSwitchPart(ThreadState *thr) { + Trace *trace = &thr->tctx->trace; + Event *pos = reinterpret_cast<Event *>(atomic_load_relaxed(&thr->trace_pos)); + DCHECK_EQ(reinterpret_cast<uptr>(pos + 1) & TracePart::kAlignment, 0); + auto *part = trace->parts.Back(); + DPrintf("TraceSwitchPart part=%p pos=%p\n", part, pos); + if (part) { + // We can get here when we still have space in the current trace part. + // The fast-path check in TraceAcquire has false positives in the middle of + // the part. Check if we are indeed at the end of the current part or not, + // and fill any gaps with NopEvent's. 
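For orientation while reading TryTraceMemoryAccess above: the compact event packs a whole memory access into a small fixed-size trace record and falls back to the larger EventAccessExt form whenever the PC delta does not fit. The real EventAccess is defined elsewhere in the runtime and is not part of this diff; the field names below match the code above, but the bit widths are guesses only:

    #include <cstdint>

    struct EventAccessSketch {
      uint64_t is_access : 1;   // 1 => compact memory-access form
      uint64_t is_read   : 1;
      uint64_t is_atomic : 1;
      uint64_t size_log  : 2;   // access size = 1 << size_log (1..8 bytes)
      uint64_t pc_delta  : 15;  // biased delta from thr->trace_prev_pc
      uint64_t addr      : 44;  // CompressAddr(addr): low bits of the app address
    };
    static_assert(sizeof(EventAccessSketch) == 8, "one 64-bit slot per access");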
+ Event *end = &part->events[TracePart::kSize]; + DCHECK_GE(pos, &part->events[0]); + DCHECK_LE(pos, end); + if (pos + 1 < end) { + if ((reinterpret_cast<uptr>(pos) & TracePart::kAlignment) == + TracePart::kAlignment) + *pos++ = NopEvent; + *pos++ = NopEvent; + DCHECK_LE(pos + 2, end); + atomic_store_relaxed(&thr->trace_pos, reinterpret_cast<uptr>(pos)); + // Ensure we setup trace so that the next TraceAcquire + // won't detect trace part end. + Event *ev; + CHECK(TraceAcquire(thr, &ev)); + return; + } + // We are indeed at the end. + for (; pos < end; pos++) *pos = NopEvent; + } +#if !SANITIZER_GO + if (ctx->after_multithreaded_fork) { + // We just need to survive till exec. + CHECK(part); + atomic_store_relaxed(&thr->trace_pos, + reinterpret_cast<uptr>(&part->events[0])); + return; + } +#endif + part = new (MmapOrDie(sizeof(TracePart), "TracePart")) TracePart(); + part->trace = trace; + thr->trace_prev_pc = 0; + { + Lock lock(&trace->mtx); + trace->parts.PushBack(part); + atomic_store_relaxed(&thr->trace_pos, + reinterpret_cast<uptr>(&part->events[0])); + } + // Make this part self-sufficient by restoring the current stack + // and mutex set in the beginning of the trace. + TraceTime(thr); + for (uptr *pos = &thr->shadow_stack[0]; pos < thr->shadow_stack_pos; pos++) + CHECK(TryTraceFunc(thr, *pos)); + for (uptr i = 0; i < thr->mset.Size(); i++) { + MutexSet::Desc d = thr->mset.Get(i); + TraceMutexLock(thr, d.write ? EventType::kLock : EventType::kRLock, 0, + d.addr, d.stack_id); + } +} + +} // namespace v3 + void TraceSwitch(ThreadState *thr) { #if !SANITIZER_GO if (ctx->after_multithreaded_fork) @@ -625,9 +763,7 @@ void TraceSwitch(ThreadState *thr) { thr->nomalloc--; } -Trace *ThreadTrace(int tid) { - return (Trace*)GetThreadTraceHeader(tid); -} +Trace *ThreadTrace(Tid tid) { return (Trace *)GetThreadTraceHeader(tid); } uptr TraceTopPC(ThreadState *thr) { Event *events = (Event*)GetThreadTrace(thr->tid); @@ -716,28 +852,28 @@ void MemoryAccessImpl1(ThreadState *thr, uptr addr, // threads, which is not enough for the unrolled loop. 
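The block that follows updates the four shadow slots of a cell by textually including the same file, renamed in this merge from tsan_update_shadow_word_inl.h to tsan_update_shadow_word.inc, once per slot, so release builds get straight-line code rather than a loop. A tiny imitation of the idiom, with a macro standing in for the included file:

    #include <cstdio>

    // Stand-in for the repeatedly included file; like the real .inc, it
    // operates on a variable named idx that the including code has set.
    #define UPDATE_SHADOW_SLOT() std::printf("update shadow slot %d\n", idx)

    int main() {
      // Debug shape: an ordinary loop over the four slots of a shadow cell.
      for (int idx = 0; idx < 4; idx++) UPDATE_SHADOW_SLOT();
      // Release shape (as in MemoryAccessImpl1): manual unrolling, set idx,
      // then "include" the body again, so the compiler sees straight-line code.
      int idx = 0;
      UPDATE_SHADOW_SLOT();
      idx = 1;
      UPDATE_SHADOW_SLOT();
      return 0;
    }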
#if SANITIZER_DEBUG for (int idx = 0; idx < 4; idx++) { -#include "tsan_update_shadow_word_inl.h" +# include "tsan_update_shadow_word.inc" } #else int idx = 0; -#include "tsan_update_shadow_word_inl.h" +# include "tsan_update_shadow_word.inc" idx = 1; if (stored) { -#include "tsan_update_shadow_word_inl.h" +# include "tsan_update_shadow_word.inc" } else { -#include "tsan_update_shadow_word_inl.h" +# include "tsan_update_shadow_word.inc" } idx = 2; if (stored) { -#include "tsan_update_shadow_word_inl.h" +# include "tsan_update_shadow_word.inc" } else { -#include "tsan_update_shadow_word_inl.h" +# include "tsan_update_shadow_word.inc" } idx = 3; if (stored) { -#include "tsan_update_shadow_word_inl.h" +# include "tsan_update_shadow_word.inc" } else { -#include "tsan_update_shadow_word_inl.h" +# include "tsan_update_shadow_word.inc" } #endif @@ -753,8 +889,11 @@ void MemoryAccessImpl1(ThreadState *thr, uptr addr, return; } -void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr, - int size, bool kAccessIsWrite, bool kIsAtomic) { +void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size, + AccessType typ) { + DCHECK(!(typ & kAccessAtomic)); + const bool kAccessIsWrite = !(typ & kAccessRead); + const bool kIsAtomic = false; while (size) { int size1 = 1; int kAccessSizeLog = kSizeLog1; @@ -789,10 +928,11 @@ bool ContainsSameAccessSlow(u64 *s, u64 a, u64 sync_epoch, bool is_write) { return false; } -#if defined(__SSE3__) -#define SHUF(v0, v1, i0, i1, i2, i3) _mm_castps_si128(_mm_shuffle_ps( \ - _mm_castsi128_ps(v0), _mm_castsi128_ps(v1), \ - (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64)) +#if TSAN_VECTORIZE +# define SHUF(v0, v1, i0, i1, i2, i3) \ + _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v0), \ + _mm_castsi128_ps(v1), \ + (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64)) ALWAYS_INLINE bool ContainsSameAccessFast(u64 *s, u64 a, u64 sync_epoch, bool is_write) { // This is an optimized version of ContainsSameAccessSlow. @@ -849,7 +989,7 @@ bool ContainsSameAccessFast(u64 *s, u64 a, u64 sync_epoch, bool is_write) { ALWAYS_INLINE bool ContainsSameAccess(u64 *s, u64 a, u64 sync_epoch, bool is_write) { -#if defined(__SSE3__) +#if TSAN_VECTORIZE bool res = ContainsSameAccessFast(s, a, sync_epoch, is_write); // NOTE: this check can fail if the shadow is concurrently mutated // by other threads. But it still can be useful if you modify @@ -864,7 +1004,7 @@ bool ContainsSameAccess(u64 *s, u64 a, u64 sync_epoch, bool is_write) { ALWAYS_INLINE USED void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic) { - u64 *shadow_mem = (u64*)MemToShadow(addr); + RawShadow *shadow_mem = MemToShadow(addr); DPrintf2("#%d: MemoryAccess: @%p %p size=%d" " is_write=%d shadow_mem=%p {%zx, %zx, %zx, %zx}\n", (int)thr->fast_state.tid(), (void*)pc, (void*)addr, @@ -876,9 +1016,9 @@ void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, Printf("Access to non app mem %zx\n", addr); DCHECK(IsAppMem(addr)); } - if (!IsShadowMem((uptr)shadow_mem)) { + if (!IsShadowMem(shadow_mem)) { Printf("Bad shadow addr %p (%zx)\n", shadow_mem, addr); - DCHECK(IsShadowMem((uptr)shadow_mem)); + DCHECK(IsShadowMem(shadow_mem)); } #endif @@ -953,9 +1093,9 @@ static void MemoryRangeSet(ThreadState *thr, uptr pc, uptr addr, uptr size, size = (size + (kShadowCell - 1)) & ~(kShadowCell - 1); // UnmapOrDie/MmapFixedNoReserve does not work on Windows. 
if (SANITIZER_WINDOWS || size < common_flags()->clear_shadow_mmap_threshold) { - u64 *p = (u64*)MemToShadow(addr); - CHECK(IsShadowMem((uptr)p)); - CHECK(IsShadowMem((uptr)(p + size * kShadowCnt / kShadowCell - 1))); + RawShadow *p = MemToShadow(addr); + CHECK(IsShadowMem(p)); + CHECK(IsShadowMem(p + size * kShadowCnt / kShadowCell - 1)); // FIXME: may overwrite a part outside the region for (uptr i = 0; i < size / kShadowCell * kShadowCnt;) { p[i++] = val; @@ -965,9 +1105,9 @@ static void MemoryRangeSet(ThreadState *thr, uptr pc, uptr addr, uptr size, } else { // The region is big, reset only beginning and end. const uptr kPageSize = GetPageSizeCached(); - u64 *begin = (u64*)MemToShadow(addr); - u64 *end = begin + size / kShadowCell * kShadowCnt; - u64 *p = begin; + RawShadow *begin = MemToShadow(addr); + RawShadow *end = begin + size / kShadowCell * kShadowCnt; + RawShadow *p = begin; // Set at least first kPageSize/2 to page boundary. while ((p < begin + kPageSize / kShadowSize / 2) || ((uptr)p % kPageSize)) { *p++ = val; @@ -975,7 +1115,7 @@ static void MemoryRangeSet(ThreadState *thr, uptr pc, uptr addr, uptr size, *p++ = 0; } // Reset middle part. - u64 *p1 = p; + RawShadow *p1 = p; p = RoundDown(end, kPageSize); if (!MmapFixedSuperNoReserve((uptr)p1, (uptr)p - (uptr)p1)) Die(); @@ -1070,18 +1210,18 @@ void FuncExit(ThreadState *thr) { thr->shadow_stack_pos--; } -void ThreadIgnoreBegin(ThreadState *thr, uptr pc, bool save_stack) { +void ThreadIgnoreBegin(ThreadState *thr, uptr pc) { DPrintf("#%d: ThreadIgnoreBegin\n", thr->tid); thr->ignore_reads_and_writes++; CHECK_GT(thr->ignore_reads_and_writes, 0); thr->fast_state.SetIgnoreBit(); #if !SANITIZER_GO - if (save_stack && !ctx->after_multithreaded_fork) + if (pc && !ctx->after_multithreaded_fork) thr->mop_ignore_set.Add(CurrentStackId(thr, pc)); #endif } -void ThreadIgnoreEnd(ThreadState *thr, uptr pc) { +void ThreadIgnoreEnd(ThreadState *thr) { DPrintf("#%d: ThreadIgnoreEnd\n", thr->tid); CHECK_GT(thr->ignore_reads_and_writes, 0); thr->ignore_reads_and_writes--; @@ -1101,17 +1241,17 @@ uptr __tsan_testonly_shadow_stack_current_size() { } #endif -void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc, bool save_stack) { +void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc) { DPrintf("#%d: ThreadIgnoreSyncBegin\n", thr->tid); thr->ignore_sync++; CHECK_GT(thr->ignore_sync, 0); #if !SANITIZER_GO - if (save_stack && !ctx->after_multithreaded_fork) + if (pc && !ctx->after_multithreaded_fork) thr->sync_ignore_set.Add(CurrentStackId(thr, pc)); #endif } -void ThreadIgnoreSyncEnd(ThreadState *thr, uptr pc) { +void ThreadIgnoreSyncEnd(ThreadState *thr) { DPrintf("#%d: ThreadIgnoreSyncEnd\n", thr->tid); CHECK_GT(thr->ignore_sync, 0); thr->ignore_sync--; @@ -1133,7 +1273,28 @@ void build_consistency_release() {} } // namespace __tsan +#if SANITIZER_CHECK_DEADLOCKS +namespace __sanitizer { +using namespace __tsan; +MutexMeta mutex_meta[] = { + {MutexInvalid, "Invalid", {}}, + {MutexThreadRegistry, "ThreadRegistry", {}}, + {MutexTypeTrace, "Trace", {MutexLeaf}}, + {MutexTypeReport, "Report", {MutexTypeSyncVar}}, + {MutexTypeSyncVar, "SyncVar", {}}, + {MutexTypeAnnotations, "Annotations", {}}, + {MutexTypeAtExit, "AtExit", {MutexTypeSyncVar}}, + {MutexTypeFired, "Fired", {MutexLeaf}}, + {MutexTypeRacy, "Racy", {MutexLeaf}}, + {MutexTypeGlobalProc, "GlobalProc", {}}, + {}, +}; + +void PrintMutexPC(uptr pc) { StackTrace(&pc, 1).Print(); } +} // namespace __sanitizer +#endif + #if !SANITIZER_GO // Must be included in this file to make sure everything is 
inlined. -#include "tsan_interface_inl.h" +# include "tsan_interface.inc" #endif diff --git a/libsanitizer/tsan/tsan_rtl.h b/libsanitizer/tsan/tsan_rtl.h index 6576d40..4f50656 100644 --- a/libsanitizer/tsan/tsan_rtl.h +++ b/libsanitizer/tsan/tsan_rtl.h @@ -37,14 +37,15 @@ #include "tsan_clock.h" #include "tsan_defs.h" #include "tsan_flags.h" +#include "tsan_ignoreset.h" #include "tsan_mman.h" -#include "tsan_sync.h" -#include "tsan_trace.h" -#include "tsan_report.h" -#include "tsan_platform.h" #include "tsan_mutexset.h" -#include "tsan_ignoreset.h" +#include "tsan_platform.h" +#include "tsan_report.h" +#include "tsan_shadow.h" #include "tsan_stack_trace.h" +#include "tsan_sync.h" +#include "tsan_trace.h" #if SANITIZER_WORDSIZE != 64 # error "ThreadSanitizer is supported only on 64-bit platforms" @@ -69,6 +70,11 @@ struct AP32 { typedef SizeClassAllocator32<AP32> PrimaryAllocator; #else struct AP64 { // Allocator64 parameters. Deliberately using a short name. +# if defined(__s390x__) + typedef MappingS390x Mapping; +# else + typedef Mapping48AddressSpace Mapping; +# endif static const uptr kSpaceBeg = Mapping::kHeapMemBeg; static const uptr kSpaceSize = Mapping::kHeapMemEnd - Mapping::kHeapMemBeg; static const uptr kMetadataSize = 0; @@ -84,240 +90,6 @@ typedef Allocator::AllocatorCache AllocatorCache; Allocator *allocator(); #endif -const u64 kShadowRodata = (u64)-1; // .rodata shadow marker - -// FastState (from most significant bit): -// ignore : 1 -// tid : kTidBits -// unused : - -// history_size : 3 -// epoch : kClkBits -class FastState { - public: - FastState(u64 tid, u64 epoch) { - x_ = tid << kTidShift; - x_ |= epoch; - DCHECK_EQ(tid, this->tid()); - DCHECK_EQ(epoch, this->epoch()); - DCHECK_EQ(GetIgnoreBit(), false); - } - - explicit FastState(u64 x) - : x_(x) { - } - - u64 raw() const { - return x_; - } - - u64 tid() const { - u64 res = (x_ & ~kIgnoreBit) >> kTidShift; - return res; - } - - u64 TidWithIgnore() const { - u64 res = x_ >> kTidShift; - return res; - } - - u64 epoch() const { - u64 res = x_ & ((1ull << kClkBits) - 1); - return res; - } - - void IncrementEpoch() { - u64 old_epoch = epoch(); - x_ += 1; - DCHECK_EQ(old_epoch + 1, epoch()); - (void)old_epoch; - } - - void SetIgnoreBit() { x_ |= kIgnoreBit; } - void ClearIgnoreBit() { x_ &= ~kIgnoreBit; } - bool GetIgnoreBit() const { return (s64)x_ < 0; } - - void SetHistorySize(int hs) { - CHECK_GE(hs, 0); - CHECK_LE(hs, 7); - x_ = (x_ & ~(kHistoryMask << kHistoryShift)) | (u64(hs) << kHistoryShift); - } - - ALWAYS_INLINE - int GetHistorySize() const { - return (int)((x_ >> kHistoryShift) & kHistoryMask); - } - - void ClearHistorySize() { - SetHistorySize(0); - } - - ALWAYS_INLINE - u64 GetTracePos() const { - const int hs = GetHistorySize(); - // When hs == 0, the trace consists of 2 parts. 
- const u64 mask = (1ull << (kTracePartSizeBits + hs + 1)) - 1; - return epoch() & mask; - } - - private: - friend class Shadow; - static const int kTidShift = 64 - kTidBits - 1; - static const u64 kIgnoreBit = 1ull << 63; - static const u64 kFreedBit = 1ull << 63; - static const u64 kHistoryShift = kClkBits; - static const u64 kHistoryMask = 7; - u64 x_; -}; - -// Shadow (from most significant bit): -// freed : 1 -// tid : kTidBits -// is_atomic : 1 -// is_read : 1 -// size_log : 2 -// addr0 : 3 -// epoch : kClkBits -class Shadow : public FastState { - public: - explicit Shadow(u64 x) - : FastState(x) { - } - - explicit Shadow(const FastState &s) - : FastState(s.x_) { - ClearHistorySize(); - } - - void SetAddr0AndSizeLog(u64 addr0, unsigned kAccessSizeLog) { - DCHECK_EQ((x_ >> kClkBits) & 31, 0); - DCHECK_LE(addr0, 7); - DCHECK_LE(kAccessSizeLog, 3); - x_ |= ((kAccessSizeLog << 3) | addr0) << kClkBits; - DCHECK_EQ(kAccessSizeLog, size_log()); - DCHECK_EQ(addr0, this->addr0()); - } - - void SetWrite(unsigned kAccessIsWrite) { - DCHECK_EQ(x_ & kReadBit, 0); - if (!kAccessIsWrite) - x_ |= kReadBit; - DCHECK_EQ(kAccessIsWrite, IsWrite()); - } - - void SetAtomic(bool kIsAtomic) { - DCHECK(!IsAtomic()); - if (kIsAtomic) - x_ |= kAtomicBit; - DCHECK_EQ(IsAtomic(), kIsAtomic); - } - - bool IsAtomic() const { - return x_ & kAtomicBit; - } - - bool IsZero() const { - return x_ == 0; - } - - static inline bool TidsAreEqual(const Shadow s1, const Shadow s2) { - u64 shifted_xor = (s1.x_ ^ s2.x_) >> kTidShift; - DCHECK_EQ(shifted_xor == 0, s1.TidWithIgnore() == s2.TidWithIgnore()); - return shifted_xor == 0; - } - - static ALWAYS_INLINE - bool Addr0AndSizeAreEqual(const Shadow s1, const Shadow s2) { - u64 masked_xor = ((s1.x_ ^ s2.x_) >> kClkBits) & 31; - return masked_xor == 0; - } - - static ALWAYS_INLINE bool TwoRangesIntersect(Shadow s1, Shadow s2, - unsigned kS2AccessSize) { - bool res = false; - u64 diff = s1.addr0() - s2.addr0(); - if ((s64)diff < 0) { // s1.addr0 < s2.addr0 - // if (s1.addr0() + size1) > s2.addr0()) return true; - if (s1.size() > -diff) - res = true; - } else { - // if (s2.addr0() + kS2AccessSize > s1.addr0()) return true; - if (kS2AccessSize > diff) - res = true; - } - DCHECK_EQ(res, TwoRangesIntersectSlow(s1, s2)); - DCHECK_EQ(res, TwoRangesIntersectSlow(s2, s1)); - return res; - } - - u64 ALWAYS_INLINE addr0() const { return (x_ >> kClkBits) & 7; } - u64 ALWAYS_INLINE size() const { return 1ull << size_log(); } - bool ALWAYS_INLINE IsWrite() const { return !IsRead(); } - bool ALWAYS_INLINE IsRead() const { return x_ & kReadBit; } - - // The idea behind the freed bit is as follows. - // When the memory is freed (or otherwise unaccessible) we write to the shadow - // values with tid/epoch related to the free and the freed bit set. - // During memory accesses processing the freed bit is considered - // as msb of tid. So any access races with shadow with freed bit set - // (it is as if write from a thread with which we never synchronized before). - // This allows us to detect accesses to freed memory w/o additional - // overheads in memory access processing and at the same time restore - // tid/epoch of free. 
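
The freed-bit scheme described in the comment above can be seen with a tiny standalone sketch. This is an illustration only, not code from the patch; kTidBits is assumed to be 13 here, the real value comes from tsan_defs.h. The point is that Shadow::TidsAreEqual() shifts the raw shadow words by kTidShift without masking bit 63, so a shadow written at free time (freed bit set) never compares equal to any live access, even one from the same thread, and the access is reported as a race while the tid/epoch of the free stay recoverable from that shadow word.

  #include <cstdint>
  // Sketch only: mirrors the (removed) FastState/Shadow bit layout above.
  constexpr int kTidBits = 13;                      // assumed for illustration
  constexpr int kTidShift = 64 - kTidBits - 1;      // as in FastState above
  constexpr uint64_t kFreedBit = 1ull << 63;        // shares the top bit slot
  // Mirrors Shadow::TidsAreEqual(): compares everything from the tid field
  // upward, so the freed bit acts as an extra, always-mismatching tid bit.
  constexpr bool TidsMatch(uint64_t s1, uint64_t s2) {
    return ((s1 ^ s2) >> kTidShift) == 0;
  }
  static_assert(!TidsMatch(kFreedBit | (7ull << kTidShift),  // freed by tid 7
                           (7ull << kTidShift)),             // accessed by tid 7
                "a freed shadow never matches a live access, even from the same thread");
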
- void MarkAsFreed() { - x_ |= kFreedBit; - } - - bool IsFreed() const { - return x_ & kFreedBit; - } - - bool GetFreedAndReset() { - bool res = x_ & kFreedBit; - x_ &= ~kFreedBit; - return res; - } - - bool ALWAYS_INLINE IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const { - bool v = x_ & ((u64(kIsWrite ^ 1) << kReadShift) - | (u64(kIsAtomic) << kAtomicShift)); - DCHECK_EQ(v, (!IsWrite() && !kIsWrite) || (IsAtomic() && kIsAtomic)); - return v; - } - - bool ALWAYS_INLINE IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const { - bool v = ((x_ >> kReadShift) & 3) - <= u64((kIsWrite ^ 1) | (kIsAtomic << 1)); - DCHECK_EQ(v, (IsAtomic() < kIsAtomic) || - (IsAtomic() == kIsAtomic && !IsWrite() <= !kIsWrite)); - return v; - } - - bool ALWAYS_INLINE IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const { - bool v = ((x_ >> kReadShift) & 3) - >= u64((kIsWrite ^ 1) | (kIsAtomic << 1)); - DCHECK_EQ(v, (IsAtomic() > kIsAtomic) || - (IsAtomic() == kIsAtomic && !IsWrite() >= !kIsWrite)); - return v; - } - - private: - static const u64 kReadShift = 5 + kClkBits; - static const u64 kReadBit = 1ull << kReadShift; - static const u64 kAtomicShift = 6 + kClkBits; - static const u64 kAtomicBit = 1ull << kAtomicShift; - - u64 size_log() const { return (x_ >> (3 + kClkBits)) & 3; } - - static bool TwoRangesIntersectSlow(const Shadow s1, const Shadow s2) { - if (s1.addr0() == s2.addr0()) return true; - if (s1.addr0() < s2.addr0() && s1.addr0() + s1.size() > s2.addr0()) - return true; - if (s2.addr0() < s1.addr0() && s2.addr0() + s2.size() > s1.addr0()) - return true; - return false; - } -}; - struct ThreadSignalContext; struct JmpBuf { @@ -380,27 +152,30 @@ struct ThreadState { // We do not distinguish beteween ignoring reads and writes // for better performance. int ignore_reads_and_writes; + atomic_sint32_t pending_signals; int ignore_sync; int suppress_reports; // Go does not support ignores. #if !SANITIZER_GO IgnoreSet mop_ignore_set; IgnoreSet sync_ignore_set; -#endif - // C/C++ uses fixed size shadow stack embed into Trace. + // C/C++ uses fixed size shadow stack. + uptr shadow_stack[kShadowStackSize]; +#else // Go uses malloc-allocated shadow stack with dynamic size. uptr *shadow_stack; +#endif uptr *shadow_stack_end; uptr *shadow_stack_pos; - u64 *racy_shadow_addr; - u64 racy_state[2]; + RawShadow *racy_shadow_addr; + RawShadow racy_state[2]; MutexSet mset; ThreadClock clock; #if !SANITIZER_GO Vector<JmpBuf> jmp_bufs; int ignore_interceptors; #endif - const u32 tid; + const Tid tid; const int unique_id; bool in_symbolizer; bool in_ignored_lib; @@ -414,9 +189,6 @@ struct ThreadState { const uptr tls_size; ThreadContext *tctx; -#if SANITIZER_DEBUG && !SANITIZER_GO - InternalDeadlockDetector internal_deadlock_detector; -#endif DDLogicalThread *dd_lt; // Current wired Processor, or nullptr. Required to handle any events. @@ -431,7 +203,7 @@ struct ThreadState { ThreadSignalContext *signal_ctx; #if !SANITIZER_GO - u32 last_sleep_stack_id; + StackID last_sleep_stack_id; ThreadClock last_sleep_clock; #endif @@ -441,10 +213,17 @@ struct ThreadState { const ReportDesc *current_report; - explicit ThreadState(Context *ctx, u32 tid, int unique_id, u64 epoch, + // Current position in tctx->trace.Back()->events (Event*). + atomic_uintptr_t trace_pos; + // PC of the last memory access, used to compute PC deltas in the trace. 
+ uptr trace_prev_pc; + Sid sid; + Epoch epoch; + + explicit ThreadState(Context *ctx, Tid tid, int unique_id, u64 epoch, unsigned reuse_count, uptr stk_addr, uptr stk_size, uptr tls_addr, uptr tls_size); -}; +} ALIGNED(SANITIZER_CACHE_LINE_SIZE); #if !SANITIZER_GO #if SANITIZER_MAC || SANITIZER_ANDROID @@ -472,10 +251,10 @@ inline void cur_thread_finalize() { } class ThreadContext final : public ThreadContextBase { public: - explicit ThreadContext(int tid); + explicit ThreadContext(Tid tid); ~ThreadContext(); ThreadState *thr; - u32 creation_stack_id; + StackID creation_stack_id; SyncClock sync; // Epoch at which the thread had started. // If we see an event from the thread stamped by an older epoch, @@ -483,6 +262,8 @@ class ThreadContext final : public ThreadContextBase { u64 epoch0; u64 epoch1; + v3::Trace trace; + // Override superclass callbacks. void OnDead() override; void OnJoined(void *arg) override; @@ -495,13 +276,7 @@ class ThreadContext final : public ThreadContextBase { struct RacyStacks { MD5Hash hash[2]; - bool operator==(const RacyStacks &other) const { - if (hash[0] == other.hash[0] && hash[1] == other.hash[1]) - return true; - if (hash[0] == other.hash[1] && hash[1] == other.hash[0]) - return true; - return false; - } + bool operator==(const RacyStacks &other) const; }; struct RacyAddress { @@ -527,13 +302,12 @@ struct Context { Mutex report_mtx; int nreported; - int nmissed_expected; atomic_uint64_t last_symbolize_time_ns; void *background_thread; atomic_uint32_t stop_background_thread; - ThreadRegistry *thread_registry; + ThreadRegistry thread_registry; Mutex racy_mtx; Vector<RacyStacks> racy_stacks; @@ -546,9 +320,9 @@ struct Context { ClockAlloc clock_alloc; Flags flags; + fd_t memprof_fd; - u64 int_alloc_cnt[MBlockTypeCount]; - u64 int_alloc_siz[MBlockTypeCount]; + Mutex slot_mtx; }; extern Context *ctx; // The one and the only global runtime context. @@ -581,12 +355,12 @@ class ScopedReportBase { const MutexSet *mset); void AddStack(StackTrace stack, bool suppressable = false); void AddThread(const ThreadContext *tctx, bool suppressable = false); - void AddThread(int unique_tid, bool suppressable = false); - void AddUniqueTid(int unique_tid); + void AddThread(Tid unique_tid, bool suppressable = false); + void AddUniqueTid(Tid unique_tid); void AddMutex(const SyncVar *s); u64 AddMutex(u64 id); void AddLocation(uptr addr, uptr size); - void AddSleep(u32 stack_id); + void AddSleep(StackID stack_id); void SetCount(int count); const ReportDesc *GetReport() const; @@ -618,7 +392,7 @@ class ScopedReport : public ScopedReportBase { bool ShouldReport(ThreadState *thr, ReportType typ); ThreadContext *IsThreadStackOrTls(uptr addr, bool *is_stack); -void RestoreStack(int tid, const u64 epoch, VarSizeStackTrace *stk, +void RestoreStack(Tid tid, const u64 epoch, VarSizeStackTrace *stk, MutexSet *mset, uptr *tag = nullptr); // The stack could look like: @@ -671,7 +445,6 @@ void ReportRace(ThreadState *thr); bool OutputReport(ThreadState *thr, const ScopedReport &srep); bool IsFiredSuppression(Context *ctx, ReportType type, StackTrace trace); bool IsExpectedReport(uptr addr, uptr size); -void PrintMatchedBenignRaces(); #if defined(TSAN_DEBUG_OUTPUT) && TSAN_DEBUG_OUTPUT >= 1 # define DPrintf Printf @@ -685,10 +458,11 @@ void PrintMatchedBenignRaces(); # define DPrintf2(...) 
#endif -u32 CurrentStackId(ThreadState *thr, uptr pc); -ReportStack *SymbolizeStackId(u32 stack_id); +StackID CurrentStackId(ThreadState *thr, uptr pc); +ReportStack *SymbolizeStackId(StackID stack_id); void PrintCurrentStack(ThreadState *thr, uptr pc); void PrintCurrentStackSlow(uptr pc); // uses libunwind +MBlock *JavaHeapBlock(uptr addr, uptr *start); void Initialize(ThreadState *thr); void MaybeSpawnBackgroundThread(); @@ -704,34 +478,44 @@ void MemoryAccessImpl(ThreadState *thr, uptr addr, u64 *shadow_mem, Shadow cur); void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size, bool is_write); -void MemoryAccessRangeStep(ThreadState *thr, uptr pc, uptr addr, - uptr size, uptr step, bool is_write); -void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr, - int size, bool kAccessIsWrite, bool kIsAtomic); +void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size, + AccessType typ); const int kSizeLog1 = 0; const int kSizeLog2 = 1; const int kSizeLog4 = 2; const int kSizeLog8 = 3; -void ALWAYS_INLINE MemoryRead(ThreadState *thr, uptr pc, - uptr addr, int kAccessSizeLog) { - MemoryAccess(thr, pc, addr, kAccessSizeLog, false, false); -} - -void ALWAYS_INLINE MemoryWrite(ThreadState *thr, uptr pc, - uptr addr, int kAccessSizeLog) { - MemoryAccess(thr, pc, addr, kAccessSizeLog, true, false); -} - -void ALWAYS_INLINE MemoryReadAtomic(ThreadState *thr, uptr pc, - uptr addr, int kAccessSizeLog) { - MemoryAccess(thr, pc, addr, kAccessSizeLog, false, true); -} - -void ALWAYS_INLINE MemoryWriteAtomic(ThreadState *thr, uptr pc, - uptr addr, int kAccessSizeLog) { - MemoryAccess(thr, pc, addr, kAccessSizeLog, true, true); +ALWAYS_INLINE +void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size, + AccessType typ) { + int size_log; + switch (size) { + case 1: + size_log = kSizeLog1; + break; + case 2: + size_log = kSizeLog2; + break; + case 4: + size_log = kSizeLog4; + break; + default: + DCHECK_EQ(size, 8); + size_log = kSizeLog8; + break; + } + bool is_write = !(typ & kAccessRead); + bool is_atomic = typ & kAccessAtomic; + if (typ & kAccessVptr) + thr->is_vptr_access = true; + if (typ & kAccessFree) + thr->is_freeing = true; + MemoryAccess(thr, pc, addr, size_log, is_write, is_atomic); + if (typ & kAccessVptr) + thr->is_vptr_access = false; + if (typ & kAccessFree) + thr->is_freeing = false; } void MemoryResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size); @@ -740,26 +524,26 @@ void MemoryRangeImitateWrite(ThreadState *thr, uptr pc, uptr addr, uptr size); void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size); -void ThreadIgnoreBegin(ThreadState *thr, uptr pc, bool save_stack = true); -void ThreadIgnoreEnd(ThreadState *thr, uptr pc); -void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc, bool save_stack = true); -void ThreadIgnoreSyncEnd(ThreadState *thr, uptr pc); +void ThreadIgnoreBegin(ThreadState *thr, uptr pc); +void ThreadIgnoreEnd(ThreadState *thr); +void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc); +void ThreadIgnoreSyncEnd(ThreadState *thr); void FuncEntry(ThreadState *thr, uptr pc); void FuncExit(ThreadState *thr); -int ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached); -void ThreadStart(ThreadState *thr, int tid, tid_t os_id, +Tid ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached); +void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id, ThreadType thread_type); void ThreadFinish(ThreadState *thr); -int ThreadConsumeTid(ThreadState *thr, uptr pc, uptr uid); -void 
ThreadJoin(ThreadState *thr, uptr pc, int tid); -void ThreadDetach(ThreadState *thr, uptr pc, int tid); +Tid ThreadConsumeTid(ThreadState *thr, uptr pc, uptr uid); +void ThreadJoin(ThreadState *thr, uptr pc, Tid tid); +void ThreadDetach(ThreadState *thr, uptr pc, Tid tid); void ThreadFinalize(ThreadState *thr); void ThreadSetName(ThreadState *thr, const char *name); int ThreadCount(ThreadState *thr); -void ProcessPendingSignals(ThreadState *thr); -void ThreadNotJoined(ThreadState *thr, uptr pc, int tid, uptr uid); +void ProcessPendingSignalsImpl(ThreadState *thr); +void ThreadNotJoined(ThreadState *thr, uptr pc, Tid tid, uptr uid); Processor *ProcCreate(); void ProcDestroy(Processor *proc); @@ -788,7 +572,7 @@ void Acquire(ThreadState *thr, uptr pc, uptr addr); // handle Go finalizers. Namely, finalizer goroutine executes AcquireGlobal // right before executing finalizers. This provides a coarse, but simple // approximation of the actual required synchronization. -void AcquireGlobal(ThreadState *thr, uptr pc); +void AcquireGlobal(ThreadState *thr); void Release(ThreadState *thr, uptr pc, uptr addr); void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr); void ReleaseStore(ThreadState *thr, uptr pc, uptr addr); @@ -824,7 +608,7 @@ void TraceSwitch(ThreadState *thr); uptr TraceTopPC(ThreadState *thr); uptr TraceSize(); uptr TraceParts(); -Trace *ThreadTrace(int tid); +Trace *ThreadTrace(Tid tid); extern "C" void __tsan_trace_switch(); void ALWAYS_INLINE TraceAddEvent(ThreadState *thr, FastState fs, @@ -864,6 +648,111 @@ enum FiberSwitchFlags { FiberSwitchFlagNoSync = 1 << 0, // __tsan_switch_to_fiber_no_sync }; +ALWAYS_INLINE void ProcessPendingSignals(ThreadState *thr) { + if (UNLIKELY(atomic_load_relaxed(&thr->pending_signals))) + ProcessPendingSignalsImpl(thr); +} + +extern bool is_initialized; + +ALWAYS_INLINE +void LazyInitialize(ThreadState *thr) { + // If we can use .preinit_array, assume that __tsan_init + // called from .preinit_array initializes runtime before + // any instrumented code. +#if !SANITIZER_CAN_USE_PREINIT_ARRAY + if (UNLIKELY(!is_initialized)) + Initialize(thr); +#endif +} + +namespace v3 { + +void TraceSwitchPart(ThreadState *thr); +bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr, + uptr size, AccessType typ, VarSizeStackTrace *pstk, + MutexSet *pmset, uptr *ptag); + +template <typename EventT> +ALWAYS_INLINE WARN_UNUSED_RESULT bool TraceAcquire(ThreadState *thr, + EventT **ev) { + Event *pos = reinterpret_cast<Event *>(atomic_load_relaxed(&thr->trace_pos)); +#if SANITIZER_DEBUG + // TraceSwitch acquires these mutexes, + // so we lock them here to detect deadlocks more reliably. + { Lock lock(&ctx->slot_mtx); } + { Lock lock(&thr->tctx->trace.mtx); } + TracePart *current = thr->tctx->trace.parts.Back(); + if (current) { + DCHECK_GE(pos, ¤t->events[0]); + DCHECK_LE(pos, ¤t->events[TracePart::kSize]); + } else { + DCHECK_EQ(pos, nullptr); + } +#endif + // TracePart is allocated with mmap and is at least 4K aligned. + // So the following check is a faster way to check for part end. + // It may have false positives in the middle of the trace, + // they are filtered out in TraceSwitch. 
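
The comment above explains the fast end-of-part test performed by the UNLIKELY check on the next line. TracePart::kAlignment is defined elsewhere in the runtime (not in this diff), so the sketch below only shows the underlying idea under simplified assumptions: if event buffers are carved out of aligned mmap'ed parts, reaching a part boundary can be detected with a single mask test instead of comparing against a stored end pointer.

  #include <cstdint>
  // Illustration only: assume a part is a 4 KiB-aligned, 4 KiB-sized buffer.
  // The real part size and mask differ; in the real runtime the mask can also
  // match in the middle of a part, and such false positives are filtered out
  // in TraceSwitch, as the comment above notes.
  constexpr uintptr_t kPartSize = 4096;
  constexpr bool AtPartBoundary(uintptr_t next_pos) {
    return (next_pos & (kPartSize - 1)) == 0;       // one AND, no end pointer load
  }
  static_assert(AtPartBoundary(0x7f0000002000), "");
  static_assert(!AtPartBoundary(0x7f0000002008), "");
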
+ if (UNLIKELY(((uptr)(pos + 1) & TracePart::kAlignment) == 0)) + return false; + *ev = reinterpret_cast<EventT *>(pos); + return true; +} + +template <typename EventT> +ALWAYS_INLINE void TraceRelease(ThreadState *thr, EventT *evp) { + DCHECK_LE(evp + 1, &thr->tctx->trace.parts.Back()->events[TracePart::kSize]); + atomic_store_relaxed(&thr->trace_pos, (uptr)(evp + 1)); +} + +template <typename EventT> +void TraceEvent(ThreadState *thr, EventT ev) { + EventT *evp; + if (!TraceAcquire(thr, &evp)) { + TraceSwitchPart(thr); + UNUSED bool res = TraceAcquire(thr, &evp); + DCHECK(res); + } + *evp = ev; + TraceRelease(thr, evp); +} + +ALWAYS_INLINE WARN_UNUSED_RESULT bool TryTraceFunc(ThreadState *thr, + uptr pc = 0) { + if (!kCollectHistory) + return true; + EventFunc *ev; + if (UNLIKELY(!TraceAcquire(thr, &ev))) + return false; + ev->is_access = 0; + ev->is_func = 1; + ev->pc = pc; + TraceRelease(thr, ev); + return true; +} + +WARN_UNUSED_RESULT +bool TryTraceMemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size, + AccessType typ); +WARN_UNUSED_RESULT +bool TryTraceMemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size, + AccessType typ); +void TraceMemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size, + AccessType typ); +void TraceFunc(ThreadState *thr, uptr pc = 0); +void TraceMutexLock(ThreadState *thr, EventType type, uptr pc, uptr addr, + StackID stk); +void TraceMutexUnlock(ThreadState *thr, uptr addr); +void TraceTime(ThreadState *thr); + +} // namespace v3 + +#if !SANITIZER_GO +extern void (*on_initialize)(void); +extern int (*on_finalize)(int); +#endif + } // namespace __tsan #endif // TSAN_RTL_H diff --git a/libsanitizer/tsan/tsan_rtl_mutex.cpp b/libsanitizer/tsan/tsan_rtl_mutex.cpp index a214a33..7d6b411 100644 --- a/libsanitizer/tsan/tsan_rtl_mutex.cpp +++ b/libsanitizer/tsan/tsan_rtl_mutex.cpp @@ -35,7 +35,7 @@ struct Callback final : public DDCallback { DDCallback::lt = thr->dd_lt; } - u32 Unwind() override { return CurrentStackId(thr, pc); } + StackID Unwind() override { return CurrentStackId(thr, pc); } int UniqueTid() override { return thr->unique_id; } }; @@ -53,7 +53,7 @@ static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ, return; if (!ShouldReport(thr, typ)) return; - ThreadRegistryLock l(ctx->thread_registry); + ThreadRegistryLock l(&ctx->thread_registry); ScopedReport rep(typ); rep.AddMutex(mid); VarSizeStackTrace trace; @@ -68,46 +68,49 @@ void MutexCreate(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { if (!(flagz & MutexFlagLinkerInit) && IsAppMem(addr)) { CHECK(!thr->is_freeing); thr->is_freeing = true; - MemoryWrite(thr, pc, addr, kSizeLog1); + MemoryAccess(thr, pc, addr, 1, kAccessWrite); thr->is_freeing = false; } - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + Lock l(&s->mtx); s->SetFlags(flagz & MutexCreationFlagMask); + // Save stack in the case the sync object was created before as atomic. if (!SANITIZER_GO && s->creation_stack_id == 0) s->creation_stack_id = CurrentStackId(thr, pc); - s->mtx.Unlock(); } void MutexDestroy(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { DPrintf("#%d: MutexDestroy %zx\n", thr->tid, addr); - SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr, true); - if (s == 0) - return; - if ((flagz & MutexFlagLinkerInit) - || s->IsFlagSet(MutexFlagLinkerInit) - || ((flagz & MutexFlagNotStatic) && !s->IsFlagSet(MutexFlagNotStatic))) { - // Destroy is no-op for linker-initialized mutexes. 
- s->mtx.Unlock(); - return; - } - if (common_flags()->detect_deadlocks) { - Callback cb(thr, pc); - ctx->dd->MutexDestroy(&cb, &s->dd); - ctx->dd->MutexInit(&cb, &s->dd); - } bool unlock_locked = false; - if (flags()->report_destroy_locked && s->owner_tid != kInvalidTid && - !s->IsFlagSet(MutexFlagBroken)) { - s->SetFlags(MutexFlagBroken); - unlock_locked = true; + u64 mid = 0; + u64 last_lock = 0; + { + SyncVar *s = ctx->metamap.GetSyncIfExists(addr); + if (s == 0) + return; + Lock l(&s->mtx); + if ((flagz & MutexFlagLinkerInit) || s->IsFlagSet(MutexFlagLinkerInit) || + ((flagz & MutexFlagNotStatic) && !s->IsFlagSet(MutexFlagNotStatic))) { + // Destroy is no-op for linker-initialized mutexes. + return; + } + if (common_flags()->detect_deadlocks) { + Callback cb(thr, pc); + ctx->dd->MutexDestroy(&cb, &s->dd); + ctx->dd->MutexInit(&cb, &s->dd); + } + if (flags()->report_destroy_locked && s->owner_tid != kInvalidTid && + !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); + unlock_locked = true; + } + mid = s->GetId(); + last_lock = s->last_lock; + if (!unlock_locked) + s->Reset(thr->proc()); // must not reset it before the report is printed } - u64 mid = s->GetId(); - u64 last_lock = s->last_lock; - if (!unlock_locked) - s->Reset(thr->proc()); // must not reset it before the report is printed - s->mtx.Unlock(); if (unlock_locked && ShouldReport(thr, ReportTypeMutexDestroyLocked)) { - ThreadRegistryLock l(ctx->thread_registry); + ThreadRegistryLock l(&ctx->thread_registry); ScopedReport rep(ReportTypeMutexDestroyLocked); rep.AddMutex(mid); VarSizeStackTrace trace; @@ -119,38 +122,35 @@ void MutexDestroy(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { rep.AddLocation(addr, 1); OutputReport(thr, rep); - SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr, true); + SyncVar *s = ctx->metamap.GetSyncIfExists(addr); if (s != 0) { + Lock l(&s->mtx); s->Reset(thr->proc()); - s->mtx.Unlock(); } } thr->mset.Remove(mid); // Imitate a memory write to catch unlock-destroy races. // Do this outside of sync mutex, because it can report a race which locks // sync mutexes. - if (IsAppMem(addr)) { - CHECK(!thr->is_freeing); - thr->is_freeing = true; - MemoryWrite(thr, pc, addr, kSizeLog1); - thr->is_freeing = false; - } + if (IsAppMem(addr)) + MemoryAccess(thr, pc, addr, 1, kAccessWrite | kAccessFree); // s will be destroyed and freed in MetaMap::FreeBlock. 
} void MutexPreLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { DPrintf("#%d: MutexPreLock %zx flagz=0x%x\n", thr->tid, addr, flagz); if (!(flagz & MutexFlagTryLock) && common_flags()->detect_deadlocks) { - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, false); - s->UpdateFlags(flagz); - if (s->owner_tid != thr->tid) { - Callback cb(thr, pc); - ctx->dd->MutexBeforeLock(&cb, &s->dd, true); - s->mtx.ReadUnlock(); - ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); - } else { - s->mtx.ReadUnlock(); + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + { + ReadLock l(&s->mtx); + s->UpdateFlags(flagz); + if (s->owner_tid != thr->tid) { + Callback cb(thr, pc); + ctx->dd->MutexBeforeLock(&cb, &s->dd, true); + } } + Callback cb(thr, pc); + ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); } } @@ -162,43 +162,45 @@ void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz, int rec) { else rec = 1; if (IsAppMem(addr)) - MemoryReadAtomic(thr, pc, addr, kSizeLog1); - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); - s->UpdateFlags(flagz); - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeLock, s->GetId()); - bool report_double_lock = false; - if (s->owner_tid == kInvalidTid) { - CHECK_EQ(s->recursion, 0); - s->owner_tid = thr->tid; - s->last_lock = thr->fast_state.raw(); - } else if (s->owner_tid == thr->tid) { - CHECK_GT(s->recursion, 0); - } else if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { - s->SetFlags(MutexFlagBroken); - report_double_lock = true; - } - const bool first = s->recursion == 0; - s->recursion += rec; - if (first) { - AcquireImpl(thr, pc, &s->clock); - AcquireImpl(thr, pc, &s->read_clock); - } else if (!s->IsFlagSet(MutexFlagWriteReentrant)) { - } - thr->mset.Add(s->GetId(), true, thr->fast_state.epoch()); + MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic); + u64 mid = 0; bool pre_lock = false; - if (first && common_flags()->detect_deadlocks) { - pre_lock = (flagz & MutexFlagDoPreLockOnPostLock) && - !(flagz & MutexFlagTryLock); - Callback cb(thr, pc); - if (pre_lock) - ctx->dd->MutexBeforeLock(&cb, &s->dd, true); - ctx->dd->MutexAfterLock(&cb, &s->dd, true, flagz & MutexFlagTryLock); + bool first = false; + bool report_double_lock = false; + { + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + Lock l(&s->mtx); + s->UpdateFlags(flagz); + thr->fast_state.IncrementEpoch(); + TraceAddEvent(thr, thr->fast_state, EventTypeLock, s->GetId()); + if (s->owner_tid == kInvalidTid) { + CHECK_EQ(s->recursion, 0); + s->owner_tid = thr->tid; + s->last_lock = thr->fast_state.raw(); + } else if (s->owner_tid == thr->tid) { + CHECK_GT(s->recursion, 0); + } else if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); + report_double_lock = true; + } + first = s->recursion == 0; + s->recursion += rec; + if (first) { + AcquireImpl(thr, pc, &s->clock); + AcquireImpl(thr, pc, &s->read_clock); + } else if (!s->IsFlagSet(MutexFlagWriteReentrant)) { + } + thr->mset.Add(s->GetId(), true, thr->fast_state.epoch()); + if (first && common_flags()->detect_deadlocks) { + pre_lock = + (flagz & MutexFlagDoPreLockOnPostLock) && !(flagz & MutexFlagTryLock); + Callback cb(thr, pc); + if (pre_lock) + ctx->dd->MutexBeforeLock(&cb, &s->dd, true); + ctx->dd->MutexAfterLock(&cb, &s->dd, true, flagz & MutexFlagTryLock); + } + mid = s->GetId(); } - u64 mid = s->GetId(); - s->mtx.Unlock(); - // Can't touch s after this point. 
- s = 0; if (report_double_lock) ReportMutexMisuse(thr, pc, ReportTypeMutexDoubleLock, addr, mid); if (first && pre_lock && common_flags()->detect_deadlocks) { @@ -210,35 +212,37 @@ void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz, int rec) { int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { DPrintf("#%d: MutexUnlock %zx flagz=0x%x\n", thr->tid, addr, flagz); if (IsAppMem(addr)) - MemoryReadAtomic(thr, pc, addr, kSizeLog1); - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeUnlock, s->GetId()); - int rec = 0; + MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic); + u64 mid = 0; bool report_bad_unlock = false; - if (!SANITIZER_GO && (s->recursion == 0 || s->owner_tid != thr->tid)) { - if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { - s->SetFlags(MutexFlagBroken); - report_bad_unlock = true; - } - } else { - rec = (flagz & MutexFlagRecursiveUnlock) ? s->recursion : 1; - s->recursion -= rec; - if (s->recursion == 0) { - s->owner_tid = kInvalidTid; - ReleaseStoreImpl(thr, pc, &s->clock); + int rec = 0; + { + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + Lock l(&s->mtx); + thr->fast_state.IncrementEpoch(); + TraceAddEvent(thr, thr->fast_state, EventTypeUnlock, s->GetId()); + if (!SANITIZER_GO && (s->recursion == 0 || s->owner_tid != thr->tid)) { + if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); + report_bad_unlock = true; + } } else { + rec = (flagz & MutexFlagRecursiveUnlock) ? s->recursion : 1; + s->recursion -= rec; + if (s->recursion == 0) { + s->owner_tid = kInvalidTid; + ReleaseStoreImpl(thr, pc, &s->clock); + } else { + } } + thr->mset.Del(s->GetId(), true); + if (common_flags()->detect_deadlocks && s->recursion == 0 && + !report_bad_unlock) { + Callback cb(thr, pc); + ctx->dd->MutexBeforeUnlock(&cb, &s->dd, true); + } + mid = s->GetId(); } - thr->mset.Del(s->GetId(), true); - if (common_flags()->detect_deadlocks && s->recursion == 0 && - !report_bad_unlock) { - Callback cb(thr, pc); - ctx->dd->MutexBeforeUnlock(&cb, &s->dd, true); - } - u64 mid = s->GetId(); - s->mtx.Unlock(); - // Can't touch s after this point. 
if (report_bad_unlock) ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, mid); if (common_flags()->detect_deadlocks && !report_bad_unlock) { @@ -251,11 +255,14 @@ int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { void MutexPreReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { DPrintf("#%d: MutexPreReadLock %zx flagz=0x%x\n", thr->tid, addr, flagz); if (!(flagz & MutexFlagTryLock) && common_flags()->detect_deadlocks) { - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, false); - s->UpdateFlags(flagz); + { + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + ReadLock l(&s->mtx); + s->UpdateFlags(flagz); + Callback cb(thr, pc); + ctx->dd->MutexBeforeLock(&cb, &s->dd, false); + } Callback cb(thr, pc); - ctx->dd->MutexBeforeLock(&cb, &s->dd, false); - s->mtx.ReadUnlock(); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); } } @@ -263,34 +270,35 @@ void MutexPreReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { void MutexPostReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { DPrintf("#%d: MutexPostReadLock %zx flagz=0x%x\n", thr->tid, addr, flagz); if (IsAppMem(addr)) - MemoryReadAtomic(thr, pc, addr, kSizeLog1); - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, false); - s->UpdateFlags(flagz); - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeRLock, s->GetId()); + MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic); + u64 mid = 0; bool report_bad_lock = false; - if (s->owner_tid != kInvalidTid) { - if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { - s->SetFlags(MutexFlagBroken); - report_bad_lock = true; - } - } - AcquireImpl(thr, pc, &s->clock); - s->last_lock = thr->fast_state.raw(); - thr->mset.Add(s->GetId(), false, thr->fast_state.epoch()); bool pre_lock = false; - if (common_flags()->detect_deadlocks) { - pre_lock = (flagz & MutexFlagDoPreLockOnPostLock) && - !(flagz & MutexFlagTryLock); - Callback cb(thr, pc); - if (pre_lock) - ctx->dd->MutexBeforeLock(&cb, &s->dd, false); - ctx->dd->MutexAfterLock(&cb, &s->dd, false, flagz & MutexFlagTryLock); + { + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + ReadLock l(&s->mtx); + s->UpdateFlags(flagz); + thr->fast_state.IncrementEpoch(); + TraceAddEvent(thr, thr->fast_state, EventTypeRLock, s->GetId()); + if (s->owner_tid != kInvalidTid) { + if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); + report_bad_lock = true; + } + } + AcquireImpl(thr, pc, &s->clock); + s->last_lock = thr->fast_state.raw(); + thr->mset.Add(s->GetId(), false, thr->fast_state.epoch()); + if (common_flags()->detect_deadlocks) { + pre_lock = + (flagz & MutexFlagDoPreLockOnPostLock) && !(flagz & MutexFlagTryLock); + Callback cb(thr, pc); + if (pre_lock) + ctx->dd->MutexBeforeLock(&cb, &s->dd, false); + ctx->dd->MutexAfterLock(&cb, &s->dd, false, flagz & MutexFlagTryLock); + } + mid = s->GetId(); } - u64 mid = s->GetId(); - s->mtx.ReadUnlock(); - // Can't touch s after this point. 
- s = 0; if (report_bad_lock) ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadLock, addr, mid); if (pre_lock && common_flags()->detect_deadlocks) { @@ -302,25 +310,27 @@ void MutexPostReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: MutexReadUnlock %zx\n", thr->tid, addr); if (IsAppMem(addr)) - MemoryReadAtomic(thr, pc, addr, kSizeLog1); - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId()); + MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic); + u64 mid = 0; bool report_bad_unlock = false; - if (s->owner_tid != kInvalidTid) { - if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { - s->SetFlags(MutexFlagBroken); - report_bad_unlock = true; + { + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + Lock l(&s->mtx); + thr->fast_state.IncrementEpoch(); + TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId()); + if (s->owner_tid != kInvalidTid) { + if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); + report_bad_unlock = true; + } } + ReleaseImpl(thr, pc, &s->read_clock); + if (common_flags()->detect_deadlocks && s->recursion == 0) { + Callback cb(thr, pc); + ctx->dd->MutexBeforeUnlock(&cb, &s->dd, false); + } + mid = s->GetId(); } - ReleaseImpl(thr, pc, &s->read_clock); - if (common_flags()->detect_deadlocks && s->recursion == 0) { - Callback cb(thr, pc); - ctx->dd->MutexBeforeUnlock(&cb, &s->dd, false); - } - u64 mid = s->GetId(); - s->mtx.Unlock(); - // Can't touch s after this point. thr->mset.Del(mid, false); if (report_bad_unlock) ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadUnlock, addr, mid); @@ -333,39 +343,41 @@ void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr) { void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: MutexReadOrWriteUnlock %zx\n", thr->tid, addr); if (IsAppMem(addr)) - MemoryReadAtomic(thr, pc, addr, kSizeLog1); - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); - bool write = true; + MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic); + u64 mid = 0; bool report_bad_unlock = false; - if (s->owner_tid == kInvalidTid) { - // Seems to be read unlock. - write = false; - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId()); - ReleaseImpl(thr, pc, &s->read_clock); - } else if (s->owner_tid == thr->tid) { - // Seems to be write unlock. - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeUnlock, s->GetId()); - CHECK_GT(s->recursion, 0); - s->recursion--; - if (s->recursion == 0) { - s->owner_tid = kInvalidTid; - ReleaseStoreImpl(thr, pc, &s->clock); - } else { + { + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + Lock l(&s->mtx); + bool write = true; + if (s->owner_tid == kInvalidTid) { + // Seems to be read unlock. + write = false; + thr->fast_state.IncrementEpoch(); + TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId()); + ReleaseImpl(thr, pc, &s->read_clock); + } else if (s->owner_tid == thr->tid) { + // Seems to be write unlock. 
+ thr->fast_state.IncrementEpoch(); + TraceAddEvent(thr, thr->fast_state, EventTypeUnlock, s->GetId()); + CHECK_GT(s->recursion, 0); + s->recursion--; + if (s->recursion == 0) { + s->owner_tid = kInvalidTid; + ReleaseStoreImpl(thr, pc, &s->clock); + } else { + } + } else if (!s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); + report_bad_unlock = true; } - } else if (!s->IsFlagSet(MutexFlagBroken)) { - s->SetFlags(MutexFlagBroken); - report_bad_unlock = true; - } - thr->mset.Del(s->GetId(), write); - if (common_flags()->detect_deadlocks && s->recursion == 0) { - Callback cb(thr, pc); - ctx->dd->MutexBeforeUnlock(&cb, &s->dd, write); + thr->mset.Del(s->GetId(), write); + if (common_flags()->detect_deadlocks && s->recursion == 0) { + Callback cb(thr, pc); + ctx->dd->MutexBeforeUnlock(&cb, &s->dd, write); + } + mid = s->GetId(); } - u64 mid = s->GetId(); - s->mtx.Unlock(); - // Can't touch s after this point. if (report_bad_unlock) ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, mid); if (common_flags()->detect_deadlocks) { @@ -376,29 +388,27 @@ void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) { void MutexRepair(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: MutexRepair %zx\n", thr->tid, addr); - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + Lock l(&s->mtx); s->owner_tid = kInvalidTid; s->recursion = 0; - s->mtx.Unlock(); } void MutexInvalidAccess(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: MutexInvalidAccess %zx\n", thr->tid, addr); - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); - u64 mid = s->GetId(); - s->mtx.Unlock(); - ReportMutexMisuse(thr, pc, ReportTypeMutexInvalidAccess, addr, mid); + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + ReportMutexMisuse(thr, pc, ReportTypeMutexInvalidAccess, addr, s->GetId()); } void Acquire(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: Acquire %zx\n", thr->tid, addr); if (thr->ignore_sync) return; - SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr, false); + SyncVar *s = ctx->metamap.GetSyncIfExists(addr); if (!s) return; + ReadLock l(&s->mtx); AcquireImpl(thr, pc, &s->clock); - s->mtx.ReadUnlock(); } static void UpdateClockCallback(ThreadContextBase *tctx_base, void *arg) { @@ -412,49 +422,48 @@ static void UpdateClockCallback(ThreadContextBase *tctx_base, void *arg) { thr->clock.set(&thr->proc()->clock_cache, tctx->tid, epoch); } -void AcquireGlobal(ThreadState *thr, uptr pc) { +void AcquireGlobal(ThreadState *thr) { DPrintf("#%d: AcquireGlobal\n", thr->tid); if (thr->ignore_sync) return; - ThreadRegistryLock l(ctx->thread_registry); - ctx->thread_registry->RunCallbackForEachThreadLocked( - UpdateClockCallback, thr); + ThreadRegistryLock l(&ctx->thread_registry); + ctx->thread_registry.RunCallbackForEachThreadLocked(UpdateClockCallback, thr); } void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: ReleaseStoreAcquire %zx\n", thr->tid, addr); if (thr->ignore_sync) return; - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false); + Lock l(&s->mtx); thr->fast_state.IncrementEpoch(); // Can't increment epoch w/o writing to the trace as well. 
TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); ReleaseStoreAcquireImpl(thr, pc, &s->clock); - s->mtx.Unlock(); } void Release(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: Release %zx\n", thr->tid, addr); if (thr->ignore_sync) return; - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false); + Lock l(&s->mtx); thr->fast_state.IncrementEpoch(); // Can't increment epoch w/o writing to the trace as well. TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); ReleaseImpl(thr, pc, &s->clock); - s->mtx.Unlock(); } void ReleaseStore(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: ReleaseStore %zx\n", thr->tid, addr); if (thr->ignore_sync) return; - SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false); + Lock l(&s->mtx); thr->fast_state.IncrementEpoch(); // Can't increment epoch w/o writing to the trace as well. TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); ReleaseStoreImpl(thr, pc, &s->clock); - s->mtx.Unlock(); } #if !SANITIZER_GO @@ -468,13 +477,13 @@ static void UpdateSleepClockCallback(ThreadContextBase *tctx_base, void *arg) { } void AfterSleep(ThreadState *thr, uptr pc) { - DPrintf("#%d: AfterSleep %zx\n", thr->tid); + DPrintf("#%d: AfterSleep\n", thr->tid); if (thr->ignore_sync) return; thr->last_sleep_stack_id = CurrentStackId(thr, pc); - ThreadRegistryLock l(ctx->thread_registry); - ctx->thread_registry->RunCallbackForEachThreadLocked( - UpdateSleepClockCallback, thr); + ThreadRegistryLock l(&ctx->thread_registry); + ctx->thread_registry.RunCallbackForEachThreadLocked(UpdateSleepClockCallback, + thr); } #endif @@ -520,7 +529,7 @@ void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) { void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) { if (r == 0 || !ShouldReport(thr, ReportTypeDeadlock)) return; - ThreadRegistryLock l(ctx->thread_registry); + ThreadRegistryLock l(&ctx->thread_registry); ScopedReport rep(ReportTypeDeadlock); for (int i = 0; i < r->n; i++) { rep.AddMutex(r->loop[i].mtx_ctx0); diff --git a/libsanitizer/tsan/tsan_rtl_ppc64.S b/libsanitizer/tsan/tsan_rtl_ppc64.S index 9e533a7..8285e21 100644 --- a/libsanitizer/tsan/tsan_rtl_ppc64.S +++ b/libsanitizer/tsan/tsan_rtl_ppc64.S @@ -1,6 +1,5 @@ #include "tsan_ppc_regs.h" - .machine altivec .section .text .hidden __tsan_setjmp .globl _setjmp diff --git a/libsanitizer/tsan/tsan_rtl_report.cpp b/libsanitizer/tsan/tsan_rtl_report.cpp index 706794f..1f0bcb3 100644 --- a/libsanitizer/tsan/tsan_rtl_report.cpp +++ b/libsanitizer/tsan/tsan_rtl_report.cpp @@ -68,8 +68,10 @@ static void StackStripMain(SymbolizedStack *frames) { } else if (last && 0 == internal_strcmp(last, "__tsan_thread_start_func")) { last_frame->ClearAll(); last_frame2->next = nullptr; - // Strip global ctors init. - } else if (last && 0 == internal_strcmp(last, "__do_global_ctors_aux")) { + // Strip global ctors init, .preinit_array and main caller. + } else if (last && (0 == internal_strcmp(last, "__do_global_ctors_aux") || + 0 == internal_strcmp(last, "__libc_csu_init") || + 0 == internal_strcmp(last, "__libc_start_main"))) { last_frame->ClearAll(); last_frame2->next = nullptr; // If both are 0, then we probably just failed to symbolize. 
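
The report hunks that follow switch from explicit internal_alloc() plus placement-new pairs to a New<>() helper, and keep using DestroyAndFree(). Their definitions are not part of this diff; the sketch below is a plausible stand-in inferred only from the call sites being replaced, with plain malloc/free standing in for the runtime's internal allocator.

  #include <cstdlib>
  #include <new>
  #include <utility>
  // Hypothetical sketch, not the real helpers.
  template <typename T, typename... Args>
  T *New(Args &&...args) {
    void *mem = std::malloc(sizeof(T));             // stand-in for internal_alloc(...)
    return new (mem) T(std::forward<Args>(args)...);
  }
  template <typename T>
  void DestroyAndFree(T *p) {
    p->~T();                                        // run the destructor explicitly
    std::free(p);                                   // stand-in for internal_free(...)
  }
  // Usage pattern as in the hunks below:
  //   auto *stack = New<ReportStack>();  ...  DestroyAndFree(stack);
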
@@ -120,7 +122,7 @@ static ReportStack *SymbolizeStack(StackTrace trace) { } StackStripMain(top); - ReportStack *stack = ReportStack::New(); + auto *stack = New<ReportStack>(); stack->frames = top; return stack; } @@ -129,10 +131,10 @@ bool ShouldReport(ThreadState *thr, ReportType typ) { // We set thr->suppress_reports in the fork context. // Taking any locking in the fork context can lead to deadlocks. // If any locks are already taken, it's too late to do this check. - CheckNoLocks(thr); + CheckedMutex::CheckNoLocks(); // For the same reason check we didn't lock thread_registry yet. if (SANITIZER_DEBUG) - ThreadRegistryLock l(ctx->thread_registry); + ThreadRegistryLock l(&ctx->thread_registry); if (!flags()->report_bugs || thr->suppress_reports) return false; switch (typ) { @@ -154,9 +156,8 @@ bool ShouldReport(ThreadState *thr, ReportType typ) { } ScopedReportBase::ScopedReportBase(ReportType typ, uptr tag) { - ctx->thread_registry->CheckLocked(); - void *mem = internal_alloc(MBlockReport, sizeof(ReportDesc)); - rep_ = new(mem) ReportDesc; + ctx->thread_registry.CheckLocked(); + rep_ = New<ReportDesc>(); rep_->typ = typ; rep_->tag = tag; ctx->report_mtx.Lock(); @@ -165,7 +166,6 @@ ScopedReportBase::ScopedReportBase(ReportType typ, uptr tag) { ScopedReportBase::~ScopedReportBase() { ctx->report_mtx.Unlock(); DestroyAndFree(rep_); - rep_ = nullptr; } void ScopedReportBase::AddStack(StackTrace stack, bool suppressable) { @@ -176,8 +176,7 @@ void ScopedReportBase::AddStack(StackTrace stack, bool suppressable) { void ScopedReportBase::AddMemoryAccess(uptr addr, uptr external_tag, Shadow s, StackTrace stack, const MutexSet *mset) { - void *mem = internal_alloc(MBlockReportMop, sizeof(ReportMop)); - ReportMop *mop = new(mem) ReportMop; + auto *mop = New<ReportMop>(); rep_->mops.PushBack(mop); mop->tid = s.tid(); mop->addr = addr + s.addr0(); @@ -196,7 +195,7 @@ void ScopedReportBase::AddMemoryAccess(uptr addr, uptr external_tag, Shadow s, } } -void ScopedReportBase::AddUniqueTid(int unique_tid) { +void ScopedReportBase::AddUniqueTid(Tid unique_tid) { rep_->unique_tids.PushBack(unique_tid); } @@ -205,8 +204,7 @@ void ScopedReportBase::AddThread(const ThreadContext *tctx, bool suppressable) { if ((u32)rep_->threads[i]->id == tctx->tid) return; } - void *mem = internal_alloc(MBlockReportThread, sizeof(ReportThread)); - ReportThread *rt = new(mem) ReportThread; + auto *rt = New<ReportThread>(); rep_->threads.PushBack(rt); rt->id = tctx->tid; rt->os_id = tctx->os_id; @@ -226,17 +224,17 @@ static bool FindThreadByUidLockedCallback(ThreadContextBase *tctx, void *arg) { return tctx->unique_id == (u32)unique_id; } -static ThreadContext *FindThreadByUidLocked(int unique_id) { - ctx->thread_registry->CheckLocked(); +static ThreadContext *FindThreadByUidLocked(Tid unique_id) { + ctx->thread_registry.CheckLocked(); return static_cast<ThreadContext *>( - ctx->thread_registry->FindThreadContextLocked( + ctx->thread_registry.FindThreadContextLocked( FindThreadByUidLockedCallback, &unique_id)); } -static ThreadContext *FindThreadByTidLocked(int tid) { - ctx->thread_registry->CheckLocked(); - return static_cast<ThreadContext*>( - ctx->thread_registry->GetThreadLocked(tid)); +static ThreadContext *FindThreadByTidLocked(Tid tid) { + ctx->thread_registry.CheckLocked(); + return static_cast<ThreadContext *>( + ctx->thread_registry.GetThreadLocked(tid)); } static bool IsInStackOrTls(ThreadContextBase *tctx_base, void *arg) { @@ -251,10 +249,10 @@ static bool IsInStackOrTls(ThreadContextBase *tctx_base, void *arg) { 
} ThreadContext *IsThreadStackOrTls(uptr addr, bool *is_stack) { - ctx->thread_registry->CheckLocked(); - ThreadContext *tctx = static_cast<ThreadContext*>( - ctx->thread_registry->FindThreadContextLocked(IsInStackOrTls, - (void*)addr)); + ctx->thread_registry.CheckLocked(); + ThreadContext *tctx = + static_cast<ThreadContext *>(ctx->thread_registry.FindThreadContextLocked( + IsInStackOrTls, (void *)addr)); if (!tctx) return 0; ThreadState *thr = tctx->thr; @@ -264,7 +262,7 @@ ThreadContext *IsThreadStackOrTls(uptr addr, bool *is_stack) { } #endif -void ScopedReportBase::AddThread(int unique_tid, bool suppressable) { +void ScopedReportBase::AddThread(Tid unique_tid, bool suppressable) { #if !SANITIZER_GO if (const ThreadContext *tctx = FindThreadByUidLocked(unique_tid)) AddThread(tctx, suppressable); @@ -276,8 +274,7 @@ void ScopedReportBase::AddMutex(const SyncVar *s) { if (rep_->mutexes[i]->id == s->uid) return; } - void *mem = internal_alloc(MBlockReportMutex, sizeof(ReportMutex)); - ReportMutex *rm = new(mem) ReportMutex; + auto *rm = New<ReportMutex>(); rep_->mutexes.PushBack(rm); rm->id = s->uid; rm->addr = s->addr; @@ -289,18 +286,17 @@ u64 ScopedReportBase::AddMutex(u64 id) { u64 uid = 0; u64 mid = id; uptr addr = SyncVar::SplitId(id, &uid); - SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr, true); + SyncVar *s = ctx->metamap.GetSyncIfExists(addr); // Check that the mutex is still alive. // Another mutex can be created at the same address, // so check uid as well. if (s && s->CheckId(uid)) { + Lock l(&s->mtx); mid = s->uid; AddMutex(s); } else { AddDeadMutex(id); } - if (s) - s->mtx.Unlock(); return mid; } @@ -309,8 +305,7 @@ void ScopedReportBase::AddDeadMutex(u64 id) { if (rep_->mutexes[i]->id == id) return; } - void *mem = internal_alloc(MBlockReportMutex, sizeof(ReportMutex)); - ReportMutex *rm = new(mem) ReportMutex; + auto *rm = New<ReportMutex>(); rep_->mutexes.PushBack(rm); rm->id = id; rm->addr = 0; @@ -323,10 +318,11 @@ void ScopedReportBase::AddLocation(uptr addr, uptr size) { return; #if !SANITIZER_GO int fd = -1; - int creat_tid = kInvalidTid; - u32 creat_stack = 0; + Tid creat_tid = kInvalidTid; + StackID creat_stack = 0; if (FdLocation(addr, &fd, &creat_tid, &creat_stack)) { - ReportLocation *loc = ReportLocation::New(ReportLocationFD); + auto *loc = New<ReportLocation>(); + loc->type = ReportLocationFD; loc->fd = fd; loc->tid = creat_tid; loc->stack = SymbolizeStackId(creat_stack); @@ -337,15 +333,19 @@ void ScopedReportBase::AddLocation(uptr addr, uptr size) { return; } MBlock *b = 0; + uptr block_begin = 0; Allocator *a = allocator(); if (a->PointerIsMine((void*)addr)) { - void *block_begin = a->GetBlockBegin((void*)addr); + block_begin = (uptr)a->GetBlockBegin((void *)addr); if (block_begin) - b = ctx->metamap.GetBlock((uptr)block_begin); + b = ctx->metamap.GetBlock(block_begin); } + if (!b) + b = JavaHeapBlock(addr, &block_begin); if (b != 0) { ThreadContext *tctx = FindThreadByTidLocked(b->tid); - ReportLocation *loc = ReportLocation::New(ReportLocationHeap); + auto *loc = New<ReportLocation>(); + loc->type = ReportLocationHeap; loc->heap_chunk_start = (uptr)allocator()->GetBlockBegin((void *)addr); loc->heap_chunk_size = b->siz; loc->external_tag = b->tag; @@ -358,8 +358,8 @@ void ScopedReportBase::AddLocation(uptr addr, uptr size) { } bool is_stack = false; if (ThreadContext *tctx = IsThreadStackOrTls(addr, &is_stack)) { - ReportLocation *loc = - ReportLocation::New(is_stack ? 
ReportLocationStack : ReportLocationTLS); + auto *loc = New<ReportLocation>(); + loc->type = is_stack ? ReportLocationStack : ReportLocationTLS; loc->tid = tctx->tid; rep_->locs.PushBack(loc); AddThread(tctx); @@ -373,7 +373,7 @@ void ScopedReportBase::AddLocation(uptr addr, uptr size) { } #if !SANITIZER_GO -void ScopedReportBase::AddSleep(u32 stack_id) { +void ScopedReportBase::AddSleep(StackID stack_id) { rep_->sleep = SymbolizeStackId(stack_id); } #endif @@ -387,7 +387,7 @@ ScopedReport::ScopedReport(ReportType typ, uptr tag) ScopedReport::~ScopedReport() {} -void RestoreStack(int tid, const u64 epoch, VarSizeStackTrace *stk, +void RestoreStack(Tid tid, const u64 epoch, VarSizeStackTrace *stk, MutexSet *mset, uptr *tag) { // This function restores stack trace and mutex set for the thread/epoch. // It does so by getting stack trace and mutex set at the beginning of @@ -450,6 +450,234 @@ void RestoreStack(int tid, const u64 epoch, VarSizeStackTrace *stk, ExtractTagFromStack(stk, tag); } +namespace v3 { + +// Replays the trace up to last_pos position in the last part +// or up to the provided epoch/sid (whichever is earlier) +// and calls the provided function f for each event. +template <typename Func> +void TraceReplay(Trace *trace, TracePart *last, Event *last_pos, Sid sid, + Epoch epoch, Func f) { + TracePart *part = trace->parts.Front(); + Sid ev_sid = kFreeSid; + Epoch ev_epoch = kEpochOver; + for (;;) { + DCHECK_EQ(part->trace, trace); + // Note: an event can't start in the last element. + // Since an event can take up to 2 elements, + // we ensure we have at least 2 before adding an event. + Event *end = &part->events[TracePart::kSize - 1]; + if (part == last) + end = last_pos; + for (Event *evp = &part->events[0]; evp < end; evp++) { + Event *evp0 = evp; + if (!evp->is_access && !evp->is_func) { + switch (evp->type) { + case EventType::kTime: { + auto *ev = reinterpret_cast<EventTime *>(evp); + ev_sid = static_cast<Sid>(ev->sid); + ev_epoch = static_cast<Epoch>(ev->epoch); + if (ev_sid == sid && ev_epoch > epoch) + return; + break; + } + case EventType::kAccessExt: + FALLTHROUGH; + case EventType::kAccessRange: + FALLTHROUGH; + case EventType::kLock: + FALLTHROUGH; + case EventType::kRLock: + // These take 2 Event elements. + evp++; + break; + case EventType::kUnlock: + // This takes 1 Event element. + break; + } + } + CHECK_NE(ev_sid, kFreeSid); + CHECK_NE(ev_epoch, kEpochOver); + f(ev_sid, ev_epoch, evp0); + } + if (part == last) + return; + part = trace->parts.Next(part); + CHECK(part); + } + CHECK(0); +} + +static void RestoreStackMatch(VarSizeStackTrace *pstk, MutexSet *pmset, + Vector<uptr> *stack, MutexSet *mset, uptr pc, + bool *found) { + DPrintf2(" MATCHED\n"); + *pmset = *mset; + stack->PushBack(pc); + pstk->Init(&(*stack)[0], stack->Size()); + stack->PopBack(); + *found = true; +} + +// Checks if addr1|size1 is fully contained in addr2|size2. +// We check for fully contained instread of just overlapping +// because a memory access is always traced once, but can be +// split into multiple accesses in the shadow. +static constexpr bool IsWithinAccess(uptr addr1, uptr size1, uptr addr2, + uptr size2) { + return addr1 >= addr2 && addr1 + size1 <= addr2 + size2; +} + +// Replays the trace of thread tid up to the target event identified +// by sid/epoch/addr/size/typ and restores and returns stack, mutex set +// and tag for that event. If there are multiple such events, it returns +// the last one. Returns false if the event is not present in the trace. 
+bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr, + uptr size, AccessType typ, VarSizeStackTrace *pstk, + MutexSet *pmset, uptr *ptag) { + // This function restores stack trace and mutex set for the thread/epoch. + // It does so by getting stack trace and mutex set at the beginning of + // trace part, and then replaying the trace till the given epoch. + DPrintf2("RestoreStack: tid=%u sid=%u@%u addr=0x%zx/%zu typ=%x\n", tid, + static_cast<int>(sid), static_cast<int>(epoch), addr, size, + static_cast<int>(typ)); + ctx->slot_mtx.CheckLocked(); // needed to prevent trace part recycling + ctx->thread_registry.CheckLocked(); + ThreadContext *tctx = + static_cast<ThreadContext *>(ctx->thread_registry.GetThreadLocked(tid)); + Trace *trace = &tctx->trace; + // Snapshot first/last parts and the current position in the last part. + TracePart *first_part; + TracePart *last_part; + Event *last_pos; + { + Lock lock(&trace->mtx); + first_part = trace->parts.Front(); + if (!first_part) + return false; + last_part = trace->parts.Back(); + last_pos = trace->final_pos; + if (tctx->thr) + last_pos = (Event *)atomic_load_relaxed(&tctx->thr->trace_pos); + } + // Too large for stack. + alignas(MutexSet) static char mset_storage[sizeof(MutexSet)]; + MutexSet &mset = *new (mset_storage) MutexSet(); + Vector<uptr> stack; + uptr prev_pc = 0; + bool found = false; + bool is_read = typ & kAccessRead; + bool is_atomic = typ & kAccessAtomic; + bool is_free = typ & kAccessFree; + TraceReplay( + trace, last_part, last_pos, sid, epoch, + [&](Sid ev_sid, Epoch ev_epoch, Event *evp) { + bool match = ev_sid == sid && ev_epoch == epoch; + if (evp->is_access) { + if (evp->is_func == 0 && evp->type == EventType::kAccessExt && + evp->_ == 0) // NopEvent + return; + auto *ev = reinterpret_cast<EventAccess *>(evp); + uptr ev_addr = RestoreAddr(ev->addr); + uptr ev_size = 1 << ev->size_log; + uptr ev_pc = + prev_pc + ev->pc_delta - (1 << (EventAccess::kPCBits - 1)); + prev_pc = ev_pc; + DPrintf2(" Access: pc=0x%zx addr=0x%zx/%zu type=%u/%u\n", ev_pc, + ev_addr, ev_size, ev->is_read, ev->is_atomic); + if (match && type == EventType::kAccessExt && + IsWithinAccess(addr, size, ev_addr, ev_size) && + is_read == ev->is_read && is_atomic == ev->is_atomic && !is_free) + RestoreStackMatch(pstk, pmset, &stack, &mset, ev_pc, &found); + return; + } + if (evp->is_func) { + auto *ev = reinterpret_cast<EventFunc *>(evp); + if (ev->pc) { + DPrintf2(" FuncEnter: pc=0x%llx\n", ev->pc); + stack.PushBack(ev->pc); + } else { + DPrintf2(" FuncExit\n"); + CHECK(stack.Size()); + stack.PopBack(); + } + return; + } + switch (evp->type) { + case EventType::kAccessExt: { + auto *ev = reinterpret_cast<EventAccessExt *>(evp); + uptr ev_addr = RestoreAddr(ev->addr); + uptr ev_size = 1 << ev->size_log; + prev_pc = ev->pc; + DPrintf2(" AccessExt: pc=0x%llx addr=0x%zx/%zu type=%u/%u\n", + ev->pc, ev_addr, ev_size, ev->is_read, ev->is_atomic); + if (match && type == EventType::kAccessExt && + IsWithinAccess(addr, size, ev_addr, ev_size) && + is_read == ev->is_read && is_atomic == ev->is_atomic && + !is_free) + RestoreStackMatch(pstk, pmset, &stack, &mset, ev->pc, &found); + break; + } + case EventType::kAccessRange: { + auto *ev = reinterpret_cast<EventAccessRange *>(evp); + uptr ev_addr = RestoreAddr(ev->addr); + uptr ev_size = + (ev->size_hi << EventAccessRange::kSizeLoBits) + ev->size_lo; + uptr ev_pc = RestoreAddr(ev->pc); + prev_pc = ev_pc; + DPrintf2(" Range: pc=0x%zx addr=0x%zx/%zu type=%u/%u\n", ev_pc, + ev_addr, ev_size, 
ev->is_read, ev->is_free); + if (match && type == EventType::kAccessExt && + IsWithinAccess(addr, size, ev_addr, ev_size) && + is_read == ev->is_read && !is_atomic && is_free == ev->is_free) + RestoreStackMatch(pstk, pmset, &stack, &mset, ev_pc, &found); + break; + } + case EventType::kLock: + FALLTHROUGH; + case EventType::kRLock: { + auto *ev = reinterpret_cast<EventLock *>(evp); + bool is_write = ev->type == EventType::kLock; + uptr ev_addr = RestoreAddr(ev->addr); + uptr ev_pc = RestoreAddr(ev->pc); + StackID stack_id = + (ev->stack_hi << EventLock::kStackIDLoBits) + ev->stack_lo; + DPrintf2(" Lock: pc=0x%zx addr=0x%zx stack=%u write=%d\n", ev_pc, + ev_addr, stack_id, is_write); + mset.AddAddr(ev_addr, stack_id, is_write); + // Events with ev_pc == 0 are written to the beginning of trace + // part as initial mutex set (are not real). + if (match && type == EventType::kLock && addr == ev_addr && ev_pc) + RestoreStackMatch(pstk, pmset, &stack, &mset, ev_pc, &found); + break; + } + case EventType::kUnlock: { + auto *ev = reinterpret_cast<EventUnlock *>(evp); + uptr ev_addr = RestoreAddr(ev->addr); + DPrintf2(" Unlock: addr=0x%zx\n", ev_addr); + mset.DelAddr(ev_addr); + break; + } + case EventType::kTime: + // TraceReplay already extracted sid/epoch from it, + // nothing else to do here. + break; + } + }); + ExtractTagFromStack(pstk, ptag); + return found; +} + +} // namespace v3 + +bool RacyStacks::operator==(const RacyStacks &other) const { + if (hash[0] == other.hash[0] && hash[1] == other.hash[1]) + return true; + if (hash[0] == other.hash[1] && hash[1] == other.hash[0]) + return true; + return false; +} + static bool FindRacyStacks(const RacyStacks &hash) { for (uptr i = 0; i < ctx->racy_stacks.Size(); i++) { if (hash == ctx->racy_stacks[i]) { @@ -596,7 +824,7 @@ static bool RaceBetweenAtomicAndFree(ThreadState *thr) { } void ReportRace(ThreadState *thr) { - CheckNoLocks(thr); + CheckedMutex::CheckNoLocks(); // Symbolizer makes lots of intercepted calls. If we try to process them, // at best it will cause deadlocks on internal mutexes. 
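
The trace_prev_pc field added to ThreadState and the decoder used in RestoreStack above, ev_pc = prev_pc + ev->pc_delta - (1 << (EventAccess::kPCBits - 1)), imply a biased delta encoding for access PCs. A minimal round-trip sketch follows; the width kPCBits is assumed here for illustration, the real constant lives in tsan_trace.h, and when a delta does not fit the trace instead records an EventAccessExt carrying the absolute PC, as the decoder above shows.

  #include <cstdint>
  constexpr int kPCBits = 15;                       // assumed width, illustration only
  constexpr uint64_t kBias = 1ull << (kPCBits - 1);
  constexpr uint64_t kMask = (1ull << kPCBits) - 1;
  // Encoder: store the distance from the previous access PC, shifted by half
  // the field range so small negative deltas fit as well.
  constexpr uint64_t EncodePCDelta(uint64_t prev_pc, uint64_t pc) {
    return (pc - prev_pc + kBias) & kMask;
  }
  // Decoder: mirrors the expression used in RestoreStack above.
  constexpr uint64_t DecodePC(uint64_t prev_pc, uint64_t pc_delta) {
    return prev_pc + pc_delta - kBias;
  }
  static_assert(DecodePC(0x401000, EncodePCDelta(0x401000, 0x400ff0)) == 0x400ff0,
                "round-trips a small backwards jump");
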
@@ -614,7 +842,7 @@ void ReportRace(ThreadState *thr) { thr->racy_state[1] = s.raw(); } - uptr addr = ShadowToMem((uptr)thr->racy_shadow_addr); + uptr addr = ShadowToMem(thr->racy_shadow_addr); uptr addr_min = 0; uptr addr_max = 0; { @@ -692,7 +920,7 @@ void ReportRace(ThreadState *thr) { } } - ThreadRegistryLock l0(ctx->thread_registry); + ThreadRegistryLock l0(&ctx->thread_registry); ScopedReport rep(typ, tag); for (uptr i = 0; i < kMop; i++) { Shadow s(thr->racy_state[i]); @@ -702,8 +930,8 @@ void ReportRace(ThreadState *thr) { for (uptr i = 0; i < kMop; i++) { FastState s(thr->racy_state[i]); - ThreadContext *tctx = static_cast<ThreadContext*>( - ctx->thread_registry->GetThreadLocked(s.tid())); + ThreadContext *tctx = static_cast<ThreadContext *>( + ctx->thread_registry.GetThreadLocked(s.tid())); if (s.epoch() < tctx->epoch0 || s.epoch() > tctx->epoch1) continue; rep.AddThread(tctx); @@ -738,9 +966,7 @@ void PrintCurrentStack(ThreadState *thr, uptr pc) { ALWAYS_INLINE USED void PrintCurrentStackSlow(uptr pc) { #if !SANITIZER_GO uptr bp = GET_CURRENT_FRAME(); - BufferedStackTrace *ptrace = - new(internal_alloc(MBlockStackTrace, sizeof(BufferedStackTrace))) - BufferedStackTrace(); + auto *ptrace = New<BufferedStackTrace>(); ptrace->Unwind(pc, bp, nullptr, false); for (uptr i = 0; i < ptrace->size / 2; i++) { diff --git a/libsanitizer/tsan/tsan_rtl_thread.cpp b/libsanitizer/tsan/tsan_rtl_thread.cpp index cdb6e60..61133a4 100644 --- a/libsanitizer/tsan/tsan_rtl_thread.cpp +++ b/libsanitizer/tsan/tsan_rtl_thread.cpp @@ -21,48 +21,14 @@ namespace __tsan { // ThreadContext implementation. -ThreadContext::ThreadContext(int tid) - : ThreadContextBase(tid) - , thr() - , sync() - , epoch0() - , epoch1() { -} +ThreadContext::ThreadContext(Tid tid) + : ThreadContextBase(tid), thr(), sync(), epoch0(), epoch1() {} #if !SANITIZER_GO ThreadContext::~ThreadContext() { } #endif -void ThreadContext::OnDead() { - CHECK_EQ(sync.size(), 0); -} - -void ThreadContext::OnJoined(void *arg) { - ThreadState *caller_thr = static_cast<ThreadState *>(arg); - AcquireImpl(caller_thr, 0, &sync); - sync.Reset(&caller_thr->proc()->clock_cache); -} - -struct OnCreatedArgs { - ThreadState *thr; - uptr pc; -}; - -void ThreadContext::OnCreated(void *arg) { - thr = 0; - if (tid == kMainTid) - return; - OnCreatedArgs *args = static_cast<OnCreatedArgs *>(arg); - if (!args->thr) // GCD workers don't have a parent thread. - return; - args->thr->fast_state.IncrementEpoch(); - // Can't increment epoch w/o writing to the trace as well. - TraceAddEvent(args->thr, args->thr->fast_state, EventTypeMop, 0); - ReleaseImpl(args->thr, 0, &sync); - creation_stack_id = CurrentStackId(args->thr, args->pc); -} - void ThreadContext::OnReset() { CHECK_EQ(sync.size(), 0); uptr trace_p = GetThreadTrace(tid); @@ -70,94 +36,15 @@ void ThreadContext::OnReset() { //!!! ReleaseMemoryToOS(GetThreadTraceHeader(tid), sizeof(Trace)); } -void ThreadContext::OnDetached(void *arg) { - ThreadState *thr1 = static_cast<ThreadState*>(arg); - sync.Reset(&thr1->proc()->clock_cache); -} - -struct OnStartedArgs { - ThreadState *thr; - uptr stk_addr; - uptr stk_size; - uptr tls_addr; - uptr tls_size; -}; - -void ThreadContext::OnStarted(void *arg) { - OnStartedArgs *args = static_cast<OnStartedArgs*>(arg); - thr = args->thr; - // RoundUp so that one trace part does not contain events - // from different threads. 
- epoch0 = RoundUp(epoch1 + 1, kTracePartSize); - epoch1 = (u64)-1; - new(thr) ThreadState(ctx, tid, unique_id, epoch0, reuse_count, - args->stk_addr, args->stk_size, args->tls_addr, args->tls_size); -#if !SANITIZER_GO - thr->shadow_stack = &ThreadTrace(thr->tid)->shadow_stack[0]; - thr->shadow_stack_pos = thr->shadow_stack; - thr->shadow_stack_end = thr->shadow_stack + kShadowStackSize; -#else - // Setup dynamic shadow stack. - const int kInitStackSize = 8; - thr->shadow_stack = (uptr*)internal_alloc(MBlockShadowStack, - kInitStackSize * sizeof(uptr)); - thr->shadow_stack_pos = thr->shadow_stack; - thr->shadow_stack_end = thr->shadow_stack + kInitStackSize; -#endif - if (common_flags()->detect_deadlocks) - thr->dd_lt = ctx->dd->CreateLogicalThread(unique_id); - thr->fast_state.SetHistorySize(flags()->history_size); - // Commit switch to the new part of the trace. - // TraceAddEvent will reset stack0/mset0 in the new part for us. - TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - - thr->fast_synch_epoch = epoch0; - AcquireImpl(thr, 0, &sync); - sync.Reset(&thr->proc()->clock_cache); - thr->is_inited = true; - DPrintf("#%d: ThreadStart epoch=%zu stk_addr=%zx stk_size=%zx " - "tls_addr=%zx tls_size=%zx\n", - tid, (uptr)epoch0, args->stk_addr, args->stk_size, - args->tls_addr, args->tls_size); -} - -void ThreadContext::OnFinished() { -#if SANITIZER_GO - internal_free(thr->shadow_stack); - thr->shadow_stack = nullptr; - thr->shadow_stack_pos = nullptr; - thr->shadow_stack_end = nullptr; -#endif - if (!detached) { - thr->fast_state.IncrementEpoch(); - // Can't increment epoch w/o writing to the trace as well. - TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - ReleaseImpl(thr, 0, &sync); - } - epoch1 = thr->fast_state.epoch(); - - if (common_flags()->detect_deadlocks) - ctx->dd->DestroyLogicalThread(thr->dd_lt); - thr->clock.ResetCached(&thr->proc()->clock_cache); -#if !SANITIZER_GO - thr->last_sleep_clock.ResetCached(&thr->proc()->clock_cache); -#endif -#if !SANITIZER_GO - PlatformCleanUpThreadState(thr); -#endif - thr->~ThreadState(); - thr = 0; -} - #if !SANITIZER_GO struct ThreadLeak { ThreadContext *tctx; int count; }; -static void MaybeReportThreadLeak(ThreadContextBase *tctx_base, void *arg) { - Vector<ThreadLeak> &leaks = *(Vector<ThreadLeak>*)arg; - ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base); +static void CollectThreadLeaks(ThreadContextBase *tctx_base, void *arg) { + auto &leaks = *static_cast<Vector<ThreadLeak> *>(arg); + auto *tctx = static_cast<ThreadContext *>(tctx_base); if (tctx->detached || tctx->status != ThreadStatusFinished) return; for (uptr i = 0; i < leaks.Size(); i++) { @@ -166,8 +53,7 @@ static void MaybeReportThreadLeak(ThreadContextBase *tctx_base, void *arg) { return; } } - ThreadLeak leak = {tctx, 1}; - leaks.PushBack(leak); + leaks.PushBack({tctx, 1}); } #endif @@ -206,10 +92,10 @@ void ThreadFinalize(ThreadState *thr) { #if !SANITIZER_GO if (!ShouldReport(thr, ReportTypeThreadLeak)) return; - ThreadRegistryLock l(ctx->thread_registry); + ThreadRegistryLock l(&ctx->thread_registry); Vector<ThreadLeak> leaks; - ctx->thread_registry->RunCallbackForEachThreadLocked( - MaybeReportThreadLeak, &leaks); + ctx->thread_registry.RunCallbackForEachThreadLocked(CollectThreadLeaks, + &leaks); for (uptr i = 0; i < leaks.Size(); i++) { ScopedReport rep(ReportTypeThreadLeak); rep.AddThread(leaks[i].tctx, true); @@ -221,20 +107,48 @@ void ThreadFinalize(ThreadState *thr) { int ThreadCount(ThreadState *thr) { uptr result; - 
ctx->thread_registry->GetNumberOfThreads(0, 0, &result); + ctx->thread_registry.GetNumberOfThreads(0, 0, &result); return (int)result; } -int ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached) { +struct OnCreatedArgs { + ThreadState *thr; + uptr pc; +}; + +Tid ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached) { OnCreatedArgs args = { thr, pc }; u32 parent_tid = thr ? thr->tid : kInvalidTid; // No parent for GCD workers. - int tid = - ctx->thread_registry->CreateThread(uid, detached, parent_tid, &args); + Tid tid = ctx->thread_registry.CreateThread(uid, detached, parent_tid, &args); DPrintf("#%d: ThreadCreate tid=%d uid=%zu\n", parent_tid, tid, uid); return tid; } -void ThreadStart(ThreadState *thr, int tid, tid_t os_id, +void ThreadContext::OnCreated(void *arg) { + thr = 0; + if (tid == kMainTid) + return; + OnCreatedArgs *args = static_cast<OnCreatedArgs *>(arg); + if (!args->thr) // GCD workers don't have a parent thread. + return; + args->thr->fast_state.IncrementEpoch(); + // Can't increment epoch w/o writing to the trace as well. + TraceAddEvent(args->thr, args->thr->fast_state, EventTypeMop, 0); + ReleaseImpl(args->thr, 0, &sync); + creation_stack_id = CurrentStackId(args->thr, args->pc); +} + +extern "C" void __tsan_stack_initialization() {} + +struct OnStartedArgs { + ThreadState *thr; + uptr stk_addr; + uptr stk_size; + uptr tls_addr; + uptr tls_size; +}; + +void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id, ThreadType thread_type) { uptr stk_addr = 0; uptr stk_size = 0; @@ -244,22 +158,13 @@ void ThreadStart(ThreadState *thr, int tid, tid_t os_id, if (thread_type != ThreadType::Fiber) GetThreadStackAndTls(tid == kMainTid, &stk_addr, &stk_size, &tls_addr, &tls_size); - - if (tid != kMainTid) { - if (stk_addr && stk_size) - MemoryRangeImitateWrite(thr, /*pc=*/ 1, stk_addr, stk_size); - - if (tls_addr && tls_size) ImitateTlsWrite(thr, tls_addr, tls_size); - } #endif - ThreadRegistry *tr = ctx->thread_registry; + ThreadRegistry *tr = &ctx->thread_registry; OnStartedArgs args = { thr, stk_addr, stk_size, tls_addr, tls_size }; tr->StartThread(tid, os_id, thread_type, &args); - tr->Lock(); - thr->tctx = (ThreadContext*)tr->GetThreadLocked(tid); - tr->Unlock(); + while (!thr->tctx->trace.parts.Empty()) thr->tctx->trace.parts.PopBack(); #if !SANITIZER_GO if (ctx->after_multithreaded_fork) { @@ -268,6 +173,51 @@ void ThreadStart(ThreadState *thr, int tid, tid_t os_id, ThreadIgnoreSyncBegin(thr, 0); } #endif + +#if !SANITIZER_GO + // Don't imitate stack/TLS writes for the main thread, + // because its initialization is synchronized with all + // subsequent threads anyway. + if (tid != kMainTid) { + if (stk_addr && stk_size) { + const uptr pc = StackTrace::GetNextInstructionPc( + reinterpret_cast<uptr>(__tsan_stack_initialization)); + MemoryRangeImitateWrite(thr, pc, stk_addr, stk_size); + } + + if (tls_addr && tls_size) + ImitateTlsWrite(thr, tls_addr, tls_size); + } +#endif +} + +void ThreadContext::OnStarted(void *arg) { + OnStartedArgs *args = static_cast<OnStartedArgs *>(arg); + thr = args->thr; + // RoundUp so that one trace part does not contain events + // from different threads. 
+ epoch0 = RoundUp(epoch1 + 1, kTracePartSize); + epoch1 = (u64)-1; + new (thr) + ThreadState(ctx, tid, unique_id, epoch0, reuse_count, args->stk_addr, + args->stk_size, args->tls_addr, args->tls_size); + if (common_flags()->detect_deadlocks) + thr->dd_lt = ctx->dd->CreateLogicalThread(unique_id); + thr->fast_state.SetHistorySize(flags()->history_size); + // Commit switch to the new part of the trace. + // TraceAddEvent will reset stack0/mset0 in the new part for us. + TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); + + thr->fast_synch_epoch = epoch0; + AcquireImpl(thr, 0, &sync); + sync.Reset(&thr->proc()->clock_cache); + thr->tctx = this; + thr->is_inited = true; + DPrintf( + "#%d: ThreadStart epoch=%zu stk_addr=%zx stk_size=%zx " + "tls_addr=%zx tls_size=%zx\n", + tid, (uptr)epoch0, args->stk_addr, args->stk_size, args->tls_addr, + args->tls_size); } void ThreadFinish(ThreadState *thr) { @@ -277,7 +227,34 @@ void ThreadFinish(ThreadState *thr) { if (thr->tls_addr && thr->tls_size) DontNeedShadowFor(thr->tls_addr, thr->tls_size); thr->is_dead = true; - ctx->thread_registry->FinishThread(thr->tid); + ctx->thread_registry.FinishThread(thr->tid); +} + +void ThreadContext::OnFinished() { +#if SANITIZER_GO + Free(thr->shadow_stack); + thr->shadow_stack_pos = nullptr; + thr->shadow_stack_end = nullptr; +#endif + if (!detached) { + thr->fast_state.IncrementEpoch(); + // Can't increment epoch w/o writing to the trace as well. + TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); + ReleaseImpl(thr, 0, &sync); + } + epoch1 = thr->fast_state.epoch(); + + if (common_flags()->detect_deadlocks) + ctx->dd->DestroyLogicalThread(thr->dd_lt); + thr->clock.ResetCached(&thr->proc()->clock_cache); +#if !SANITIZER_GO + thr->last_sleep_clock.ResetCached(&thr->proc()->clock_cache); +#endif +#if !SANITIZER_GO + PlatformCleanUpThreadState(thr); +#endif + thr->~ThreadState(); + thr = 0; } struct ConsumeThreadContext { @@ -302,35 +279,48 @@ static bool ConsumeThreadByUid(ThreadContextBase *tctx, void *arg) { return false; } -int ThreadConsumeTid(ThreadState *thr, uptr pc, uptr uid) { +Tid ThreadConsumeTid(ThreadState *thr, uptr pc, uptr uid) { ConsumeThreadContext findCtx = {uid, nullptr}; - ctx->thread_registry->FindThread(ConsumeThreadByUid, &findCtx); - int tid = findCtx.tctx ? findCtx.tctx->tid : kInvalidTid; + ctx->thread_registry.FindThread(ConsumeThreadByUid, &findCtx); + Tid tid = findCtx.tctx ? 
findCtx.tctx->tid : kInvalidTid; DPrintf("#%d: ThreadTid uid=%zu tid=%d\n", thr->tid, uid, tid); return tid; } -void ThreadJoin(ThreadState *thr, uptr pc, int tid) { +void ThreadJoin(ThreadState *thr, uptr pc, Tid tid) { CHECK_GT(tid, 0); CHECK_LT(tid, kMaxTid); DPrintf("#%d: ThreadJoin tid=%d\n", thr->tid, tid); - ctx->thread_registry->JoinThread(tid, thr); + ctx->thread_registry.JoinThread(tid, thr); } -void ThreadDetach(ThreadState *thr, uptr pc, int tid) { +void ThreadContext::OnJoined(void *arg) { + ThreadState *caller_thr = static_cast<ThreadState *>(arg); + AcquireImpl(caller_thr, 0, &sync); + sync.Reset(&caller_thr->proc()->clock_cache); +} + +void ThreadContext::OnDead() { CHECK_EQ(sync.size(), 0); } + +void ThreadDetach(ThreadState *thr, uptr pc, Tid tid) { CHECK_GT(tid, 0); CHECK_LT(tid, kMaxTid); - ctx->thread_registry->DetachThread(tid, thr); + ctx->thread_registry.DetachThread(tid, thr); +} + +void ThreadContext::OnDetached(void *arg) { + ThreadState *thr1 = static_cast<ThreadState *>(arg); + sync.Reset(&thr1->proc()->clock_cache); } -void ThreadNotJoined(ThreadState *thr, uptr pc, int tid, uptr uid) { +void ThreadNotJoined(ThreadState *thr, uptr pc, Tid tid, uptr uid) { CHECK_GT(tid, 0); CHECK_LT(tid, kMaxTid); - ctx->thread_registry->SetThreadUserId(tid, uid); + ctx->thread_registry.SetThreadUserId(tid, uid); } void ThreadSetName(ThreadState *thr, const char *name) { - ctx->thread_registry->SetThreadName(thr->tid, name); + ctx->thread_registry.SetThreadName(thr->tid, name); } void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, @@ -338,7 +328,7 @@ void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, if (size == 0) return; - u64 *shadow_mem = (u64*)MemToShadow(addr); + RawShadow *shadow_mem = MemToShadow(addr); DPrintf2("#%d: MemoryAccessRange: @%p %p size=%d is_write=%d\n", thr->tid, (void*)pc, (void*)addr, (int)size, is_write); @@ -352,14 +342,14 @@ void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, Printf("Access to non app mem %zx\n", addr + size - 1); DCHECK(IsAppMem(addr + size - 1)); } - if (!IsShadowMem((uptr)shadow_mem)) { + if (!IsShadowMem(shadow_mem)) { Printf("Bad shadow addr %p (%zx)\n", shadow_mem, addr); - DCHECK(IsShadowMem((uptr)shadow_mem)); + DCHECK(IsShadowMem(shadow_mem)); } - if (!IsShadowMem((uptr)(shadow_mem + size * kShadowCnt / 8 - 1))) { + if (!IsShadowMem(shadow_mem + size * kShadowCnt / 8 - 1)) { Printf("Bad shadow addr %p (%zx)\n", shadow_mem + size * kShadowCnt / 8 - 1, addr + size - 1); - DCHECK(IsShadowMem((uptr)(shadow_mem + size * kShadowCnt / 8 - 1))); + DCHECK(IsShadowMem(shadow_mem + size * kShadowCnt / 8 - 1)); } #endif @@ -421,10 +411,10 @@ void FiberSwitchImpl(ThreadState *from, ThreadState *to) { } ThreadState *FiberCreate(ThreadState *thr, uptr pc, unsigned flags) { - void *mem = internal_alloc(MBlockThreadContex, sizeof(ThreadState)); + void *mem = Alloc(sizeof(ThreadState)); ThreadState *fiber = static_cast<ThreadState *>(mem); internal_memset(fiber, 0, sizeof(*fiber)); - int tid = ThreadCreate(thr, pc, 0, true); + Tid tid = ThreadCreate(thr, pc, 0, true); FiberSwitchImpl(thr, fiber); ThreadStart(fiber, tid, 0, ThreadType::Fiber); FiberSwitchImpl(fiber, thr); @@ -435,7 +425,7 @@ void FiberDestroy(ThreadState *thr, uptr pc, ThreadState *fiber) { FiberSwitchImpl(thr, fiber); ThreadFinish(fiber); FiberSwitchImpl(fiber, thr); - internal_free(fiber); + Free(fiber); } void FiberSwitch(ThreadState *thr, uptr pc, diff --git a/libsanitizer/tsan/tsan_shadow.h b/libsanitizer/tsan/tsan_shadow.h new file mode 100644 
index 0000000..8b7bc34 --- /dev/null +++ b/libsanitizer/tsan/tsan_shadow.h @@ -0,0 +1,233 @@ +//===-- tsan_shadow.h -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef TSAN_SHADOW_H +#define TSAN_SHADOW_H + +#include "tsan_defs.h" +#include "tsan_trace.h" + +namespace __tsan { + +// FastState (from most significant bit): +// ignore : 1 +// tid : kTidBits +// unused : - +// history_size : 3 +// epoch : kClkBits +class FastState { + public: + FastState(u64 tid, u64 epoch) { + x_ = tid << kTidShift; + x_ |= epoch; + DCHECK_EQ(tid, this->tid()); + DCHECK_EQ(epoch, this->epoch()); + DCHECK_EQ(GetIgnoreBit(), false); + } + + explicit FastState(u64 x) : x_(x) {} + + u64 raw() const { return x_; } + + u64 tid() const { + u64 res = (x_ & ~kIgnoreBit) >> kTidShift; + return res; + } + + u64 TidWithIgnore() const { + u64 res = x_ >> kTidShift; + return res; + } + + u64 epoch() const { + u64 res = x_ & ((1ull << kClkBits) - 1); + return res; + } + + void IncrementEpoch() { + u64 old_epoch = epoch(); + x_ += 1; + DCHECK_EQ(old_epoch + 1, epoch()); + (void)old_epoch; + } + + void SetIgnoreBit() { x_ |= kIgnoreBit; } + void ClearIgnoreBit() { x_ &= ~kIgnoreBit; } + bool GetIgnoreBit() const { return (s64)x_ < 0; } + + void SetHistorySize(int hs) { + CHECK_GE(hs, 0); + CHECK_LE(hs, 7); + x_ = (x_ & ~(kHistoryMask << kHistoryShift)) | (u64(hs) << kHistoryShift); + } + + ALWAYS_INLINE + int GetHistorySize() const { + return (int)((x_ >> kHistoryShift) & kHistoryMask); + } + + void ClearHistorySize() { SetHistorySize(0); } + + ALWAYS_INLINE + u64 GetTracePos() const { + const int hs = GetHistorySize(); + // When hs == 0, the trace consists of 2 parts. 
+ const u64 mask = (1ull << (kTracePartSizeBits + hs + 1)) - 1; + return epoch() & mask; + } + + private: + friend class Shadow; + static const int kTidShift = 64 - kTidBits - 1; + static const u64 kIgnoreBit = 1ull << 63; + static const u64 kFreedBit = 1ull << 63; + static const u64 kHistoryShift = kClkBits; + static const u64 kHistoryMask = 7; + u64 x_; +}; + +// Shadow (from most significant bit): +// freed : 1 +// tid : kTidBits +// is_atomic : 1 +// is_read : 1 +// size_log : 2 +// addr0 : 3 +// epoch : kClkBits +class Shadow : public FastState { + public: + explicit Shadow(u64 x) : FastState(x) {} + + explicit Shadow(const FastState &s) : FastState(s.x_) { ClearHistorySize(); } + + void SetAddr0AndSizeLog(u64 addr0, unsigned kAccessSizeLog) { + DCHECK_EQ((x_ >> kClkBits) & 31, 0); + DCHECK_LE(addr0, 7); + DCHECK_LE(kAccessSizeLog, 3); + x_ |= ((kAccessSizeLog << 3) | addr0) << kClkBits; + DCHECK_EQ(kAccessSizeLog, size_log()); + DCHECK_EQ(addr0, this->addr0()); + } + + void SetWrite(unsigned kAccessIsWrite) { + DCHECK_EQ(x_ & kReadBit, 0); + if (!kAccessIsWrite) + x_ |= kReadBit; + DCHECK_EQ(kAccessIsWrite, IsWrite()); + } + + void SetAtomic(bool kIsAtomic) { + DCHECK(!IsAtomic()); + if (kIsAtomic) + x_ |= kAtomicBit; + DCHECK_EQ(IsAtomic(), kIsAtomic); + } + + bool IsAtomic() const { return x_ & kAtomicBit; } + + bool IsZero() const { return x_ == 0; } + + static inline bool TidsAreEqual(const Shadow s1, const Shadow s2) { + u64 shifted_xor = (s1.x_ ^ s2.x_) >> kTidShift; + DCHECK_EQ(shifted_xor == 0, s1.TidWithIgnore() == s2.TidWithIgnore()); + return shifted_xor == 0; + } + + static ALWAYS_INLINE bool Addr0AndSizeAreEqual(const Shadow s1, + const Shadow s2) { + u64 masked_xor = ((s1.x_ ^ s2.x_) >> kClkBits) & 31; + return masked_xor == 0; + } + + static ALWAYS_INLINE bool TwoRangesIntersect(Shadow s1, Shadow s2, + unsigned kS2AccessSize) { + bool res = false; + u64 diff = s1.addr0() - s2.addr0(); + if ((s64)diff < 0) { // s1.addr0 < s2.addr0 + // if (s1.addr0() + size1) > s2.addr0()) return true; + if (s1.size() > -diff) + res = true; + } else { + // if (s2.addr0() + kS2AccessSize > s1.addr0()) return true; + if (kS2AccessSize > diff) + res = true; + } + DCHECK_EQ(res, TwoRangesIntersectSlow(s1, s2)); + DCHECK_EQ(res, TwoRangesIntersectSlow(s2, s1)); + return res; + } + + u64 ALWAYS_INLINE addr0() const { return (x_ >> kClkBits) & 7; } + u64 ALWAYS_INLINE size() const { return 1ull << size_log(); } + bool ALWAYS_INLINE IsWrite() const { return !IsRead(); } + bool ALWAYS_INLINE IsRead() const { return x_ & kReadBit; } + + // The idea behind the freed bit is as follows. + // When the memory is freed (or otherwise unaccessible) we write to the shadow + // values with tid/epoch related to the free and the freed bit set. + // During memory accesses processing the freed bit is considered + // as msb of tid. So any access races with shadow with freed bit set + // (it is as if write from a thread with which we never synchronized before). + // This allows us to detect accesses to freed memory w/o additional + // overheads in memory access processing and at the same time restore + // tid/epoch of free. 
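To make the freed-bit comment above concrete, here is a self-contained model (not part of tsan_shadow.h) of why a freed-marked shadow word races with any later access: the freed bit sits just above the tid field, so the tid comparison that TidsAreEqual() performs no longer matches and the access looks like it came from a thread we never synchronized with. kTidBits = 13 is only an assumed illustrative value; the real constant lives in tsan_defs.h.

#include <cassert>
#include <cstdint>

constexpr unsigned kTidBits = 13;                  // assumed for this demo only
constexpr unsigned kTidShift = 64 - kTidBits - 1;  // as in FastState
constexpr uint64_t kFreedBit = 1ull << 63;         // as in FastState

// Same comparison TidsAreEqual() performs: xor the shadow words and look only
// at the bits at and above kTidShift (tid plus the freed/ignore msb).
bool SameThread(uint64_t s1, uint64_t s2) {
  return ((s1 ^ s2) >> kTidShift) == 0;
}

int main() {
  uint64_t tid = 42;
  uint64_t live = tid << kTidShift;   // shadow written by a normal access
  uint64_t freed = live | kFreedBit;  // shadow written when the memory is freed
  // A later access by the same thread matches the live shadow, but the freed
  // shadow appears to belong to a different thread, so the regular race
  // machinery reports it with no extra checks in the hot path.
  assert(SameThread(live, tid << kTidShift));
  assert(!SameThread(freed, tid << kTidShift));
}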
+ void MarkAsFreed() { x_ |= kFreedBit; } + + bool IsFreed() const { return x_ & kFreedBit; } + + bool GetFreedAndReset() { + bool res = x_ & kFreedBit; + x_ &= ~kFreedBit; + return res; + } + + bool ALWAYS_INLINE IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const { + bool v = x_ & ((u64(kIsWrite ^ 1) << kReadShift) | + (u64(kIsAtomic) << kAtomicShift)); + DCHECK_EQ(v, (!IsWrite() && !kIsWrite) || (IsAtomic() && kIsAtomic)); + return v; + } + + bool ALWAYS_INLINE IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const { + bool v = ((x_ >> kReadShift) & 3) <= u64((kIsWrite ^ 1) | (kIsAtomic << 1)); + DCHECK_EQ(v, (IsAtomic() < kIsAtomic) || + (IsAtomic() == kIsAtomic && !IsWrite() <= !kIsWrite)); + return v; + } + + bool ALWAYS_INLINE IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const { + bool v = ((x_ >> kReadShift) & 3) >= u64((kIsWrite ^ 1) | (kIsAtomic << 1)); + DCHECK_EQ(v, (IsAtomic() > kIsAtomic) || + (IsAtomic() == kIsAtomic && !IsWrite() >= !kIsWrite)); + return v; + } + + private: + static const u64 kReadShift = 5 + kClkBits; + static const u64 kReadBit = 1ull << kReadShift; + static const u64 kAtomicShift = 6 + kClkBits; + static const u64 kAtomicBit = 1ull << kAtomicShift; + + u64 size_log() const { return (x_ >> (3 + kClkBits)) & 3; } + + static bool TwoRangesIntersectSlow(const Shadow s1, const Shadow s2) { + if (s1.addr0() == s2.addr0()) + return true; + if (s1.addr0() < s2.addr0() && s1.addr0() + s1.size() > s2.addr0()) + return true; + if (s2.addr0() < s1.addr0() && s2.addr0() + s2.size() > s1.addr0()) + return true; + return false; + } +}; + +const RawShadow kShadowRodata = (RawShadow)-1; // .rodata shadow marker + +} // namespace __tsan + +#endif diff --git a/libsanitizer/tsan/tsan_stack_trace.cpp b/libsanitizer/tsan/tsan_stack_trace.cpp index 6c703d7..9bbaafb 100644 --- a/libsanitizer/tsan/tsan_stack_trace.cpp +++ b/libsanitizer/tsan/tsan_stack_trace.cpp @@ -23,14 +23,10 @@ VarSizeStackTrace::~VarSizeStackTrace() { } void VarSizeStackTrace::ResizeBuffer(uptr new_size) { - if (trace_buffer) { - internal_free(trace_buffer); - } - trace_buffer = - (new_size > 0) - ? (uptr *)internal_alloc(MBlockStackTrace, - new_size * sizeof(trace_buffer[0])) - : nullptr; + Free(trace_buffer); + trace_buffer = (new_size > 0) + ? 
(uptr *)Alloc(new_size * sizeof(trace_buffer[0])) + : nullptr; trace = trace_buffer; size = new_size; } diff --git a/libsanitizer/tsan/tsan_symbolize.cpp b/libsanitizer/tsan/tsan_symbolize.cpp index 6478f3a..2e2744d 100644 --- a/libsanitizer/tsan/tsan_symbolize.cpp +++ b/libsanitizer/tsan/tsan_symbolize.cpp @@ -110,7 +110,8 @@ ReportLocation *SymbolizeData(uptr addr) { DataInfo info; if (!Symbolizer::GetOrInit()->SymbolizeData(addr, &info)) return 0; - ReportLocation *ent = ReportLocation::New(ReportLocationGlobal); + auto *ent = New<ReportLocation>(); + ent->type = ReportLocationGlobal; internal_memcpy(&ent->global, &info, sizeof(info)); return ent; } diff --git a/libsanitizer/tsan/tsan_sync.cpp b/libsanitizer/tsan/tsan_sync.cpp index d25434a..f042aba 100644 --- a/libsanitizer/tsan/tsan_sync.cpp +++ b/libsanitizer/tsan/tsan_sync.cpp @@ -20,13 +20,14 @@ void DDMutexInit(ThreadState *thr, uptr pc, SyncVar *s); SyncVar::SyncVar() : mtx(MutexTypeSyncVar) { Reset(0); } -void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, u64 uid) { +void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, u64 uid, + bool save_stack) { this->addr = addr; this->uid = uid; this->next = 0; - creation_stack_id = 0; - if (!SANITIZER_GO) // Go does not use them + creation_stack_id = kInvalidStackID; + if (save_stack && !SANITIZER_GO) // Go does not use them creation_stack_id = CurrentStackId(thr, pc); if (common_flags()->detect_deadlocks) DDMutexInit(thr, pc, this); @@ -34,7 +35,7 @@ void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, u64 uid) { void SyncVar::Reset(Processor *proc) { uid = 0; - creation_stack_id = 0; + creation_stack_id = kInvalidStackID; owner_tid = kInvalidTid; last_lock = 0; recursion = 0; @@ -190,63 +191,41 @@ MBlock* MetaMap::GetBlock(uptr p) { } } -SyncVar* MetaMap::GetOrCreateAndLock(ThreadState *thr, uptr pc, - uptr addr, bool write_lock) { - return GetAndLock(thr, pc, addr, write_lock, true); -} - -SyncVar* MetaMap::GetIfExistsAndLock(uptr addr, bool write_lock) { - return GetAndLock(0, 0, addr, write_lock, false); -} - -SyncVar* MetaMap::GetAndLock(ThreadState *thr, uptr pc, - uptr addr, bool write_lock, bool create) { +SyncVar *MetaMap::GetSync(ThreadState *thr, uptr pc, uptr addr, bool create, + bool save_stack) { u32 *meta = MemToMeta(addr); u32 idx0 = *meta; u32 myidx = 0; - SyncVar *mys = 0; + SyncVar *mys = nullptr; for (;;) { - u32 idx = idx0; - for (;;) { - if (idx == 0) - break; - if (idx & kFlagBlock) - break; + for (u32 idx = idx0; idx && !(idx & kFlagBlock);) { DCHECK(idx & kFlagSync); SyncVar * s = sync_alloc_.Map(idx & ~kFlagMask); - if (s->addr == addr) { - if (myidx != 0) { + if (LIKELY(s->addr == addr)) { + if (UNLIKELY(myidx != 0)) { mys->Reset(thr->proc()); sync_alloc_.Free(&thr->proc()->sync_cache, myidx); } - if (write_lock) - s->mtx.Lock(); - else - s->mtx.ReadLock(); return s; } idx = s->next; } if (!create) - return 0; - if (*meta != idx0) { + return nullptr; + if (UNLIKELY(*meta != idx0)) { idx0 = *meta; continue; } - if (myidx == 0) { + if (LIKELY(myidx == 0)) { const u64 uid = atomic_fetch_add(&uid_gen_, 1, memory_order_relaxed); myidx = sync_alloc_.Alloc(&thr->proc()->sync_cache); mys = sync_alloc_.Map(myidx); - mys->Init(thr, pc, addr, uid); + mys->Init(thr, pc, addr, uid, save_stack); } mys->next = idx0; if (atomic_compare_exchange_strong((atomic_uint32_t*)meta, &idx0, myidx | kFlagSync, memory_order_release)) { - if (write_lock) - mys->mtx.Lock(); - else - mys->mtx.ReadLock(); return mys; } } @@ -290,4 +269,11 @@ void MetaMap::OnProcIdle(Processor 
*proc) { sync_alloc_.FlushCache(&proc->sync_cache); } +MetaMap::MemoryStats MetaMap::GetMemoryStats() const { + MemoryStats stats; + stats.mem_block = block_alloc_.AllocatedMemory(); + stats.sync_obj = sync_alloc_.AllocatedMemory(); + return stats; +} + } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_sync.h b/libsanitizer/tsan/tsan_sync.h index c4056f6..fc8fa28 100644 --- a/libsanitizer/tsan/tsan_sync.h +++ b/libsanitizer/tsan/tsan_sync.h @@ -17,7 +17,6 @@ #include "sanitizer_common/sanitizer_deadlock_detector_interface.h" #include "tsan_defs.h" #include "tsan_clock.h" -#include "tsan_mutex.h" #include "tsan_dense_alloc.h" namespace __tsan { @@ -47,14 +46,16 @@ enum MutexFlags { MutexFlagNotStatic, }; +// SyncVar is a descriptor of a user synchronization object +// (mutex or an atomic variable). struct SyncVar { SyncVar(); uptr addr; // overwritten by DenseSlabAlloc freelist Mutex mtx; u64 uid; // Globally unique id. - u32 creation_stack_id; - u32 owner_tid; // Set only by exclusive owners. + StackID creation_stack_id; + Tid owner_tid; // Set only by exclusive owners. u64 last_lock; int recursion; atomic_uint32_t flags; @@ -65,7 +66,7 @@ struct SyncVar { // with the mtx. This reduces contention for hot sync objects. SyncClock clock; - void Init(ThreadState *thr, uptr pc, uptr addr, u64 uid); + void Init(ThreadState *thr, uptr pc, uptr addr, u64 uid, bool save_stack); void Reset(Processor *proc); u64 GetId() const { @@ -102,10 +103,8 @@ struct SyncVar { } }; -/* MetaMap allows to map arbitrary user pointers onto various descriptors. - Currently it maps pointers to heap block descriptors and sync var descs. - It uses 1/2 direct shadow, see tsan_platform.h. -*/ +// MetaMap maps app addresses to heap block (MBlock) and sync var (SyncVar) +// descriptors. It uses 1/2 direct shadow, see tsan_platform.h for the mapping. 
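The GetSync() loop above publishes a newly allocated SyncVar by linking it to the current bucket head and installing it with a compare-exchange, retrying (and reusing the already allocated node) if another thread raced ahead. A self-contained sketch of that insertion pattern, with illustrative names and std::atomic standing in for the sanitizer atomics:

// Simplified model of the lock-free get-or-create walk in MetaMap::GetSync.
#include <atomic>
#include <cassert>
#include <cstdint>

struct Node {
  uintptr_t addr;
  Node* next;
};

Node* GetOrCreate(std::atomic<Node*>* head, uintptr_t addr) {
  Node* my = nullptr;  // node speculatively allocated by this thread, if any
  Node* h = head->load(std::memory_order_acquire);
  for (;;) {
    for (Node* n = h; n; n = n->next) {
      if (n->addr == addr) {
        delete my;  // someone else already published it: drop our spare node
        return n;
      }
    }
    if (!my)
      my = new Node{addr, nullptr};
    my->next = h;
    // Release so readers walking the list see a fully initialized node.
    if (head->compare_exchange_strong(h, my, std::memory_order_release,
                                      std::memory_order_acquire))
      return my;
    // CAS failed: h now holds the new head; rescan in case addr was inserted.
  }
}

int main() {
  std::atomic<Node*> head{nullptr};
  Node* a = GetOrCreate(&head, 0x1000);
  assert(GetOrCreate(&head, 0x1000) == a);  // second lookup finds the same node
  assert(GetOrCreate(&head, 0x2000) != a);  // different address, new node
}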
class MetaMap { public: MetaMap(); @@ -116,14 +115,25 @@ class MetaMap { void ResetRange(Processor *proc, uptr p, uptr sz); MBlock* GetBlock(uptr p); - SyncVar* GetOrCreateAndLock(ThreadState *thr, uptr pc, - uptr addr, bool write_lock); - SyncVar* GetIfExistsAndLock(uptr addr, bool write_lock); + SyncVar *GetSyncOrCreate(ThreadState *thr, uptr pc, uptr addr, + bool save_stack) { + return GetSync(thr, pc, addr, true, save_stack); + } + SyncVar *GetSyncIfExists(uptr addr) { + return GetSync(nullptr, 0, addr, false, false); + } void MoveMemory(uptr src, uptr dst, uptr sz); void OnProcIdle(Processor *proc); + struct MemoryStats { + uptr mem_block; + uptr sync_obj; + }; + + MemoryStats GetMemoryStats() const; + private: static const u32 kFlagMask = 3u << 30; static const u32 kFlagBlock = 1u << 30; @@ -134,8 +144,8 @@ class MetaMap { SyncAlloc sync_alloc_; atomic_uint64_t uid_gen_; - SyncVar* GetAndLock(ThreadState *thr, uptr pc, uptr addr, bool write_lock, - bool create); + SyncVar *GetSync(ThreadState *thr, uptr pc, uptr addr, bool create, + bool save_stack); }; } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_trace.h b/libsanitizer/tsan/tsan_trace.h index 9f2677b..a771ad9 100644 --- a/libsanitizer/tsan/tsan_trace.h +++ b/libsanitizer/tsan/tsan_trace.h @@ -13,9 +13,9 @@ #define TSAN_TRACE_H #include "tsan_defs.h" -#include "tsan_mutex.h" -#include "tsan_stack_trace.h" +#include "tsan_ilist.h" #include "tsan_mutexset.h" +#include "tsan_stack_trace.h" namespace __tsan { @@ -68,6 +68,155 @@ struct Trace { Trace() : mtx(MutexTypeTrace) {} }; +namespace v3 { + +enum class EventType : u64 { + kAccessExt, + kAccessRange, + kLock, + kRLock, + kUnlock, + kTime, +}; + +// "Base" type for all events for type dispatch. +struct Event { + // We use variable-length type encoding to give more bits to some event + // types that need them. If is_access is set, this is EventAccess. + // Otherwise, if is_func is set, this is EventFunc. + // Otherwise type denotes the type. + u64 is_access : 1; + u64 is_func : 1; + EventType type : 3; + u64 _ : 59; +}; +static_assert(sizeof(Event) == 8, "bad Event size"); + +// Nop event used as padding and does not affect state during replay. +static constexpr Event NopEvent = {1, 0, EventType::kAccessExt, 0}; + +// Compressed memory access can represent only some events with PCs +// close enough to each other. Otherwise we fall back to EventAccessExt. +struct EventAccess { + static constexpr uptr kPCBits = 15; + + u64 is_access : 1; // = 1 + u64 is_read : 1; + u64 is_atomic : 1; + u64 size_log : 2; + u64 pc_delta : kPCBits; // signed delta from the previous memory access PC + u64 addr : kCompressedAddrBits; +}; +static_assert(sizeof(EventAccess) == 8, "bad EventAccess size"); + +// Function entry (pc != 0) or exit (pc == 0). +struct EventFunc { + u64 is_access : 1; // = 0 + u64 is_func : 1; // = 1 + u64 pc : 62; +}; +static_assert(sizeof(EventFunc) == 8, "bad EventFunc size"); + +// Extended memory access with full PC. +struct EventAccessExt { + u64 is_access : 1; // = 0 + u64 is_func : 1; // = 0 + EventType type : 3; // = EventType::kAccessExt + u64 is_read : 1; + u64 is_atomic : 1; + u64 size_log : 2; + u64 _ : 11; + u64 addr : kCompressedAddrBits; + u64 pc; +}; +static_assert(sizeof(EventAccessExt) == 16, "bad EventAccessExt size"); + +// Access to a memory range. 
+struct EventAccessRange { + static constexpr uptr kSizeLoBits = 13; + + u64 is_access : 1; // = 0 + u64 is_func : 1; // = 0 + EventType type : 3; // = EventType::kAccessRange + u64 is_read : 1; + u64 is_free : 1; + u64 size_lo : kSizeLoBits; + u64 pc : kCompressedAddrBits; + u64 addr : kCompressedAddrBits; + u64 size_hi : 64 - kCompressedAddrBits; +}; +static_assert(sizeof(EventAccessRange) == 16, "bad EventAccessRange size"); + +// Mutex lock. +struct EventLock { + static constexpr uptr kStackIDLoBits = 15; + + u64 is_access : 1; // = 0 + u64 is_func : 1; // = 0 + EventType type : 3; // = EventType::kLock or EventType::kRLock + u64 pc : kCompressedAddrBits; + u64 stack_lo : kStackIDLoBits; + u64 stack_hi : sizeof(StackID) * kByteBits - kStackIDLoBits; + u64 _ : 3; + u64 addr : kCompressedAddrBits; +}; +static_assert(sizeof(EventLock) == 16, "bad EventLock size"); + +// Mutex unlock. +struct EventUnlock { + u64 is_access : 1; // = 0 + u64 is_func : 1; // = 0 + EventType type : 3; // = EventType::kUnlock + u64 _ : 15; + u64 addr : kCompressedAddrBits; +}; +static_assert(sizeof(EventUnlock) == 8, "bad EventUnlock size"); + +// Time change event. +struct EventTime { + u64 is_access : 1; // = 0 + u64 is_func : 1; // = 0 + EventType type : 3; // = EventType::kTime + u64 sid : sizeof(Sid) * kByteBits; + u64 epoch : kEpochBits; + u64 _ : 64 - 5 - sizeof(Sid) * kByteBits - kEpochBits; +}; +static_assert(sizeof(EventTime) == 8, "bad EventTime size"); + +struct Trace; + +struct TraceHeader { + Trace* trace = nullptr; // back-pointer to Trace containing this part + INode trace_parts; // in Trace::parts +}; + +struct TracePart : TraceHeader { + static constexpr uptr kByteSize = 256 << 10; + static constexpr uptr kSize = + (kByteSize - sizeof(TraceHeader)) / sizeof(Event); + // TraceAcquire does a fast event pointer overflow check by comparing + // pointer into TracePart::events with kAlignment mask. Since TracePart's + // are allocated page-aligned, this check detects end of the array + // (it also have false positives in the middle that are filtered separately). + // This also requires events to be the last field. + static constexpr uptr kAlignment = 0xff0; + Event events[kSize]; + + TracePart() {} +}; +static_assert(sizeof(TracePart) == TracePart::kByteSize, "bad TracePart size"); + +struct Trace { + Mutex mtx; + IList<TraceHeader, &TraceHeader::trace_parts, TracePart> parts; + Event* final_pos = + nullptr; // final position in the last part for finished threads + + Trace() : mtx(MutexTypeTrace) {} +}; + +} // namespace v3 + } // namespace __tsan #endif // TSAN_TRACE_H diff --git a/libsanitizer/tsan/tsan_update_shadow_word_inl.h b/libsanitizer/tsan/tsan_update_shadow_word.inc index d23dfb0..a58ef0f 100644 --- a/libsanitizer/tsan/tsan_update_shadow_word_inl.h +++ b/libsanitizer/tsan/tsan_update_shadow_word.inc @@ -1,4 +1,4 @@ -//===-- tsan_update_shadow_word_inl.h ---------------------------*- C++ -*-===// +//===-- tsan_update_shadow_word.inc -----------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/libsanitizer/tsan/tsan_vector_clock.cpp b/libsanitizer/tsan/tsan_vector_clock.cpp new file mode 100644 index 0000000..2782985 --- /dev/null +++ b/libsanitizer/tsan/tsan_vector_clock.cpp @@ -0,0 +1,126 @@ +//===-- tsan_vector_clock.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of ThreadSanitizer (TSan), a race detector. +// +//===----------------------------------------------------------------------===// +#include "tsan_vector_clock.h" + +#include "sanitizer_common/sanitizer_placement_new.h" +#include "tsan_mman.h" + +namespace __tsan { + +#if TSAN_VECTORIZE +const uptr kVectorClockSize = kThreadSlotCount * sizeof(Epoch) / sizeof(m128); +#endif + +VectorClock::VectorClock() { Reset(); } + +void VectorClock::Reset() { +#if !TSAN_VECTORIZE + for (uptr i = 0; i < kThreadSlotCount; i++) + clk_[i] = kEpochZero; +#else + m128 z = _mm_setzero_si128(); + m128* vclk = reinterpret_cast<m128*>(clk_); + for (uptr i = 0; i < kVectorClockSize; i++) _mm_store_si128(&vclk[i], z); +#endif +} + +void VectorClock::Acquire(const VectorClock* src) { + if (!src) + return; +#if !TSAN_VECTORIZE + for (uptr i = 0; i < kThreadSlotCount; i++) + clk_[i] = max(clk_[i], src->clk_[i]); +#else + m128* __restrict vdst = reinterpret_cast<m128*>(clk_); + m128 const* __restrict vsrc = reinterpret_cast<m128 const*>(src->clk_); + for (uptr i = 0; i < kVectorClockSize; i++) { + m128 s = _mm_load_si128(&vsrc[i]); + m128 d = _mm_load_si128(&vdst[i]); + m128 m = _mm_max_epu16(s, d); + _mm_store_si128(&vdst[i], m); + } +#endif +} + +static VectorClock* AllocClock(VectorClock** dstp) { + if (UNLIKELY(!*dstp)) + *dstp = New<VectorClock>(); + return *dstp; +} + +void VectorClock::Release(VectorClock** dstp) const { + VectorClock* dst = AllocClock(dstp); + dst->Acquire(this); +} + +void VectorClock::ReleaseStore(VectorClock** dstp) const { + VectorClock* dst = AllocClock(dstp); + *dst = *this; +} + +VectorClock& VectorClock::operator=(const VectorClock& other) { +#if !TSAN_VECTORIZE + for (uptr i = 0; i < kThreadSlotCount; i++) + clk_[i] = other.clk_[i]; +#else + m128* __restrict vdst = reinterpret_cast<m128*>(clk_); + m128 const* __restrict vsrc = reinterpret_cast<m128 const*>(other.clk_); + for (uptr i = 0; i < kVectorClockSize; i++) { + m128 s = _mm_load_si128(&vsrc[i]); + _mm_store_si128(&vdst[i], s); + } +#endif + return *this; +} + +void VectorClock::ReleaseStoreAcquire(VectorClock** dstp) { + VectorClock* dst = AllocClock(dstp); +#if !TSAN_VECTORIZE + for (uptr i = 0; i < kThreadSlotCount; i++) { + Epoch tmp = dst->clk_[i]; + dst->clk_[i] = clk_[i]; + clk_[i] = max(clk_[i], tmp); + } +#else + m128* __restrict vdst = reinterpret_cast<m128*>(dst->clk_); + m128* __restrict vclk = reinterpret_cast<m128*>(clk_); + for (uptr i = 0; i < kVectorClockSize; i++) { + m128 t = _mm_load_si128(&vdst[i]); + m128 c = _mm_load_si128(&vclk[i]); + m128 m = _mm_max_epu16(c, t); + _mm_store_si128(&vdst[i], c); + _mm_store_si128(&vclk[i], m); + } +#endif +} + +void VectorClock::ReleaseAcquire(VectorClock** dstp) { + VectorClock* dst = AllocClock(dstp); +#if !TSAN_VECTORIZE + for (uptr i = 0; i < kThreadSlotCount; i++) { + dst->clk_[i] = max(dst->clk_[i], clk_[i]); + clk_[i] = dst->clk_[i]; + } +#else + m128* 
__restrict vdst = reinterpret_cast<m128*>(dst->clk_); + m128* __restrict vclk = reinterpret_cast<m128*>(clk_); + for (uptr i = 0; i < kVectorClockSize; i++) { + m128 c = _mm_load_si128(&vclk[i]); + m128 d = _mm_load_si128(&vdst[i]); + m128 m = _mm_max_epu16(c, d); + _mm_store_si128(&vdst[i], m); + _mm_store_si128(&vclk[i], m); + } +#endif +} + +} // namespace __tsan diff --git a/libsanitizer/tsan/tsan_vector_clock.h b/libsanitizer/tsan/tsan_vector_clock.h new file mode 100644 index 0000000..63b2063 --- /dev/null +++ b/libsanitizer/tsan/tsan_vector_clock.h @@ -0,0 +1,51 @@ +//===-- tsan_vector_clock.h -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of ThreadSanitizer (TSan), a race detector. +// +//===----------------------------------------------------------------------===// +#ifndef TSAN_VECTOR_CLOCK_H +#define TSAN_VECTOR_CLOCK_H + +#include "tsan_defs.h" + +namespace __tsan { + +// Fixed-size vector clock, used both for threads and sync objects. +class VectorClock { + public: + VectorClock(); + + Epoch Get(Sid sid) const; + void Set(Sid sid, Epoch v); + + void Reset(); + void Acquire(const VectorClock* src); + void Release(VectorClock** dstp) const; + void ReleaseStore(VectorClock** dstp) const; + void ReleaseStoreAcquire(VectorClock** dstp); + void ReleaseAcquire(VectorClock** dstp); + + VectorClock& operator=(const VectorClock& other); + + private: + Epoch clk_[kThreadSlotCount] VECTOR_ALIGNED; +}; + +ALWAYS_INLINE Epoch VectorClock::Get(Sid sid) const { + return clk_[static_cast<u8>(sid)]; +} + +ALWAYS_INLINE void VectorClock::Set(Sid sid, Epoch v) { + DCHECK_GE(v, clk_[static_cast<u8>(sid)]); + clk_[static_cast<u8>(sid)] = v; +} + +} // namespace __tsan + +#endif // TSAN_VECTOR_CLOCK_H |
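The new VectorClock above keeps a fixed-size array of epochs indexed by slot (Sid) and implements synchronization as elementwise operations: Acquire is an elementwise max, ReleaseStore is a copy, and ReleaseAcquire is a max into the destination followed by reading the combined clock back. A scalar reference model of those semantics (not the TSan implementation; kSlots and uint16_t stand in for kThreadSlotCount and Epoch):

#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>

constexpr int kSlots = 4;  // stand-in for kThreadSlotCount
using Clock = std::array<uint16_t, kSlots>;

// Acquire: incorporate everything the source clock has seen (elementwise max),
// as in VectorClock::Acquire above.
void Acquire(Clock& dst, const Clock& src) {
  for (int i = 0; i < kSlots; i++) dst[i] = std::max(dst[i], src[i]);
}

// ReleaseStore: overwrite the destination with our clock
// (VectorClock::ReleaseStore).
void ReleaseStore(Clock& dst, const Clock& src) { dst = src; }

// ReleaseAcquire: max our clock into the destination, then read the combined
// clock back, mirroring VectorClock::ReleaseAcquire above.
void ReleaseAcquire(Clock& dst, Clock& thr) {
  for (int i = 0; i < kSlots; i++) {
    dst[i] = std::max(dst[i], thr[i]);
    thr[i] = dst[i];
  }
}

int main() {
  Clock thr = {3, 0, 7, 1};
  Clock sync = {1, 5, 2, 1};
  Acquire(thr, sync);         // thr := max(thr, sync)
  assert((thr == Clock{3, 5, 7, 1}));
  ReleaseAcquire(sync, thr);  // sync := max(sync, thr); thr := sync
  assert(sync == thr);
  ReleaseStore(sync, thr);    // no-op here: sync already equals thr
  assert(sync == thr);
  return 0;
}

The SSE path in tsan_vector_clock.cpp computes the same result eight 16-bit slots at a time with _mm_max_epu16.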