From 5ac29058f01502c407a7b77ec57a08c96941f796 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Thu, 9 Nov 2017 21:56:59 +0000 Subject: sync/atomic, runtime/internal/atomic: don't assume reads from 0 fail For a misaligned address force a panic rather than assuming that reading from the address 0 will cause one. Reviewed-on: https://go-review.googlesource.com/69850 From-SVN: r254610 --- libgo/go/runtime/internal/atomic/atomic.c | 14 +++++++------- libgo/go/runtime/panic.go | 1 + 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/internal/atomic/atomic.c b/libgo/go/runtime/internal/atomic/atomic.c index b584656..24820f2 100644 --- a/libgo/go/runtime/internal/atomic/atomic.c +++ b/libgo/go/runtime/internal/atomic/atomic.c @@ -34,7 +34,7 @@ uint64_t Load64 (uint64_t *ptr) { if (((uintptr_t) ptr & 7) != 0) - ptr = NULL; + panicmem (); return __atomic_load_n (ptr, __ATOMIC_ACQUIRE); } @@ -66,7 +66,7 @@ int64_t Loadint64 (int64_t *ptr) { if (((uintptr_t) ptr & 7) != 0) - ptr = NULL; + panicmem (); return __atomic_load_n (ptr, __ATOMIC_ACQUIRE); } @@ -88,7 +88,7 @@ uint64_t Xadd64 (uint64_t *ptr, int64_t delta) { if (((uintptr_t) ptr & 7) != 0) - ptr = NULL; + panicmem (); return __atomic_add_fetch (ptr, (uint64_t) delta, __ATOMIC_SEQ_CST); } @@ -110,7 +110,7 @@ int64_t Xaddint64 (int64_t *ptr, int64_t delta) { if (((uintptr_t) ptr & 7) != 0) - ptr = NULL; + panicmem (); return __atomic_add_fetch (ptr, delta, __ATOMIC_SEQ_CST); } @@ -132,7 +132,7 @@ uint64_t Xchg64 (uint64_t *ptr, uint64_t new) { if (((uintptr_t) ptr & 7) != 0) - ptr = NULL; + panicmem (); return __atomic_exchange_n (ptr, new, __ATOMIC_SEQ_CST); } @@ -184,7 +184,7 @@ _Bool Cas64 (uint64_t *ptr, uint64_t old, uint64_t new) { if (((uintptr_t) ptr & 7) != 0) - ptr = NULL; + panicmem (); return __atomic_compare_exchange_n (ptr, &old, new, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); } @@ -226,7 +226,7 @@ void Store64 (uint64_t *ptr, uint64_t val) { if (((uintptr_t) ptr & 7) != 0) - ptr = NULL; + panicmem (); __atomic_store_n (ptr, val, __ATOMIC_SEQ_CST); } diff --git a/libgo/go/runtime/panic.go b/libgo/go/runtime/panic.go index 2f65603..c39a58d 100644 --- a/libgo/go/runtime/panic.go +++ b/libgo/go/runtime/panic.go @@ -22,6 +22,7 @@ import ( //go:linkname makefuncreturning runtime.makefuncreturning //go:linkname gorecover runtime.gorecover //go:linkname deferredrecover runtime.deferredrecover +//go:linkname panicmem runtime.panicmem // Temporary for C code to call: //go:linkname throw runtime.throw -- cgit v1.1 From 79c9f76563e0c0943c9dc44bd1b892175c3239b9 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Sat, 2 Dec 2017 00:46:00 +0000 Subject: runtime: export cgoCheck functions The functions cgoCheckPointer and cgoCheckResult are called by code generated by cgo. That means that we need to export them using go:linkname, as otherwise they are local symbols. The cgo code currently uses weak references to only call the symbols if they are defined, which is why it has been working--the cgo code has not been doing any checks. Reviewed-on: https://go-review.googlesource.com/80295 From-SVN: r255347 --- libgo/go/runtime/cgocall.go | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/cgocall.go b/libgo/go/runtime/cgocall.go index 2e0e591..4a416fb 100644 --- a/libgo/go/runtime/cgocall.go +++ b/libgo/go/runtime/cgocall.go @@ -11,6 +11,10 @@ import ( "unsafe" ) +// Functions called by cgo-generated code. 
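+// Without go:linkname they would be local symbols, and the weak references in the
+// cgo-generated code would resolve to nil, so the checks would never actually run.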
+//go:linkname cgoCheckPointer runtime.cgoCheckPointer +//go:linkname cgoCheckResult runtime.cgoCheckResult + // Pointer checking for cgo code. // We want to detect all cases where a program that does not use -- cgit v1.1 From 1a2f01efa63036a5104f203a4789e682c0e0915d Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Tue, 9 Jan 2018 01:23:08 +0000 Subject: libgo: update to Go1.10beta1 Update the Go library to the 1.10beta1 release. Requires a few changes to the compiler for modifications to the map runtime code, and to handle some nowritebarrier cases in the runtime. Reviewed-on: https://go-review.googlesource.com/86455 gotools/: * Makefile.am (go_cmd_vet_files): New variable. (go_cmd_buildid_files, go_cmd_test2json_files): New variables. (s-zdefaultcc): Change from constants to functions. (noinst_PROGRAMS): Add vet, buildid, and test2json. (cgo$(EXEEXT)): Link against $(LIBGOTOOL). (vet$(EXEEXT)): New target. (buildid$(EXEEXT)): New target. (test2json$(EXEEXT)): New target. (install-exec-local): Install all $(noinst_PROGRAMS). (uninstall-local): Uninstasll all $(noinst_PROGRAMS). (check-go-tool): Depend on $(noinst_PROGRAMS). Copy down objabi.go. (check-runtime): Depend on $(noinst_PROGRAMS). (check-cgo-test, check-carchive-test): Likewise. (check-vet): New target. (check): Depend on check-vet. Look at cmd_vet-testlog. (.PHONY): Add check-vet. * Makefile.in: Rebuild. From-SVN: r256365 --- libgo/go/runtime/alg.go | 9 +- libgo/go/runtime/append_test.go | 78 +- libgo/go/runtime/cgo_gccgo.go | 16 +- libgo/go/runtime/cgocall.go | 6 +- libgo/go/runtime/cgocheck.go | 4 + libgo/go/runtime/chan.go | 74 +- libgo/go/runtime/chan_test.go | 110 +++ libgo/go/runtime/cpuprof.go | 1 + libgo/go/runtime/cputicks.go | 2 +- libgo/go/runtime/crash_cgo_test.go | 92 +- libgo/go/runtime/crash_test.go | 47 +- libgo/go/runtime/crash_unix_test.go | 11 +- libgo/go/runtime/debug.go | 3 - libgo/go/runtime/export_test.go | 43 +- libgo/go/runtime/extern.go | 4 +- libgo/go/runtime/gc_test.go | 143 +++- libgo/go/runtime/hash32.go | 26 + libgo/go/runtime/hash64.go | 22 + libgo/go/runtime/hash_test.go | 34 + libgo/go/runtime/hashmap.go | 695 ++++++++------- libgo/go/runtime/hashmap_fast.go | 927 +++++++++++++++------ libgo/go/runtime/heapdump.go | 11 - libgo/go/runtime/internal/atomic/atomic_test.go | 2 +- libgo/go/runtime/internal/sys/sys.go | 4 +- libgo/go/runtime/lock_sema.go | 8 +- libgo/go/runtime/malloc.go | 82 +- libgo/go/runtime/malloc_test.go | 5 +- libgo/go/runtime/map_test.go | 222 ++++- libgo/go/runtime/mbarrier.go | 58 +- libgo/go/runtime/mbitmap.go | 26 +- libgo/go/runtime/mcache.go | 3 +- libgo/go/runtime/mem_gccgo.go | 54 +- libgo/go/runtime/memmove_test.go | 9 + libgo/go/runtime/mfinal.go | 15 +- libgo/go/runtime/mfinal_test.go | 21 + libgo/go/runtime/mgc.go | 322 ++++--- libgo/go/runtime/mgc_gccgo.go | 23 +- libgo/go/runtime/mgclarge.go | 4 +- libgo/go/runtime/mgcmark.go | 47 +- libgo/go/runtime/mgcwork.go | 56 +- libgo/go/runtime/mheap.go | 81 +- libgo/go/runtime/mksizeclasses.go | 13 +- libgo/go/runtime/mstats.go | 16 +- libgo/go/runtime/mwbbuf.go | 248 ++++++ libgo/go/runtime/netpoll_kqueue.go | 19 +- libgo/go/runtime/netpoll_windows.go | 2 +- libgo/go/runtime/os_freebsd.go | 11 + libgo/go/runtime/os_linux.go | 71 +- libgo/go/runtime/os_linux_ppc64x.go | 53 +- libgo/go/runtime/os_netbsd.go | 16 +- libgo/go/runtime/panic.go | 27 +- libgo/go/runtime/pprof/pprof.go | 94 ++- libgo/go/runtime/pprof/pprof_test.go | 291 +++++-- libgo/go/runtime/pprof/proto.go | 4 +- libgo/go/runtime/print.go | 18 +- 
libgo/go/runtime/proc.go | 641 ++++++++++---- libgo/go/runtime/proc_runtime_test.go | 2 - libgo/go/runtime/proc_test.go | 153 +++- libgo/go/runtime/runtime-lldb_test.go | 84 +- libgo/go/runtime/runtime.go | 6 + libgo/go/runtime/runtime1.go | 11 - libgo/go/runtime/runtime2.go | 141 ++-- libgo/go/runtime/runtime_mmap_test.go | 29 +- libgo/go/runtime/runtime_test.go | 9 +- libgo/go/runtime/rwmutex_test.go | 5 + libgo/go/runtime/select.go | 76 +- libgo/go/runtime/sema.go | 5 +- libgo/go/runtime/signal_gccgo.go | 5 - libgo/go/runtime/signal_sighandler.go | 6 +- libgo/go/runtime/signal_unix.go | 95 ++- libgo/go/runtime/sigqueue.go | 35 +- libgo/go/runtime/sizeclasses.go | 134 +-- libgo/go/runtime/slice.go | 42 +- libgo/go/runtime/string.go | 6 +- libgo/go/runtime/stubs.go | 29 +- libgo/go/runtime/stubs2.go | 7 + libgo/go/runtime/testdata/testprog/gc.go | 3 + libgo/go/runtime/testdata/testprog/gettid.go | 29 + libgo/go/runtime/testdata/testprog/gettid_none.go | 15 + libgo/go/runtime/testdata/testprog/lockosthread.go | 94 +++ .../runtime/testdata/testprog/syscall_windows.go | 45 +- libgo/go/runtime/testdata/testprogcgo/callback.go | 6 +- .../go/runtime/testdata/testprogcgo/catchpanic.go | 46 + libgo/go/runtime/testdata/testprogcgo/cgo.go | 6 +- .../go/runtime/testdata/testprogcgo/lockosthread.c | 13 + .../runtime/testdata/testprogcgo/lockosthread.go | 111 +++ libgo/go/runtime/testdata/testprogcgo/sigstack.go | 95 +++ .../runtime/testdata/testprogcgo/stack_windows.go | 54 ++ libgo/go/runtime/time.go | 256 ++++-- libgo/go/runtime/trace.go | 112 +-- libgo/go/runtime/trace/example_test.go | 41 + libgo/go/runtime/trace/trace.go | 37 +- libgo/go/runtime/trace/trace_test.go | 57 ++ libgo/go/runtime/traceback_gccgo.go | 4 +- 94 files changed, 4855 insertions(+), 1973 deletions(-) create mode 100644 libgo/go/runtime/mwbbuf.go create mode 100644 libgo/go/runtime/testdata/testprog/gettid.go create mode 100644 libgo/go/runtime/testdata/testprog/gettid_none.go create mode 100644 libgo/go/runtime/testdata/testprog/lockosthread.go create mode 100644 libgo/go/runtime/testdata/testprogcgo/catchpanic.go create mode 100644 libgo/go/runtime/testdata/testprogcgo/lockosthread.c create mode 100644 libgo/go/runtime/testdata/testprogcgo/lockosthread.go create mode 100644 libgo/go/runtime/testdata/testprogcgo/sigstack.go create mode 100644 libgo/go/runtime/testdata/testprogcgo/stack_windows.go create mode 100644 libgo/go/runtime/trace/example_test.go (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/alg.go b/libgo/go/runtime/alg.go index 174320f..7c98f1b 100644 --- a/libgo/go/runtime/alg.go +++ b/libgo/go/runtime/alg.go @@ -57,18 +57,15 @@ const ( func memhash0(p unsafe.Pointer, h uintptr) uintptr { return h } + func memhash8(p unsafe.Pointer, h uintptr) uintptr { return memhash(p, h, 1) } + func memhash16(p unsafe.Pointer, h uintptr) uintptr { return memhash(p, h, 2) } -func memhash32(p unsafe.Pointer, h uintptr) uintptr { - return memhash(p, h, 4) -} -func memhash64(p unsafe.Pointer, h uintptr) uintptr { - return memhash(p, h, 8) -} + func memhash128(p unsafe.Pointer, h uintptr) uintptr { return memhash(p, h, 16) } diff --git a/libgo/go/runtime/append_test.go b/libgo/go/runtime/append_test.go index 6bd8f3b..ef1e812 100644 --- a/libgo/go/runtime/append_test.go +++ b/libgo/go/runtime/append_test.go @@ -18,42 +18,52 @@ func BenchmarkMakeSlice(b *testing.B) { } } -func BenchmarkGrowSliceBytes(b *testing.B) { - b.StopTimer() - var x = make([]byte, 9) - b.StartTimer() - for i := 0; i < b.N; i++ { - _ = 
append([]byte(nil), x...) - } -} - -func BenchmarkGrowSliceInts(b *testing.B) { - b.StopTimer() - var x = make([]int, 9) - b.StartTimer() - for i := 0; i < b.N; i++ { - _ = append([]int(nil), x...) - } -} - -func BenchmarkGrowSlicePtr(b *testing.B) { - b.StopTimer() - var x = make([]*byte, 9) - b.StartTimer() - for i := 0; i < b.N; i++ { - _ = append([]*byte(nil), x...) - } -} +type ( + struct24 struct{ a, b, c int64 } + struct32 struct{ a, b, c, d int64 } + struct40 struct{ a, b, c, d, e int64 } +) -type struct24 struct{ a, b, c int64 } +func BenchmarkGrowSlice(b *testing.B) { + b.Run("Byte", func(b *testing.B) { + x := make([]byte, 9) + for i := 0; i < b.N; i++ { + _ = append([]byte(nil), x...) + } + }) + b.Run("Int", func(b *testing.B) { + x := make([]int, 9) + for i := 0; i < b.N; i++ { + _ = append([]int(nil), x...) + } + }) + b.Run("Ptr", func(b *testing.B) { + x := make([]*byte, 9) + for i := 0; i < b.N; i++ { + _ = append([]*byte(nil), x...) + } + }) + b.Run("Struct", func(b *testing.B) { + b.Run("24", func(b *testing.B) { + x := make([]struct24, 9) + for i := 0; i < b.N; i++ { + _ = append([]struct24(nil), x...) + } + }) + b.Run("32", func(b *testing.B) { + x := make([]struct32, 9) + for i := 0; i < b.N; i++ { + _ = append([]struct32(nil), x...) + } + }) + b.Run("40", func(b *testing.B) { + x := make([]struct40, 9) + for i := 0; i < b.N; i++ { + _ = append([]struct40(nil), x...) + } + }) -func BenchmarkGrowSliceStruct24Bytes(b *testing.B) { - b.StopTimer() - var x = make([]struct24, 9) - b.StartTimer() - for i := 0; i < b.N; i++ { - _ = append([]struct24(nil), x...) - } + }) } func BenchmarkAppend(b *testing.B) { diff --git a/libgo/go/runtime/cgo_gccgo.go b/libgo/go/runtime/cgo_gccgo.go index c3bf955..05be496 100644 --- a/libgo/go/runtime/cgo_gccgo.go +++ b/libgo/go/runtime/cgo_gccgo.go @@ -27,6 +27,13 @@ var iscgo bool // The extra M must be created before any C/C++ code calls cgocallback. var cgoHasExtraM bool +// cgoAlwaysFalse is a boolean value that is always false. +// The cgo-generated code says if cgoAlwaysFalse { cgoUse(p) }. +// The compiler cannot see that cgoAlwaysFalse is always false, +// so it emits the test and keeps the call, giving the desired +// escape analysis result. The test is cheaper than the call. +var cgoAlwaysFalse bool + // Cgocall prepares to call from code written in Go to code written in // C/C++. This takes the current goroutine out of the Go scheduler, as // though it were making a system call. Otherwise the program can @@ -37,12 +44,11 @@ var cgoHasExtraM bool // defer syscall.Cgocalldone() // cfunction() func Cgocall() { - lockOSThread() mp := getg().m mp.ncgocall++ mp.ncgo++ - mp.incgo = true entersyscall(0) + mp.incgo = true } // CgocallDone prepares to return to Go code from C/C++ code. @@ -59,8 +65,6 @@ func CgocallDone() { if readgstatus(gp)&^_Gscan == _Gsyscall { exitsyscall(0) } - - unlockOSThread() } // CgocallBack is used when calling from C/C++ code into Go code. @@ -78,6 +82,8 @@ func CgocallBack() { mp.dropextram = true } + lockOSThread() + exitsyscall(0) gp.m.incgo = false @@ -100,6 +106,8 @@ func CgocallBack() { // CgocallBackDone prepares to return to C/C++ code that has called // into Go code. func CgocallBackDone() { + unlockOSThread() + // If we are the top level Go function called from C/C++, then // we need to release the m. 
But don't release it if we are // panicing; since this is the top level, we are going to diff --git a/libgo/go/runtime/cgocall.go b/libgo/go/runtime/cgocall.go index 4a416fb..9d16120 100644 --- a/libgo/go/runtime/cgocall.go +++ b/libgo/go/runtime/cgocall.go @@ -234,10 +234,8 @@ func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) { // No more possible pointers. break } - if hbits.isPointer() { - if cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) { - panic(errorString(msg)) - } + if hbits.isPointer() && cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) { + panic(errorString(msg)) } hbits = hbits.next() } diff --git a/libgo/go/runtime/cgocheck.go b/libgo/go/runtime/cgocheck.go index 30f054b..b85b519 100644 --- a/libgo/go/runtime/cgocheck.go +++ b/libgo/go/runtime/cgocheck.go @@ -16,6 +16,10 @@ const cgoWriteBarrierFail = "Go pointer stored into non-Go memory" // cgoCheckWriteBarrier is called whenever a pointer is stored into memory. // It throws if the program is storing a Go pointer into non-Go memory. +// +// This is called from the write barrier, so its entire call tree must +// be nosplit. +// //go:nosplit //go:nowritebarrier func cgoCheckWriteBarrier(dst *uintptr, src uintptr) { diff --git a/libgo/go/runtime/chan.go b/libgo/go/runtime/chan.go index 7bb919c..8db728d 100644 --- a/libgo/go/runtime/chan.go +++ b/libgo/go/runtime/chan.go @@ -64,11 +64,19 @@ type waitq struct { } //go:linkname reflect_makechan reflect.makechan -func reflect_makechan(t *chantype, size int64) *hchan { +func reflect_makechan(t *chantype, size int) *hchan { return makechan(t, size) } -func makechan(t *chantype, size int64) *hchan { +func makechan64(t *chantype, size int64) *hchan { + if int64(int(size)) != size { + panic(plainError("makechan: size out of range")) + } + + return makechan(t, int(size)) +} + +func makechan(t *chantype, size int) *hchan { elem := t.elem // compiler checks this but be safe. @@ -78,29 +86,33 @@ func makechan(t *chantype, size int64) *hchan { if hchanSize%maxAlign != 0 || elem.align > maxAlign { throw("makechan: bad alignment") } - if size < 0 || int64(uintptr(size)) != size || (elem.size > 0 && uintptr(size) > (_MaxMem-hchanSize)/elem.size) { + + if size < 0 || uintptr(size) > maxSliceCap(elem.size) || uintptr(size)*elem.size > _MaxMem-hchanSize { panic(plainError("makechan: size out of range")) } + // Hchan does not contain pointers interesting for GC when elements stored in buf do not contain pointers. + // buf points into the same allocation, elemtype is persistent. + // SudoG's are referenced from their owning thread so they can't be collected. + // TODO(dvyukov,rlh): Rethink when collector can move allocated objects. var c *hchan - if elem.kind&kindNoPointers != 0 || size == 0 { - // Allocate memory in one call. - // Hchan does not contain pointers interesting for GC in this case: - // buf points into the same allocation, elemtype is persistent. - // SudoG's are referenced from their owning thread so they can't be collected. - // TODO(dvyukov,rlh): Rethink when collector can move allocated objects. + switch { + case size == 0 || elem.size == 0: + // Queue or element size is zero. + c = (*hchan)(mallocgc(hchanSize, nil, true)) + // Race detector uses this location for synchronization. + c.buf = unsafe.Pointer(c) + case elem.kind&kindNoPointers != 0: + // Elements do not contain pointers. + // Allocate hchan and buf in one call. 
c = (*hchan)(mallocgc(hchanSize+uintptr(size)*elem.size, nil, true)) - if size > 0 && elem.size != 0 { - c.buf = add(unsafe.Pointer(c), hchanSize) - } else { - // race detector uses this location for synchronization - // Also prevents us from pointing beyond the allocation (see issue 9401). - c.buf = unsafe.Pointer(c) - } - } else { + c.buf = add(unsafe.Pointer(c), hchanSize) + default: + // Elements contain pointers. c = new(hchan) - c.buf = newarray(elem, int(size)) + c.buf = mallocgc(uintptr(size)*elem.size, elem, true) } + c.elemsize = uint16(elem.size) c.elemtype = elem c.dataqsiz = uint(size) @@ -119,7 +131,7 @@ func chanbuf(c *hchan, i uint) unsafe.Pointer { // entry point for c <- x from compiled code //go:nosplit func chansend1(c *hchan, elem unsafe.Pointer) { - chansend(c, elem, true, getcallerpc(unsafe.Pointer(&c))) + chansend(c, elem, true, getcallerpc()) } /* @@ -223,7 +235,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { mysg.elem = ep mysg.waitlink = nil mysg.g = gp - mysg.selectdone = nil + mysg.isSelect = false mysg.c = c gp.waiting = mysg gp.param = nil @@ -331,7 +343,7 @@ func closechan(c *hchan) { } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&c)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(c), callerpc, funcPC(closechan)) racerelease(unsafe.Pointer(c)) } @@ -508,7 +520,7 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) mysg.waitlink = nil gp.waiting = mysg mysg.g = gp - mysg.selectdone = nil + mysg.isSelect = false mysg.c = c gp.param = nil c.recvq.enqueue(mysg) @@ -603,7 +615,7 @@ func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) { // } // func selectnbsend(c *hchan, elem unsafe.Pointer) (selected bool) { - return chansend(c, elem, false, getcallerpc(unsafe.Pointer(&c))) + return chansend(c, elem, false, getcallerpc()) } // compiler implements @@ -653,7 +665,7 @@ func selectnbrecv2(elem unsafe.Pointer, received *bool, c *hchan) (selected bool //go:linkname reflect_chansend reflect.chansend func reflect_chansend(c *hchan, elem unsafe.Pointer, nb bool) (selected bool) { - return chansend(c, elem, !nb, getcallerpc(unsafe.Pointer(&c))) + return chansend(c, elem, !nb, getcallerpc()) } //go:linkname reflect_chanrecv reflect.chanrecv @@ -712,10 +724,16 @@ func (q *waitq) dequeue() *sudog { sgp.next = nil // mark as removed (see dequeueSudog) } - // if sgp participates in a select and is already signaled, ignore it - if sgp.selectdone != nil { - // claim the right to signal - if *sgp.selectdone != 0 || !atomic.Cas(sgp.selectdone, 0, 1) { + // if a goroutine was put on this queue because of a + // select, there is a small window between the goroutine + // being woken up by a different case and it grabbing the + // channel locks. Once it has the lock + // it removes itself from the queue, so we won't see it after that. + // We use a flag in the G struct to tell us when someone + // else has won the race to signal this goroutine but the goroutine + // hasn't removed itself from the queue yet. 
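	// The atomic CAS on g.selectDone claims the right to signal this goroutine;
	// if another case already won that race, skip this sudog.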
+ if sgp.isSelect { + if !atomic.Cas(&sgp.g.selectDone, 0, 1) { continue } } diff --git a/libgo/go/runtime/chan_test.go b/libgo/go/runtime/chan_test.go index b96af8a..29fb321 100644 --- a/libgo/go/runtime/chan_test.go +++ b/libgo/go/runtime/chan_test.go @@ -5,6 +5,8 @@ package runtime_test import ( + "internal/testenv" + "math" "runtime" "sync" "sync/atomic" @@ -435,6 +437,65 @@ func TestSelectStress(t *testing.T) { wg.Wait() } +func TestSelectFairness(t *testing.T) { + const trials = 10000 + if runtime.GOOS == "linux" && runtime.GOARCH == "ppc64le" { + testenv.SkipFlaky(t, 22047) + } + c1 := make(chan byte, trials+1) + c2 := make(chan byte, trials+1) + for i := 0; i < trials+1; i++ { + c1 <- 1 + c2 <- 2 + } + c3 := make(chan byte) + c4 := make(chan byte) + out := make(chan byte) + done := make(chan byte) + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + for { + var b byte + select { + case b = <-c3: + case b = <-c4: + case b = <-c1: + case b = <-c2: + } + select { + case out <- b: + case <-done: + return + } + } + }() + cnt1, cnt2 := 0, 0 + for i := 0; i < trials; i++ { + switch b := <-out; b { + case 1: + cnt1++ + case 2: + cnt2++ + default: + t.Fatalf("unexpected value %d on channel", b) + } + } + // If the select in the goroutine is fair, + // cnt1 and cnt2 should be about the same value. + // With 10,000 trials, the expected margin of error at + // a confidence level of five nines is 4.4172 / (2 * Sqrt(10000)). + r := float64(cnt1) / trials + e := math.Abs(r - 0.5) + t.Log(cnt1, cnt2, r, e) + if e > 4.4172/(2*math.Sqrt(trials)) { + t.Errorf("unfair select: in %d trials, results were %d, %d", trials, cnt1, cnt2) + } + close(done) + wg.Wait() +} + func TestChanSendInterface(t *testing.T) { type mt struct{} m := &mt{} @@ -674,6 +735,55 @@ done: <-ready2 } +type struct0 struct{} + +func BenchmarkMakeChan(b *testing.B) { + b.Run("Byte", func(b *testing.B) { + var x chan byte + for i := 0; i < b.N; i++ { + x = make(chan byte, 8) + } + close(x) + }) + b.Run("Int", func(b *testing.B) { + var x chan int + for i := 0; i < b.N; i++ { + x = make(chan int, 8) + } + close(x) + }) + b.Run("Ptr", func(b *testing.B) { + var x chan *byte + for i := 0; i < b.N; i++ { + x = make(chan *byte, 8) + } + close(x) + }) + b.Run("Struct", func(b *testing.B) { + b.Run("0", func(b *testing.B) { + var x chan struct0 + for i := 0; i < b.N; i++ { + x = make(chan struct0, 8) + } + close(x) + }) + b.Run("32", func(b *testing.B) { + var x chan struct32 + for i := 0; i < b.N; i++ { + x = make(chan struct32, 8) + } + close(x) + }) + b.Run("40", func(b *testing.B) { + var x chan struct40 + for i := 0; i < b.N; i++ { + x = make(chan struct40, 8) + } + close(x) + }) + }) +} + func BenchmarkChanNonblocking(b *testing.B) { myc := make(chan int) b.RunParallel(func(pb *testing.PB) { diff --git a/libgo/go/runtime/cpuprof.go b/libgo/go/runtime/cpuprof.go index b031b1a..91cdf2b 100644 --- a/libgo/go/runtime/cpuprof.go +++ b/libgo/go/runtime/cpuprof.go @@ -160,6 +160,7 @@ func (p *cpuProfile) addExtra() { funcPC(_ExternalCode) + sys.PCQuantum, } cpuprof.log.write(nil, 0, hdr[:], lostStk[:]) + p.lostExtra = 0 } } diff --git a/libgo/go/runtime/cputicks.go b/libgo/go/runtime/cputicks.go index ee15aca..7e62dc1 100644 --- a/libgo/go/runtime/cputicks.go +++ b/libgo/go/runtime/cputicks.go @@ -4,6 +4,6 @@ package runtime -// careful: cputicks is not guaranteed to be monotonic! In particular, we have +// careful: cputicks is not guaranteed to be monotonic! 
In particular, we have // noticed drift between cpus on certain os/arch combinations. See issue 8976. func cputicks() int64 diff --git a/libgo/go/runtime/crash_cgo_test.go b/libgo/go/runtime/crash_cgo_test.go index b798731..7e14e57 100644 --- a/libgo/go/runtime/crash_cgo_test.go +++ b/libgo/go/runtime/crash_cgo_test.go @@ -13,6 +13,7 @@ import ( "os" "os/exec" "runtime" + "strconv" "strings" "testing" "time" @@ -113,7 +114,7 @@ func TestCgoExternalThreadSIGPROF(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "CgoExternalThreadSIGPROF")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "CgoExternalThreadSIGPROF")).CombinedOutput() if err != nil { t.Fatalf("exit status: %v\n%s", err, got) } @@ -136,7 +137,7 @@ func TestCgoExternalThreadSignal(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "CgoExternalThreadSIGPROF")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "CgoExternalThreadSIGPROF")).CombinedOutput() if err != nil { t.Fatalf("exit status: %v\n%s", err, got) } @@ -203,14 +204,14 @@ func TestCgoCheckBytes(t *testing.T) { const tries = 10 var tot1, tot2 time.Duration for i := 0; i < tries; i++ { - cmd := testEnv(exec.Command(exe, "CgoCheckBytes")) + cmd := testenv.CleanCmdEnv(exec.Command(exe, "CgoCheckBytes")) cmd.Env = append(cmd.Env, "GODEBUG=cgocheck=0", fmt.Sprintf("GO_CGOCHECKBYTES_TRY=%d", i)) start := time.Now() cmd.Run() d1 := time.Since(start) - cmd = testEnv(exec.Command(exe, "CgoCheckBytes")) + cmd = testenv.CleanCmdEnv(exec.Command(exe, "CgoCheckBytes")) cmd.Env = append(cmd.Env, fmt.Sprintf("GO_CGOCHECKBYTES_TRY=%d", i)) start = time.Now() @@ -251,7 +252,7 @@ func TestCgoCCodeSIGPROF(t *testing.T) { func TestCgoCrashTraceback(t *testing.T) { t.Parallel() - if runtime.GOOS != "linux" || runtime.GOARCH != "amd64" { + if runtime.GOOS != "linux" || (runtime.GOARCH != "amd64" && runtime.GOARCH != "ppc64le") { t.Skipf("not yet supported on %s/%s", runtime.GOOS, runtime.GOARCH) } if runtime.Compiler == "gccgo" { @@ -279,7 +280,7 @@ func TestCgoTracebackContext(t *testing.T) { func testCgoPprof(t *testing.T, buildArg, runArg string) { t.Parallel() - if runtime.GOOS != "linux" || runtime.GOARCH != "amd64" { + if runtime.GOOS != "linux" || (runtime.GOARCH != "amd64" && runtime.GOARCH != "ppc64le") { t.Skipf("not yet supported on %s/%s", runtime.GOOS, runtime.GOARCH) } if runtime.Compiler == "gccgo" { @@ -292,7 +293,7 @@ func testCgoPprof(t *testing.T, buildArg, runArg string) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, runArg)).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, runArg)).CombinedOutput() if err != nil { if testenv.Builder() == "linux-amd64-alpine" { // See Issue 18243 and Issue 19938. @@ -304,7 +305,7 @@ func testCgoPprof(t *testing.T, buildArg, runArg string) { defer os.Remove(fn) for try := 0; try < 2; try++ { - cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-top", "-nodecount=1")) + cmd := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-top", "-nodecount=1")) // Check that pprof works both with and without explicit executable on command line. 
if try == 0 { cmd.Args = append(cmd.Args, exe, fn) @@ -339,7 +340,7 @@ func TestCgoPprof(t *testing.T) { } func TestCgoPprofPIE(t *testing.T) { - testCgoPprof(t, "-ldflags=-extldflags=-pie", "CgoPprof") + testCgoPprof(t, "-buildmode=pie", "CgoPprof") } func TestCgoPprofThread(t *testing.T) { @@ -371,7 +372,7 @@ func TestRaceProf(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "CgoRaceprof")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "CgoRaceprof")).CombinedOutput() if err != nil { t.Fatal(err) } @@ -400,7 +401,7 @@ func TestRaceSignal(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "CgoRaceSignal")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "CgoRaceSignal")).CombinedOutput() if err != nil { t.Logf("%s\n", got) t.Fatal(err) @@ -423,3 +424,72 @@ func TestCgoNumGoroutine(t *testing.T) { t.Errorf("expected %q got %v", want, got) } } + +func TestCatchPanic(t *testing.T) { + t.Parallel() + switch runtime.GOOS { + case "plan9", "windows": + t.Skipf("no signals on %s", runtime.GOOS) + case "darwin": + if runtime.GOARCH == "amd64" { + t.Skipf("crash() on darwin/amd64 doesn't raise SIGABRT") + } + } + + testenv.MustHaveGoRun(t) + + exe, err := buildTestProg(t, "testprogcgo") + if err != nil { + t.Fatal(err) + } + + for _, early := range []bool{true, false} { + cmd := testenv.CleanCmdEnv(exec.Command(exe, "CgoCatchPanic")) + // Make sure a panic results in a crash. + cmd.Env = append(cmd.Env, "GOTRACEBACK=crash") + if early { + // Tell testprogcgo to install an early signal handler for SIGABRT + cmd.Env = append(cmd.Env, "CGOCATCHPANIC_EARLY_HANDLER=1") + } + if out, err := cmd.CombinedOutput(); err != nil { + t.Errorf("testprogcgo CgoCatchPanic failed: %v\n%s", err, out) + } + } +} + +func TestCgoLockOSThreadExit(t *testing.T) { + switch runtime.GOOS { + case "plan9", "windows": + t.Skipf("no pthreads on %s", runtime.GOOS) + } + t.Parallel() + testLockOSThreadExit(t, "testprogcgo") +} + +func TestWindowsStackMemoryCgo(t *testing.T) { + if runtime.GOOS != "windows" { + t.Skip("skipping windows specific test") + } + testenv.SkipFlaky(t, 22575) + o := runTestProg(t, "testprogcgo", "StackMemory") + stackUsage, err := strconv.Atoi(o) + if err != nil { + t.Fatalf("Failed to read stack usage: %v", err) + } + if expected, got := 100<<10, stackUsage; got > expected { + t.Fatalf("expected < %d bytes of memory per thread, got %d", expected, got) + } +} + +func TestSigStackSwapping(t *testing.T) { + switch runtime.GOOS { + case "plan9", "windows": + t.Skip("no sigaltstack on %s", runtime.GOOS) + } + t.Parallel() + got := runTestProg(t, "testprogcgo", "SigStack") + want := "OK\n" + if got != want { + t.Errorf("expected %q got %v", want, got) + } +} diff --git a/libgo/go/runtime/crash_test.go b/libgo/go/runtime/crash_test.go index 1cde6bf..8ec0348 100644 --- a/libgo/go/runtime/crash_test.go +++ b/libgo/go/runtime/crash_test.go @@ -32,25 +32,6 @@ func TestMain(m *testing.M) { os.Exit(status) } -func testEnv(cmd *exec.Cmd) *exec.Cmd { - if cmd.Env != nil { - panic("environment already set") - } - for _, env := range os.Environ() { - // Exclude GODEBUG from the environment to prevent its output - // from breaking tests that are trying to parse other command output. - if strings.HasPrefix(env, "GODEBUG=") { - continue - } - // Exclude GOTRACEBACK for the same reason. 
- if strings.HasPrefix(env, "GOTRACEBACK=") { - continue - } - cmd.Env = append(cmd.Env, env) - } - return cmd -} - var testprog struct { sync.Mutex dir string @@ -62,7 +43,11 @@ type buildexe struct { err error } -func runTestProg(t *testing.T, binary, name string) string { +func runTestProg(t *testing.T, binary, name string, env ...string) string { + if *flagQuick { + t.Skip("-quick") + } + testenv.MustHaveGoBuild(t) exe, err := buildTestProg(t, binary) @@ -70,7 +55,11 @@ func runTestProg(t *testing.T, binary, name string) string { t.Fatal(err) } - cmd := testEnv(exec.Command(exe, name)) + cmd := testenv.CleanCmdEnv(exec.Command(exe, name)) + cmd.Env = append(cmd.Env, env...) + if testing.Short() { + cmd.Env = append(cmd.Env, "RUNTIME_TEST_SHORT=1") + } var b bytes.Buffer cmd.Stdout = &b cmd.Stderr = &b @@ -111,6 +100,10 @@ func runTestProg(t *testing.T, binary, name string) string { } func buildTestProg(t *testing.T, binary string, flags ...string) (string, error) { + if *flagQuick { + t.Skip("-quick") + } + checkStaleRuntime(t) testprog.Lock() @@ -139,7 +132,7 @@ func buildTestProg(t *testing.T, binary string, flags ...string) (string, error) exe := filepath.Join(testprog.dir, name+".exe") cmd := exec.Command(testenv.GoToolPath(t), append([]string{"build", "-o", exe}, flags...)...) cmd.Dir = "testdata/" + binary - out, err := testEnv(cmd).CombinedOutput() + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() if err != nil { target.err = fmt.Errorf("building %s %v: %v\n%s", binary, flags, err, out) testprog.target[name] = target @@ -158,14 +151,14 @@ var ( func checkStaleRuntime(t *testing.T) { staleRuntimeOnce.Do(func() { // 'go run' uses the installed copy of runtime.a, which may be out of date. - out, err := testEnv(exec.Command(testenv.GoToolPath(t), "list", "-f", "{{.Stale}}", "runtime")).CombinedOutput() + out, err := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "list", "-gcflags=all="+os.Getenv("GO_GCFLAGS"), "-f", "{{.Stale}}", "runtime")).CombinedOutput() if err != nil { staleRuntimeErr = fmt.Errorf("failed to execute 'go list': %v\n%v", err, string(out)) return } if string(out) != "false\n" { t.Logf("go list -f {{.Stale}} runtime:\n%s", out) - out, err := testEnv(exec.Command(testenv.GoToolPath(t), "list", "-f", "{{.StaleReason}}", "runtime")).CombinedOutput() + out, err := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "list", "-gcflags=all="+os.Getenv("GO_GCFLAGS"), "-f", "{{.StaleReason}}", "runtime")).CombinedOutput() if err != nil { t.Logf("go list -f {{.StaleReason}} failed: %v", err) } @@ -483,7 +476,7 @@ func TestMemPprof(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "MemProf")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "MemProf")).CombinedOutput() if err != nil { t.Fatal(err) } @@ -491,7 +484,7 @@ func TestMemPprof(t *testing.T) { defer os.Remove(fn) for try := 0; try < 2; try++ { - cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-alloc_space", "-top")) + cmd := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-alloc_space", "-top")) // Check that pprof works both with and without explicit executable on command line. 
if try == 0 { cmd.Args = append(cmd.Args, exe, fn) @@ -606,7 +599,7 @@ func TestPanicRace(t *testing.T) { const tries = 10 retry: for i := 0; i < tries; i++ { - got, err := testEnv(exec.Command(exe, "PanicRace")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "PanicRace")).CombinedOutput() if err == nil { t.Logf("try %d: program exited successfully, should have failed", i+1) continue diff --git a/libgo/go/runtime/crash_unix_test.go b/libgo/go/runtime/crash_unix_test.go index 09c2547..584a6c7 100644 --- a/libgo/go/runtime/crash_unix_test.go +++ b/libgo/go/runtime/crash_unix_test.go @@ -65,13 +65,13 @@ func TestCrashDumpsAllThreads(t *testing.T) { cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe") cmd.Dir = dir - out, err := testEnv(cmd).CombinedOutput() + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() if err != nil { t.Fatalf("building source: %v\n%s", err, out) } cmd = exec.Command(filepath.Join(dir, "a.exe")) - cmd = testEnv(cmd) + cmd = testenv.CleanCmdEnv(cmd) cmd.Env = append(cmd.Env, "GOTRACEBACK=crash") // Set GOGC=off. Because of golang.org/issue/10958, the tight @@ -132,6 +132,7 @@ import ( "fmt" "os" "runtime" + "time" ) func main() { @@ -149,6 +150,8 @@ func main() { <-c } + time.Sleep(time.Millisecond) + // Tell our parent that all the goroutines are executing. if _, err := os.NewFile(3, "pipe").WriteString("x"); err != nil { fmt.Fprintf(os.Stderr, "write to pipe failed: %v\n", err) @@ -184,7 +187,7 @@ func TestPanicSystemstack(t *testing.T) { t.Parallel() cmd := exec.Command(os.Args[0], "testPanicSystemstackInternal") - cmd = testEnv(cmd) + cmd = testenv.CleanCmdEnv(cmd) cmd.Env = append(cmd.Env, "GOTRACEBACK=crash") pr, pw, err := os.Pipe() if err != nil { @@ -249,7 +252,7 @@ func TestSignalExitStatus(t *testing.T) { if err != nil { t.Fatal(err) } - err = testEnv(exec.Command(exe, "SignalExitStatus")).Run() + err = testenv.CleanCmdEnv(exec.Command(exe, "SignalExitStatus")).Run() if err == nil { t.Error("test program succeeded unexpectedly") } else if ee, ok := err.(*exec.ExitError); !ok { diff --git a/libgo/go/runtime/debug.go b/libgo/go/runtime/debug.go index fdd7346..7cddd29 100644 --- a/libgo/go/runtime/debug.go +++ b/libgo/go/runtime/debug.go @@ -15,9 +15,6 @@ import ( // The number of logical CPUs on the local machine can be queried with NumCPU. // This call will go away when the scheduler improves. 
func GOMAXPROCS(n int) int { - if n > _MaxGomaxprocs { - n = _MaxGomaxprocs - } lock(&sched.lock) ret := int(gomaxprocs) unlock(&sched.lock) diff --git a/libgo/go/runtime/export_test.go b/libgo/go/runtime/export_test.go index 6325dcb..e385f14 100644 --- a/libgo/go/runtime/export_test.go +++ b/libgo/go/runtime/export_test.go @@ -149,12 +149,19 @@ func RunSchedLocalQueueEmptyTest(iters int) { } } -var StringHash = stringHash -var BytesHash = bytesHash -var Int32Hash = int32Hash -var Int64Hash = int64Hash -var EfaceHash = efaceHash -var IfaceHash = ifaceHash +var ( + StringHash = stringHash + BytesHash = bytesHash + Int32Hash = int32Hash + Int64Hash = int64Hash + MemHash = memhash + MemHash32 = memhash32 + MemHash64 = memhash64 + EfaceHash = efaceHash + IfaceHash = ifaceHash +) + +var UseAeshash = &useAeshash func MemclrBytes(b []byte) { s := (*slice)(unsafe.Pointer(&b)) @@ -364,3 +371,27 @@ func (rw *RWMutex) Lock() { func (rw *RWMutex) Unlock() { rw.rw.unlock() } + +func MapBucketsCount(m map[int]int) int { + h := *(**hmap)(unsafe.Pointer(&m)) + return 1 << h.B +} + +func MapBucketsPointerIsNil(m map[int]int) bool { + h := *(**hmap)(unsafe.Pointer(&m)) + return h.buckets == nil +} + +func LockOSCounts() (external, internal uint32) { + g := getg() + if g.m.lockedExt+g.m.lockedInt == 0 { + if g.lockedm != 0 { + panic("lockedm on non-locked goroutine") + } + } else { + if g.lockedm == 0 { + panic("nil lockedm on locked goroutine") + } + } + return g.m.lockedExt, g.m.lockedInt +} diff --git a/libgo/go/runtime/extern.go b/libgo/go/runtime/extern.go index 6ca9789..36787e3 100644 --- a/libgo/go/runtime/extern.go +++ b/libgo/go/runtime/extern.go @@ -184,8 +184,8 @@ func Caller(skip int) (pc uintptr, file string, line int, ok bool) // program counter adjustment. func Callers(skip int, pc []uintptr) int -// GOROOT returns the root of the Go tree. -// It uses the GOROOT environment variable, if set, +// GOROOT returns the root of the Go tree. It uses the +// GOROOT environment variable, if set at process start, // or else the root used during the Go build. func GOROOT() string { s := gogetenv("GOROOT") diff --git a/libgo/go/runtime/gc_test.go b/libgo/go/runtime/gc_test.go index f14e0d5..a8c52d2 100644 --- a/libgo/go/runtime/gc_test.go +++ b/libgo/go/runtime/gc_test.go @@ -10,12 +10,14 @@ import ( "reflect" "runtime" "runtime/debug" + "sync/atomic" "testing" "time" "unsafe" ) func TestGcSys(t *testing.T) { + t.Skip("does not test anything; https://golang.org/issue/23343") if os.Getenv("GOGC") == "off" { t.Skip("skipping test; GOGC=off in environment") } @@ -171,7 +173,7 @@ func TestPeriodicGC(t *testing.T) { // slack if things are slow. var numGCs uint32 const want = 2 - for i := 0; i < 20 && numGCs < want; i++ { + for i := 0; i < 200 && numGCs < want; i++ { time.Sleep(5 * time.Millisecond) // Test that periodic GC actually happened. @@ -501,3 +503,142 @@ func BenchmarkReadMemStats(b *testing.B) { hugeSink = nil } + +func TestUserForcedGC(t *testing.T) { + // Test that runtime.GC() triggers a GC even if GOGC=off. 
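+	// SetGCPercent(-1) disables the collector and returns the previous setting;
+	// the deferred call restores it when the test exits.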
+ defer debug.SetGCPercent(debug.SetGCPercent(-1)) + + var ms1, ms2 runtime.MemStats + runtime.ReadMemStats(&ms1) + runtime.GC() + runtime.ReadMemStats(&ms2) + if ms1.NumGC == ms2.NumGC { + t.Fatalf("runtime.GC() did not trigger GC") + } + if ms1.NumForcedGC == ms2.NumForcedGC { + t.Fatalf("runtime.GC() was not accounted in NumForcedGC") + } +} + +func writeBarrierBenchmark(b *testing.B, f func()) { + runtime.GC() + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + //b.Logf("heap size: %d MB", ms.HeapAlloc>>20) + + // Keep GC running continuously during the benchmark, which in + // turn keeps the write barrier on continuously. + var stop uint32 + done := make(chan bool) + go func() { + for atomic.LoadUint32(&stop) == 0 { + runtime.GC() + } + close(done) + }() + defer func() { + atomic.StoreUint32(&stop, 1) + <-done + }() + + b.ResetTimer() + f() + b.StopTimer() +} + +func BenchmarkWriteBarrier(b *testing.B) { + if runtime.GOMAXPROCS(-1) < 2 { + // We don't want GC to take our time. + b.Skip("need GOMAXPROCS >= 2") + } + + // Construct a large tree both so the GC runs for a while and + // so we have a data structure to manipulate the pointers of. + type node struct { + l, r *node + } + var wbRoots []*node + var mkTree func(level int) *node + mkTree = func(level int) *node { + if level == 0 { + return nil + } + n := &node{mkTree(level - 1), mkTree(level - 1)} + if level == 10 { + // Seed GC with enough early pointers so it + // doesn't accidentally switch to mark 2 when + // it only has the top of the tree. + wbRoots = append(wbRoots, n) + } + return n + } + const depth = 22 // 64 MB + root := mkTree(22) + + writeBarrierBenchmark(b, func() { + var stack [depth]*node + tos := -1 + + // There are two write barriers per iteration, so i+=2. + for i := 0; i < b.N; i += 2 { + if tos == -1 { + stack[0] = root + tos = 0 + } + + // Perform one step of reversing the tree. + n := stack[tos] + if n.l == nil { + tos-- + } else { + n.l, n.r = n.r, n.l + stack[tos] = n.l + stack[tos+1] = n.r + tos++ + } + + if i%(1<<12) == 0 { + // Avoid non-preemptible loops (see issue #10958). + runtime.Gosched() + } + } + }) + + runtime.KeepAlive(wbRoots) +} + +func BenchmarkBulkWriteBarrier(b *testing.B) { + if runtime.GOMAXPROCS(-1) < 2 { + // We don't want GC to take our time. + b.Skip("need GOMAXPROCS >= 2") + } + + // Construct a large set of objects we can copy around. + const heapSize = 64 << 20 + type obj [16]*byte + ptrs := make([]*obj, heapSize/unsafe.Sizeof(obj{})) + for i := range ptrs { + ptrs[i] = new(obj) + } + + writeBarrierBenchmark(b, func() { + const blockSize = 1024 + var pos int + for i := 0; i < b.N; i += blockSize { + // Rotate block. 
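+			// Copying a slice of pointers goes through the bulk write
+			// barrier that this benchmark measures.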
+ block := ptrs[pos : pos+blockSize] + first := block[0] + copy(block, block[1:]) + block[blockSize-1] = first + + pos += blockSize + if pos+blockSize > len(ptrs) { + pos = 0 + } + + runtime.Gosched() + } + }) + + runtime.KeepAlive(ptrs) +} diff --git a/libgo/go/runtime/hash32.go b/libgo/go/runtime/hash32.go index dd2e657..401fe28 100644 --- a/libgo/go/runtime/hash32.go +++ b/libgo/go/runtime/hash32.go @@ -86,6 +86,32 @@ tail: return uintptr(h) } +func memhash32(p unsafe.Pointer, seed uintptr) uintptr { + h := uint32(seed + 4*hashkey[0]) + h ^= readUnaligned32(p) + h = rotl_15(h*m1) * m2 + h ^= h >> 17 + h *= m3 + h ^= h >> 13 + h *= m4 + h ^= h >> 16 + return uintptr(h) +} + +func memhash64(p unsafe.Pointer, seed uintptr) uintptr { + h := uint32(seed + 8*hashkey[0]) + h ^= readUnaligned32(p) + h = rotl_15(h*m1) * m2 + h ^= readUnaligned32(add(p, 4)) + h = rotl_15(h*m1) * m2 + h ^= h >> 17 + h *= m3 + h ^= h >> 13 + h *= m4 + h ^= h >> 16 + return uintptr(h) +} + // Note: in order to get the compiler to issue rotl instructions, we // need to constant fold the shift amount by hand. // TODO: convince the compiler to issue rotl instructions after inlining. diff --git a/libgo/go/runtime/hash64.go b/libgo/go/runtime/hash64.go index f7d4a6f2..5912943 100644 --- a/libgo/go/runtime/hash64.go +++ b/libgo/go/runtime/hash64.go @@ -86,6 +86,28 @@ tail: return uintptr(h) } +func memhash32(p unsafe.Pointer, seed uintptr) uintptr { + h := uint64(seed + 4*hashkey[0]) + v := uint64(readUnaligned32(p)) + h ^= v + h ^= v << 32 + h = rotl_31(h*m1) * m2 + h ^= h >> 29 + h *= m3 + h ^= h >> 32 + return uintptr(h) +} + +func memhash64(p unsafe.Pointer, seed uintptr) uintptr { + h := uint64(seed + 8*hashkey[0]) + h ^= uint64(readUnaligned32(p)) | uint64(readUnaligned32(add(p, 4)))<<32 + h = rotl_31(h*m1) * m2 + h ^= h >> 29 + h *= m3 + h ^= h >> 32 + return uintptr(h) +} + // Note: in order to get the compiler to issue rotl instructions, we // need to constant fold the shift amount by hand. // TODO: convince the compiler to issue rotl instructions after inlining. diff --git a/libgo/go/runtime/hash_test.go b/libgo/go/runtime/hash_test.go index 167c49e..54c9160 100644 --- a/libgo/go/runtime/hash_test.go +++ b/libgo/go/runtime/hash_test.go @@ -14,6 +14,40 @@ import ( "unsafe" ) +func TestMemHash32Equality(t *testing.T) { + if *UseAeshash { + t.Skip("skipping since AES hash implementation is used") + } + var b [4]byte + r := rand.New(rand.NewSource(1234)) + seed := uintptr(r.Uint64()) + for i := 0; i < 100; i++ { + randBytes(r, b[:]) + got := MemHash32(unsafe.Pointer(&b), seed) + want := MemHash(unsafe.Pointer(&b), seed, 4) + if got != want { + t.Errorf("MemHash32(%x, %v) = %v; want %v", b, seed, got, want) + } + } +} + +func TestMemHash64Equality(t *testing.T) { + if *UseAeshash { + t.Skip("skipping since AES hash implementation is used") + } + var b [8]byte + r := rand.New(rand.NewSource(1234)) + seed := uintptr(r.Uint64()) + for i := 0; i < 100; i++ { + randBytes(r, b[:]) + got := MemHash64(unsafe.Pointer(&b), seed) + want := MemHash(unsafe.Pointer(&b), seed, 8) + if got != want { + t.Errorf("MemHash64(%x, %v) = %v; want %v", b, seed, got, want) + } + } +} + // Smhasher is a torture test for hash functions. // https://code.google.com/p/smhasher/ // This code is a port of some of the Smhasher tests to Go. 
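The hashmap.go changes that follow introduce small helpers (bucketShift, bucketMask, tophash) in place of open-coded shifts, masks, and top-byte computations. The standalone sketch below mirrors those helpers so they can be tried outside the runtime; the ptrSize and minTopHash values here are assumptions chosen for illustration, not taken from this patch.

package main

import "fmt"

const (
	ptrSize    = 8 // assumption: sys.PtrSize on a 64-bit platform
	minTopHash = 4 // assumption: smallest tophash value not reserved as a bucket marker
)

// bucketShift returns 1<<b, the number of buckets for a map with B == b.
func bucketShift(b uint8) uintptr { return uintptr(1) << b }

// bucketMask returns bucketShift(b) - 1, used to pick a bucket from a hash.
func bucketMask(b uint8) uintptr { return bucketShift(b) - 1 }

// tophash keeps the top byte of the hash, bumped past the reserved marker values.
func tophash(hash uintptr) uint8 {
	top := uint8(hash >> (ptrSize*8 - 8))
	if top < minTopHash {
		top += minTopHash
	}
	return top
}

func main() {
	h := uintptr(0x9e3779b97f4a7c15) // an arbitrary 64-bit hash value
	fmt.Printf("bucket %d, tophash %#x\n", h&bucketMask(5), tophash(h))
}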
diff --git a/libgo/go/runtime/hashmap.go b/libgo/go/runtime/hashmap.go index a3e50cd..a1fe49e 100644 --- a/libgo/go/runtime/hashmap.go +++ b/libgo/go/runtime/hashmap.go @@ -63,6 +63,8 @@ import ( // themselves, so that the compiler will export them. // //go:linkname makemap runtime.makemap +//go:linkname makemap64 runtime.makemap64 +//go:linkname makemap_small runtime.makemap_small //go:linkname mapaccess1 runtime.mapaccess1 //go:linkname mapaccess2 runtime.mapaccess2 //go:linkname mapaccess1_fat runtime.mapaccess1_fat @@ -77,8 +79,10 @@ const ( bucketCntBits = 3 bucketCnt = 1 << bucketCntBits - // Maximum average load of a bucket that triggers growth. - loadFactor = 6.5 + // Maximum average load of a bucket that triggers growth is 6.5. + // Represent as loadFactorNum/loadFactDen, to allow integer math. + loadFactorNum = 13 + loadFactorDen = 2 // Maximum key or value size to keep inline (instead of mallocing per element). // Must fit in a uint8. @@ -137,12 +141,13 @@ type mapextra struct { // If both key and value do not contain pointers and are inline, then we mark bucket // type as containing no pointers. This avoids scanning such maps. // However, bmap.overflow is a pointer. In order to keep overflow buckets - // alive, we store pointers to all overflow buckets in hmap.overflow. - // Overflow is used only if key and value do not contain pointers. - // overflow[0] contains overflow buckets for hmap.buckets. - // overflow[1] contains overflow buckets for hmap.oldbuckets. + // alive, we store pointers to all overflow buckets in hmap.overflow and h.map.oldoverflow. + // overflow and oldoverflow are only used if key and value do not contain pointers. + // overflow contains overflow buckets for hmap.buckets. + // oldoverflow contains overflow buckets for hmap.oldbuckets. // The indirection allows to store a pointer to the slice in hiter. - overflow [2]*[]*bmap + overflow *[]*bmap + oldoverflow *[]*bmap // nextOverflow holds a pointer to a free overflow bucket. nextOverflow *bmap @@ -171,7 +176,8 @@ type hiter struct { h *hmap buckets unsafe.Pointer // bucket ptr at hash_iter initialization time bptr *bmap // current bucket - overflow [2]*[]*bmap // keeps overflow buckets alive + overflow *[]*bmap // keeps overflow buckets of hmap.buckets alive + oldoverflow *[]*bmap // keeps overflow buckets of hmap.oldbuckets alive startBucket uintptr // bucket iteration started at offset uint8 // intra-bucket offset to start from during iteration (should be big enough to hold bucketCnt-1) wrapped bool // already wrapped around from end of bucket array to beginning @@ -181,6 +187,28 @@ type hiter struct { checkBucket uintptr } +// bucketShift returns 1<> (sys.PtrSize*8 - 8)) + if top < minTopHash { + top += minTopHash + } + return top +} + func evacuated(b *bmap) bool { h := b.tophash[0] return h > empty && h < minTopHash @@ -194,6 +222,10 @@ func (b *bmap) setoverflow(t *maptype, ovf *bmap) { *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-sys.PtrSize)) = ovf } +func (b *bmap) keys() unsafe.Pointer { + return add(unsafe.Pointer(b), dataOffset) +} + // incrnoverflow increments h.noverflow. // noverflow counts the number of overflow buckets. // This is used to trigger same-size map growth. 
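The loadFactorNum/loadFactorDen pair introduced above replaces the floating-point loadFactor constant so the growth checks in makemap and mapassign further down can stay in integer math. Below is a hedged sketch of such a check using those constants; the real overLoadFactor body is outside this excerpt.

package main

import "fmt"

// overLoadFactorSketch reports whether count items spread over 1<<B buckets
// exceed the 6.5 average load factor, using only integer arithmetic
// (loadFactorNum/loadFactorDen == 13/2 == 6.5).
func overLoadFactorSketch(count int, B uint8) bool {
	const (
		bucketCnt     = 8 // entries per bucket, as in the diff above
		loadFactorNum = 13
		loadFactorDen = 2
	)
	return count > bucketCnt && uintptr(count) > loadFactorNum*((uintptr(1)<<B)/loadFactorDen)
}

func main() {
	// With B == 3 there are 8 buckets, so the threshold is 8 * 6.5 = 52 entries.
	fmt.Println(overLoadFactorSketch(52, 3), overLoadFactorSketch(53, 3)) // false true
}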
@@ -242,7 +274,7 @@ func (h *hmap) newoverflow(t *maptype, b *bmap) *bmap { h.incrnoverflow() if t.bucket.kind&kindNoPointers != 0 { h.createOverflow() - *h.extra.overflow[0] = append(*h.extra.overflow[0], ovf) + *h.extra.overflow = append(*h.extra.overflow, ovf) } b.setoverflow(t, ovf) return ovf @@ -252,97 +284,69 @@ func (h *hmap) createOverflow() { if h.extra == nil { h.extra = new(mapextra) } - if h.extra.overflow[0] == nil { - h.extra.overflow[0] = new([]*bmap) + if h.extra.overflow == nil { + h.extra.overflow = new([]*bmap) } } -// makemap implements a Go map creation make(map[k]v, hint) +func makemap64(t *maptype, hint int64, h *hmap) *hmap { + if int64(int(hint)) != hint { + hint = 0 + } + return makemap(t, int(hint), h) +} + +// makehmap_small implements Go map creation for make(map[k]v) and +// make(map[k]v, hint) when hint is known to be at most bucketCnt +// at compile time and the map needs to be allocated on the heap. +func makemap_small() *hmap { + h := new(hmap) + h.hash0 = fastrand() + return h +} + +// makemap implements Go map creation for make(map[k]v, hint). // If the compiler has determined that the map or the first bucket // can be created on the stack, h and/or bucket may be non-nil. // If h != nil, the map can be created directly in h. -// If bucket != nil, bucket can be used as the first bucket. -func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap { - if sz := unsafe.Sizeof(hmap{}); sz > 48 || sz != t.hmap.size { +// If h.buckets != nil, bucket pointed to can be used as the first bucket. +func makemap(t *maptype, hint int, h *hmap) *hmap { + // The size of hmap should be 48 bytes on 64 bit + // and 28 bytes on 32 bit platforms. + if sz := unsafe.Sizeof(hmap{}); sz != 8+5*sys.PtrSize { println("runtime: sizeof(hmap) =", sz, ", t.hmap.size =", t.hmap.size) throw("bad hmap size") } - if hint < 0 || hint > int64(maxSliceCap(t.bucket.size)) { + if hint < 0 || hint > int(maxSliceCap(t.bucket.size)) { hint = 0 } - if !ismapkey(t.key) { - throw("runtime.makemap: unsupported map key type") - } - - // check compiler's and reflect's math - if t.key.size > maxKeySize && (!t.indirectkey || t.keysize != uint8(sys.PtrSize)) || - t.key.size <= maxKeySize && (t.indirectkey || t.keysize != uint8(t.key.size)) { - throw("key size wrong") - } - if t.elem.size > maxValueSize && (!t.indirectvalue || t.valuesize != uint8(sys.PtrSize)) || - t.elem.size <= maxValueSize && (t.indirectvalue || t.valuesize != uint8(t.elem.size)) { - throw("value size wrong") - } - - // invariants we depend on. We should probably check these at compile time - // somewhere, but for now we'll do it here. 
- if t.key.align > bucketCnt { - throw("key align too big") - } - if t.elem.align > bucketCnt { - throw("value align too big") - } - if t.key.size%uintptr(t.key.align) != 0 { - throw("key size not a multiple of key align") - } - if t.elem.size%uintptr(t.elem.align) != 0 { - throw("value size not a multiple of value align") - } - if bucketCnt < 8 { - throw("bucketsize too small for proper alignment") - } - if dataOffset%uintptr(t.key.align) != 0 { - throw("need padding in bucket (key)") - } - if dataOffset%uintptr(t.elem.align) != 0 { - throw("need padding in bucket (value)") + // initialize Hmap + if h == nil { + h = (*hmap)(newobject(t.hmap)) } + h.hash0 = fastrand() // find size parameter which will hold the requested # of elements B := uint8(0) - for ; overLoadFactor(hint, B); B++ { + for overLoadFactor(hint, B) { + B++ } + h.B = B // allocate initial hash table // if B == 0, the buckets field is allocated lazily later (in mapassign) // If hint is large zeroing this memory could take a while. - buckets := bucket - var extra *mapextra - if B != 0 { + if h.B != 0 { var nextOverflow *bmap - buckets, nextOverflow = makeBucketArray(t, B) + h.buckets, nextOverflow = makeBucketArray(t, h.B) if nextOverflow != nil { - extra = new(mapextra) - extra.nextOverflow = nextOverflow + h.extra = new(mapextra) + h.extra.nextOverflow = nextOverflow } } - // initialize Hmap - if h == nil { - h = (*hmap)(newobject(t.hmap)) - } - h.count = 0 - h.B = B - h.extra = extra - h.flags = 0 - h.hash0 = fastrand() - h.buckets = buckets - h.oldbuckets = nil - h.nevacuate = 0 - h.noverflow = 0 - return h } @@ -353,7 +357,7 @@ func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap { // hold onto it for very long. func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() pc := funcPC(mapaccess1) racereadpc(unsafe.Pointer(h), callerpc, pc) raceReadObjectPC(t.key, key, callerpc, pc) @@ -370,7 +374,7 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { hashfn := t.key.hashfn equalfn := t.key.equalfn hash := hashfn(key, uintptr(h.hash0)) - m := uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { continue @@ -403,16 +404,13 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { return v } } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]) - } } + return unsafe.Pointer(&zeroVal[0]) } func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() pc := funcPC(mapaccess2) racereadpc(unsafe.Pointer(h), callerpc, pc) raceReadObjectPC(t.key, key, callerpc, pc) @@ -429,7 +427,7 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) hashfn := t.key.hashfn equalfn := t.key.equalfn hash := hashfn(key, uintptr(h.hash0)) - m := uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { continue @@ -462,11 +457,8 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) return v, true } } - b = b.overflow(t) - if b == nil 
{ - return unsafe.Pointer(&zeroVal[0]), false - } } + return unsafe.Pointer(&zeroVal[0]), false } // returns both key and value. Used by map iterator @@ -477,7 +469,7 @@ func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe hashfn := t.key.hashfn equalfn := t.key.equalfn hash := hashfn(key, uintptr(h.hash0)) - m := uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { continue @@ -510,11 +499,8 @@ func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe return k, v } } - b = b.overflow(t) - if b == nil { - return nil, nil - } } + return nil, nil } func mapaccess1_fat(t *maptype, h *hmap, key, zero unsafe.Pointer) unsafe.Pointer { @@ -539,7 +525,7 @@ func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { panic(plainError("assignment to entry in nil map")) } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() pc := funcPC(mapassign) racewritepc(unsafe.Pointer(h), callerpc, pc) raceReadObjectPC(t.key, key, callerpc, pc) @@ -559,19 +545,16 @@ func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { h.flags |= hashWriting if h.buckets == nil { - h.buckets = newarray(t.bucket, 1) + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) } again: - bucket := hash & (uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } + top := tophash(hash) var inserti *uint8 var insertk unsafe.Pointer @@ -611,7 +594,7 @@ again: // If we hit the max load factor or we have too many overflow buckets, // and we're not already in the middle of growing, start growing. - if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { hashGrow(t, h) goto again // Growing the table invalidates everything, so try again } @@ -651,7 +634,7 @@ done: func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() pc := funcPC(mapdelete) racewritepc(unsafe.Pointer(h), callerpc, pc) raceReadObjectPC(t.key, key, callerpc, pc) @@ -674,16 +657,14 @@ func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { // in which case we have not actually done a write (delete). h.flags |= hashWriting - bucket := hash & (uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + top := tophash(hash) +search: + for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { continue @@ -696,53 +677,58 @@ func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { if !equalfn(key, k2) { continue } + // Only clear key if there are pointers in it. if t.indirectkey { *(*unsafe.Pointer)(k) = nil - } else { - typedmemclr(t.key, k) + } else if t.key.kind&kindNoPointers == 0 { + memclrHasPointers(k, t.key.size) } - v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*uintptr(t.keysize) + i*uintptr(t.valuesize)) - if t.indirectvalue { - *(*unsafe.Pointer)(v) = nil - } else { - typedmemclr(t.elem, v) + // Only clear value if there are pointers in it. 
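+		// (When the value is stored indirectly, the slot holds a pointer
+		// to the value, so it has to be cleared as well.)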
+ if t.indirectvalue || t.elem.kind&kindNoPointers == 0 { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize)) + if t.indirectvalue { + *(*unsafe.Pointer)(v) = nil + } else { + memclrHasPointers(v, t.elem.size) + } } b.tophash[i] = empty h.count-- - goto done - } - b = b.overflow(t) - if b == nil { - goto done + break search } } -done: if h.flags&hashWriting == 0 { throw("concurrent map writes") } h.flags &^= hashWriting } +// mapiterinit initializes the hiter struct used for ranging over maps. +// The hiter struct pointed to by 'it' is allocated on the stack +// by the compilers order pass or on the heap by reflect_mapiterinit. +// Both need to have zeroed hiter since the struct contains pointers. +// Gccgo-specific: *it need not be zeroed by the compiler, +// and it's cheaper to zero it here. func mapiterinit(t *maptype, h *hmap, it *hiter) { - // Clear pointer fields so garbage collector does not complain. it.key = nil it.value = nil it.t = nil it.h = nil it.buckets = nil it.bptr = nil - it.overflow[0] = nil - it.overflow[1] = nil + it.overflow = nil + it.oldoverflow = nil + it.wrapped = false + it.i = 0 + it.checkBucket = 0 if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapiterinit)) } if h == nil || h.count == 0 { - it.key = nil - it.value = nil return } @@ -762,6 +748,7 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) { // while we are iterating. h.createOverflow() it.overflow = h.extra.overflow + it.oldoverflow = h.extra.oldoverflow } // decide where to start @@ -769,16 +756,14 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) { if h.B > 31-bucketCntBits { r += uintptr(fastrand()) << 31 } - it.startBucket = r & (uintptr(1)<> h.B & (bucketCnt - 1)) // iterator state it.bucket = it.startBucket - it.wrapped = false - it.bptr = nil // Remember we have an iterator. - // Can run concurrently with another hash_iter_init(). + // Can run concurrently with another mapiterinit(). if old := h.flags; old&(iterator|oldIterator) != iterator|oldIterator { atomic.Or8(&h.flags, iterator|oldIterator) } @@ -789,7 +774,7 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) { func mapiternext(it *hiter) { h := it.h if raceenabled { - callerpc := getcallerpc(unsafe.Pointer( /* &it */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapiternext)) } if h.flags&hashWriting != 0 { @@ -829,7 +814,7 @@ next: checkBucket = noCheck } bucket++ - if bucket == uintptr(1)<>(it.B-1) != uintptr(b.tophash[offi]&1) { - continue - } - } - } - if b.tophash[offi] != evacuatedX && b.tophash[offi] != evacuatedY { - // this is the golden data, we can return it. - if t.indirectkey { - k = *((*unsafe.Pointer)(k)) - } - it.key = k - if t.indirectvalue { - v = *((*unsafe.Pointer)(v)) + if checkBucket != noCheck && !h.sameSizeGrow() { + // Special case: iterator was started during a grow to a larger size + // and the grow is not done yet. We're working on a bucket whose + // oldbucket has not been evacuated yet. Or at least, it wasn't + // evacuated when we started the bucket. So we're iterating + // through the oldbucket, skipping any keys that will go + // to the other new bucket (each oldbucket expands to two + // buckets during a grow). + if t.reflexivekey || equalfn(k, k) { + // If the item in the oldbucket is not destined for + // the current new bucket in the iteration, skip it. 
+ hash := hashfn(k, uintptr(h.hash0)) + if hash&bucketMask(it.B) != checkBucket { + continue } - it.value = v } else { - // The hash table has grown since the iterator was started. - // The golden data for this key is now somewhere else. - k2 := k - if t.indirectkey { - k2 = *((*unsafe.Pointer)(k2)) - } - if t.reflexivekey || equalfn(k2, k2) { - // Check the current hash table for the data. - // This code handles the case where the key - // has been deleted, updated, or deleted and reinserted. - // NOTE: we need to regrab the key as it has potentially been - // updated to an equal() but not identical key (e.g. +0.0 vs -0.0). - rk, rv := mapaccessK(t, h, k2) - if rk == nil { - continue // key has been deleted - } - it.key = rk - it.value = rv - } else { - // if key!=key then the entry can't be deleted or - // updated, so we can just return it. That's lucky for - // us because when key!=key we can't look it up - // successfully in the current table. - it.key = k2 - if t.indirectvalue { - v = *((*unsafe.Pointer)(v)) - } - it.value = v + // Hash isn't repeatable if k != k (NaNs). We need a + // repeatable and randomish choice of which direction + // to send NaNs during evacuation. We'll use the low + // bit of tophash to decide which way NaNs go. + // NOTE: this case is why we need two evacuate tophash + // values, evacuatedX and evacuatedY, that differ in + // their low bit. + if checkBucket>>(it.B-1) != uintptr(b.tophash[offi]&1) { + continue } } - it.bucket = bucket - if it.bptr != b { // avoid unnecessary write barrier; see issue 14921 - it.bptr = b + } + if (b.tophash[offi] != evacuatedX && b.tophash[offi] != evacuatedY) || + !(t.reflexivekey || equalfn(k, k)) { + // This is the golden data, we can return it. + // OR + // key!=key, so the entry can't be deleted or updated, so we can just return it. + // That's lucky for us because when key!=key we can't look it up successfully. + it.key = k + if t.indirectvalue { + v = *((*unsafe.Pointer)(v)) } - it.i = i + 1 - it.checkBucket = checkBucket - return + it.value = v + } else { + // The hash table has grown since the iterator was started. + // The golden data for this key is now somewhere else. + // Check the current hash table for the data. + // This code handles the case where the key + // has been deleted, updated, or deleted and reinserted. + // NOTE: we need to regrab the key as it has potentially been + // updated to an equal() but not identical key (e.g. +0.0 vs -0.0). + rk, rv := mapaccessK(t, h, k) + if rk == nil { + continue // key has been deleted + } + it.key = rk + it.value = rv } + it.bucket = bucket + if it.bptr != b { // avoid unnecessary write barrier; see issue 14921 + it.bptr = b + } + it.i = i + 1 + it.checkBucket = checkBucket + return } b = b.overflow(t) i = 0 @@ -928,7 +898,7 @@ next: } func makeBucketArray(t *maptype, b uint8) (buckets unsafe.Pointer, nextOverflow *bmap) { - base := uintptr(1 << b) + base := bucketShift(b) nbuckets := base // For small b, overflow buckets are unlikely. // Avoid the overhead of the calculation. @@ -936,7 +906,7 @@ func makeBucketArray(t *maptype, b uint8) (buckets unsafe.Pointer, nextOverflow // Add on the estimated number of overflow buckets // required to insert the median number of elements // used with this value of b. 
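
The rewritten lookup, assign, delete and iterator paths in these hunks all lean on three small helpers, tophash, bucketShift and bucketMask, which this patch introduces earlier in hashmap.go (outside this excerpt). A minimal standalone sketch of what they compute, reconstructed from the inline expressions they replace; the minTopHash value is an assumption chosen to sit above the empty/evacuated markers used here:

package main

import (
	"fmt"
	"unsafe"
)

const (
	minTopHash = 4                         // assumed: smallest tophash byte of a normally filled cell
	ptrSize    = unsafe.Sizeof(uintptr(0)) // stand-in for sys.PtrSize
)

// bucketShift returns 1<<b, the bucket count for a map with h.B == b.
func bucketShift(b uint8) uintptr { return uintptr(1) << b }

// bucketMask returns 1<<b - 1; hash & bucketMask(h.B) picks a bucket,
// replacing the old inline "m := uintptr(1)<<h.B - 1" sequences.
func bucketMask(b uint8) uintptr { return bucketShift(b) - 1 }

// tophash keeps the top byte of a hash and bumps it above the range
// reserved for marker values, exactly what the deleted per-call-site
// "top := uint8(hash >> (sys.PtrSize*8 - 8)); if top < minTopHash { ... }"
// code used to do.
func tophash(hash uintptr) uint8 {
	top := uint8(hash >> (ptrSize*8 - 8))
	if top < minTopHash {
		top += minTopHash
	}
	return top
}

func main() {
	hash := uintptr(0x9e3779b9)
	fmt.Println("bucket:", hash&bucketMask(5), "tophash:", tophash(hash))
}
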
- nbuckets += 1 << (b - 4) + nbuckets += bucketShift(b - 4) sz := t.bucket.size * nbuckets up := roundupsize(sz) if up != sz { @@ -962,7 +932,7 @@ func hashGrow(t *maptype, h *hmap) { // Otherwise, there are too many overflow buckets, // so keep the same number of buckets and "grow" laterally. bigger := uint8(1) - if !overLoadFactor(int64(h.count), h.B) { + if !overLoadFactor(h.count+1, h.B) { bigger = 0 h.flags |= sameSizeGrow } @@ -981,13 +951,13 @@ func hashGrow(t *maptype, h *hmap) { h.nevacuate = 0 h.noverflow = 0 - if h.extra != nil && h.extra.overflow[0] != nil { + if h.extra != nil && h.extra.overflow != nil { // Promote current overflow buckets to the old generation. - if h.extra.overflow[1] != nil { - throw("overflow is not nil") + if h.extra.oldoverflow != nil { + throw("oldoverflow is not nil") } - h.extra.overflow[1] = h.extra.overflow[0] - h.extra.overflow[0] = nil + h.extra.oldoverflow = h.extra.overflow + h.extra.overflow = nil } if nextOverflow != nil { if h.extra == nil { @@ -1001,9 +971,8 @@ func hashGrow(t *maptype, h *hmap) { } // overLoadFactor reports whether count items placed in 1<= bucketCnt && float32(count) >= loadFactor*float32((uint64(1)< bucketCnt && uintptr(count) > loadFactorNum*(bucketShift(B)/loadFactorDen) } // tooManyOverflowBuckets reports whether noverflow buckets is too many for a map with 1<= uint16(1)< 15 { + B = 15 } - return noverflow >= 1<<15 + // The compiler doesn't see here that B < 16; mask B to generate shorter shift code. + return noverflow >= uint16(1)<<(B&15) } // growing reports whether h is growing. The growth may be to the same size or bigger. @@ -1036,7 +1006,7 @@ func (h *hmap) noldbuckets() uintptr { if !h.sameSizeGrow() { oldB-- } - return uintptr(1) << oldB + return bucketShift(oldB) } // oldbucketmask provides a mask that can be applied to calculate n % noldbuckets(). @@ -1060,33 +1030,38 @@ func bucketEvacuated(t *maptype, h *hmap, bucket uintptr) bool { return evacuated(b) } +// evacDst is an evacuation destination. +type evacDst struct { + b *bmap // current destination bucket + i int // key/val index into b + k unsafe.Pointer // pointer to current key storage + v unsafe.Pointer // pointer to current value storage +} + func evacuate(t *maptype, h *hmap, oldbucket uintptr) { b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) newbit := h.noldbuckets() hashfn := t.key.hashfn - equalfn := t.key.equalfn if !evacuated(b) { // TODO: reuse overflow buckets instead of using new ones, if there // is no iterator using the old buckets. (If !oldIterator.) - var ( - x, y *bmap // current low/high buckets in new map - xi, yi int // key/val indices into x and y - xk, yk unsafe.Pointer // pointers to current x and y key storage - xv, yv unsafe.Pointer // pointers to current x and y value storage - ) - x = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) - xi = 0 - xk = add(unsafe.Pointer(x), dataOffset) - xv = add(xk, bucketCnt*uintptr(t.keysize)) + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.v = add(x.k, bucketCnt*uintptr(t.keysize)) + if !h.sameSizeGrow() { // Only calculate y pointers if we're growing bigger. // Otherwise GC can see bad pointers. 
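
overLoadFactor now takes the element count as an int (so callers can ask about h.count+1 before inserting) and replaces the float32 comparison with pure integer arithmetic. The loadFactorNum and loadFactorDen constants live earlier in the file and are not shown in this excerpt; the sketch below assumes the 13/2 split, which encodes the same 6.5 average entries per 8-slot bucket as the old code, and replays the makemap sizing loop to show the thresholds:

package main

import "fmt"

const (
	bucketCnt     = 8  // slots per bucket
	loadFactorNum = 13 // assumed numerator: 13/2 == 6.5, the old float load factor
	loadFactorDen = 2  // assumed denominator
)

func bucketShift(b uint8) uintptr { return uintptr(1) << b }

// overLoadFactor, integer-only form: true once count exceeds 6.5 * 2^B.
// For B == 0 the right-hand side degenerates, but count > bucketCnt already
// covers that case, since a single bucket holds up to 8 entries.
func overLoadFactor(count int, B uint8) bool {
	return count > bucketCnt && uintptr(count) > loadFactorNum*(bucketShift(B)/loadFactorDen)
}

func main() {
	// Replay the makemap sizing loop for a few hints.
	for _, hint := range []int{8, 9, 13, 14, 26} {
		B := uint8(0)
		for overLoadFactor(hint, B) {
			B++
		}
		fmt.Printf("hint %2d -> B=%d (%d buckets)\n", hint, B, 1<<B)
	}
}

The printed bucket counts (1, 2, 2, 4, 4) match the mapBucketTests table that map_test.go further down expects.
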
- y = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) - yi = 0 - yk = add(unsafe.Pointer(y), dataOffset) - yv = add(yk, bucketCnt*uintptr(t.keysize)) + y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.v = add(y.k, bucketCnt*uintptr(t.keysize)) } + for ; b != nil; b = b.overflow(t) { k := add(unsafe.Pointer(b), dataOffset) v := add(k, bucketCnt*uintptr(t.keysize)) @@ -1103,122 +1078,102 @@ func evacuate(t *maptype, h *hmap, oldbucket uintptr) { if t.indirectkey { k2 = *((*unsafe.Pointer)(k2)) } - useX := true + var useY uint8 if !h.sameSizeGrow() { // Compute hash to make our evacuation decision (whether we need // to send this key/value to bucket x or bucket y). hash := hashfn(k2, uintptr(h.hash0)) - if h.flags&iterator != 0 { - if !t.reflexivekey && !equalfn(k2, k2) { - // If key != key (NaNs), then the hash could be (and probably - // will be) entirely different from the old hash. Moreover, - // it isn't reproducible. Reproducibility is required in the - // presence of iterators, as our evacuation decision must - // match whatever decision the iterator made. - // Fortunately, we have the freedom to send these keys either - // way. Also, tophash is meaningless for these kinds of keys. - // We let the low bit of tophash drive the evacuation decision. - // We recompute a new random tophash for the next level so - // these keys will get evenly distributed across all buckets - // after multiple grows. - if top&1 != 0 { - hash |= newbit - } else { - hash &^= newbit - } - top = uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } + if h.flags&iterator != 0 && !t.reflexivekey && !t.key.equalfn(k2, k2) { + // If key != key (NaNs), then the hash could be (and probably + // will be) entirely different from the old hash. Moreover, + // it isn't reproducible. Reproducibility is required in the + // presence of iterators, as our evacuation decision must + // match whatever decision the iterator made. + // Fortunately, we have the freedom to send these keys either + // way. Also, tophash is meaningless for these kinds of keys. + // We let the low bit of tophash drive the evacuation decision. + // We recompute a new random tophash for the next level so + // these keys will get evenly distributed across all buckets + // after multiple grows. 
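
The freedom described in the comment above is visible from ordinary Go code: a NaN key can be inserted but never looked up or deleted again, so no later operation can contradict whichever half the evacuation sends it to. A small user-level illustration of that property:

package main

import (
	"fmt"
	"math"
)

func main() {
	m := map[float64]int{}
	nan := math.NaN()

	m[nan] = 1
	m[nan] = 2 // NaN != NaN, so this inserts a second, distinct entry

	fmt.Println(len(m)) // 2
	_, ok := m[nan]     // a lookup can never match a NaN key already in the map
	fmt.Println(ok)     // false
	delete(m, nan)      // and a delete can never remove one
	fmt.Println(len(m)) // still 2
}
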
+ useY = top & 1 + top = tophash(hash) + } else { + if hash&newbit != 0 { + useY = 1 } } - useX = hash&newbit == 0 } - if useX { - b.tophash[i] = evacuatedX - if xi == bucketCnt { - newx := h.newoverflow(t, x) - x = newx - xi = 0 - xk = add(unsafe.Pointer(x), dataOffset) - xv = add(xk, bucketCnt*uintptr(t.keysize)) - } - x.tophash[xi] = top - if t.indirectkey { - *(*unsafe.Pointer)(xk) = k2 // copy pointer - } else { - typedmemmove(t.key, xk, k) // copy value - } - if t.indirectvalue { - *(*unsafe.Pointer)(xv) = *(*unsafe.Pointer)(v) - } else { - typedmemmove(t.elem, xv, v) - } - xi++ - xk = add(xk, uintptr(t.keysize)) - xv = add(xv, uintptr(t.valuesize)) + + if evacuatedX+1 != evacuatedY { + throw("bad evacuatedN") + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.v = add(dst.k, bucketCnt*uintptr(t.keysize)) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + if t.indirectkey { + *(*unsafe.Pointer)(dst.k) = k2 // copy pointer } else { - b.tophash[i] = evacuatedY - if yi == bucketCnt { - newy := h.newoverflow(t, y) - y = newy - yi = 0 - yk = add(unsafe.Pointer(y), dataOffset) - yv = add(yk, bucketCnt*uintptr(t.keysize)) - } - y.tophash[yi] = top - if t.indirectkey { - *(*unsafe.Pointer)(yk) = k2 - } else { - typedmemmove(t.key, yk, k) - } - if t.indirectvalue { - *(*unsafe.Pointer)(yv) = *(*unsafe.Pointer)(v) - } else { - typedmemmove(t.elem, yv, v) - } - yi++ - yk = add(yk, uintptr(t.keysize)) - yv = add(yv, uintptr(t.valuesize)) + typedmemmove(t.key, dst.k, k) // copy value } + if t.indirectvalue { + *(*unsafe.Pointer)(dst.v) = *(*unsafe.Pointer)(v) + } else { + typedmemmove(t.elem, dst.v, v) + } + dst.i++ + // These updates might push these pointers past the end of the + // key or value arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, uintptr(t.keysize)) + dst.v = add(dst.v, uintptr(t.valuesize)) } } // Unlink the overflow buckets & clear key/value to help GC. - if h.flags&oldIterator == 0 { - b = (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + if h.flags&oldIterator == 0 && t.bucket.kind&kindNoPointers == 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) // Preserve b.tophash because the evacuation // state is maintained there. - if t.bucket.kind&kindNoPointers == 0 { - memclrHasPointers(add(unsafe.Pointer(b), dataOffset), uintptr(t.bucketsize)-dataOffset) - } else { - memclrNoHeapPointers(add(unsafe.Pointer(b), dataOffset), uintptr(t.bucketsize)-dataOffset) - } + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) } } - // Advance evacuation mark if oldbucket == h.nevacuate { - h.nevacuate = oldbucket + 1 - // Experiments suggest that 1024 is overkill by at least an order of magnitude. - // Put it in there as a safeguard anyway, to ensure O(1) behavior. - stop := h.nevacuate + 1024 - if stop > newbit { - stop = newbit - } - for h.nevacuate != stop && bucketEvacuated(t, h, h.nevacuate) { - h.nevacuate++ - } - if h.nevacuate == newbit { // newbit == # of oldbuckets - // Growing is all done. Free old main bucket array. - h.oldbuckets = nil - // Can discard old overflow buckets as well. 
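
evacuate itself now only moves one old bucket; the bookkeeping that used to be inlined at the bottom has been factored into advanceEvacuationMark, and the callers (growWork here, growWork_fast32/64/faststr below) always evacuate the old bucket they are about to touch plus one more, which bounds how long a grow can stay in flight. A toy model of that protocol with all real map machinery stripped out; names and sizes here are illustrative only:

package main

import "fmt"

// toyMap models just the grow bookkeeping: which old buckets are already
// evacuated and how far the nevacuate mark has advanced.
type toyMap struct {
	nOld      int
	evacuated []bool
	nevacuate int
}

// growWork mirrors the runtime pattern: evacuate the old bucket that the
// current write maps to, then one more bucket to guarantee progress.
func (h *toyMap) growWork(bucket int) {
	h.evacuate(bucket % h.nOld)
	if h.nevacuate < h.nOld {
		h.evacuate(h.nevacuate)
	}
}

func (h *toyMap) evacuate(b int) {
	h.evacuated[b] = true
	// advanceEvacuationMark: slide the mark past everything already done.
	for h.nevacuate < h.nOld && h.evacuated[h.nevacuate] {
		h.nevacuate++
	}
}

func main() {
	h := &toyMap{nOld: 8, evacuated: make([]bool, 8)}
	writes := 0
	for h.nevacuate < h.nOld {
		h.growWork(3) // even if every write hits the same bucket...
		writes++
	}
	fmt.Println("grow finished after", writes, "writes") // ...at most nOld writes
}
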
- // If they are still referenced by an iterator, - // then the iterator holds a pointers to the slice. - if h.extra != nil { - h.extra.overflow[1] = nil - } - h.flags &^= sameSizeGrow + advanceEvacuationMark(h, t, newbit) + } +} + +func advanceEvacuationMark(h *hmap, t *maptype, newbit uintptr) { + h.nevacuate++ + // Experiments suggest that 1024 is overkill by at least an order of magnitude. + // Put it in there as a safeguard anyway, to ensure O(1) behavior. + stop := h.nevacuate + 1024 + if stop > newbit { + stop = newbit + } + for h.nevacuate != stop && bucketEvacuated(t, h, h.nevacuate) { + h.nevacuate++ + } + if h.nevacuate == newbit { // newbit == # of oldbuckets + // Growing is all done. Free old main bucket array. + h.oldbuckets = nil + // Can discard old overflow buckets as well. + // If they are still referenced by an iterator, + // then the iterator holds a pointers to the slice. + if h.extra != nil { + h.extra.oldoverflow = nil } + h.flags &^= sameSizeGrow } } @@ -1230,7 +1185,45 @@ func ismapkey(t *_type) bool { //go:linkname reflect_makemap reflect.makemap func reflect_makemap(t *maptype, cap int) *hmap { - return makemap(t, int64(cap), nil, nil) + // Check invariants and reflects math. + if sz := unsafe.Sizeof(hmap{}); sz != t.hmap.size { + println("runtime: sizeof(hmap) =", sz, ", t.hmap.size =", t.hmap.size) + throw("bad hmap size") + } + if !ismapkey(t.key) { + throw("runtime.reflect_makemap: unsupported map key type") + } + if t.key.size > maxKeySize && (!t.indirectkey || t.keysize != uint8(sys.PtrSize)) || + t.key.size <= maxKeySize && (t.indirectkey || t.keysize != uint8(t.key.size)) { + throw("key size wrong") + } + if t.elem.size > maxValueSize && (!t.indirectvalue || t.valuesize != uint8(sys.PtrSize)) || + t.elem.size <= maxValueSize && (t.indirectvalue || t.valuesize != uint8(t.elem.size)) { + throw("value size wrong") + } + if t.key.align > bucketCnt { + throw("key align too big") + } + if t.elem.align > bucketCnt { + throw("value align too big") + } + if t.key.size%uintptr(t.key.align) != 0 { + throw("key size not a multiple of key align") + } + if t.elem.size%uintptr(t.elem.align) != 0 { + throw("value size not a multiple of value align") + } + if bucketCnt < 8 { + throw("bucketsize too small for proper alignment") + } + if dataOffset%uintptr(t.key.align) != 0 { + throw("need padding in bucket (key)") + } + if dataOffset%uintptr(t.elem.align) != 0 { + throw("need padding in bucket (value)") + } + + return makemap(t, cap, nil) } //go:linkname reflect_mapaccess reflect.mapaccess @@ -1277,7 +1270,7 @@ func reflect_maplen(h *hmap) int { return 0 } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer( /* &h */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(reflect_maplen)) } return h.count diff --git a/libgo/go/runtime/hashmap_fast.go b/libgo/go/runtime/hashmap_fast.go index bec8fda..e0fc981 100644 --- a/libgo/go/runtime/hashmap_fast.go +++ b/libgo/go/runtime/hashmap_fast.go @@ -11,7 +11,7 @@ import ( func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast32)) } if h == nil || h.count == 0 { @@ -26,7 +26,7 @@ func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { b = (*bmap)(h.buckets) } else { hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) - m := uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < 
minTopHash { - top += minTopHash - } - for { - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x != top { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] != top { continue } if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)) } } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]) - } } + return unsafe.Pointer(&zeroVal[0]) } func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_faststr)) } if h == nil || h.count == 0 { @@ -329,13 +275,9 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { b := (*bmap)(h.buckets) if key.len < 32 { // short key, doing lots of comparisons is ok - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] == empty { continue } if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { @@ -346,13 +288,9 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { } // long key, try not to do more comparisons than necessary keymaybe := uintptr(bucketCnt) - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] == empty { continue } if k.str == key.str { @@ -382,7 +320,7 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { } dohash: hash := t.key.hashfn(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) - m := uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] != top { + continue + } + if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)), true + } + } + } + return unsafe.Pointer(&zeroVal[0]), false +} + +func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { + if h == nil { + panic(plainError("assignment to entry in nil map")) + } + if raceenabled { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast32)) + } + if h.flags&hashWriting != 0 { + throw("concurrent 
map writes") + } + hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + + // Set hashWriting after calling alg.hash for consistency with mapassign. + h.flags |= hashWriting + + if h.buckets == nil { + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) + } + +again: + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_fast32(t, h, bucket) } + b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + + var insertb *bmap + var inserti uintptr + var insertk unsafe.Pointer + for { for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x != top { + if b.tophash[i] == empty { + if insertb == nil { + inserti = i + insertb = b + } continue } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) + if k != key { continue } - if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { - return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)), true - } + inserti = i + insertb = b + goto done } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]), false + ovf := b.overflow(t) + if ovf == nil { + break } + b = ovf + } + + // Did not find mapping for key. Allocate new cell & add entry. + + // If we hit the max load factor or we have too many overflow buckets, + // and we're not already in the middle of growing, start growing. + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again + } + + if insertb == nil { + // all current buckets are full, allocate a new one. 
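
mapassign_fast32 above, together with the _fast64, _faststr and the new _fast32ptr/_fast64ptr variants that follow, are the specialized entry points the compiler picks when the key is exactly 4 bytes, 8 bytes, a pointer of that size, or a string; every other key type keeps going through the generic mapassign. Which helper a given map uses is the frontend's decision, so the mapping below is descriptive rather than something user code can observe directly:

package main

import "fmt"

func main() {
	a := map[uint32]string{} // 4-byte scalar key: the _fast32 helpers
	b := map[uint64]string{} // 8-byte scalar key: the _fast64 helpers
	c := map[string]int{}    // string key: the _faststr helpers
	d := map[[2]int64]int{}  // anything else: generic mapassign/mapaccess

	a[1] = "x"
	b[1] = "y"
	c["k"] = 3
	d[[2]int64{1, 2}] = 4
	fmt.Println(len(a), len(b), len(c), len(d))
}
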
+ insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*4) + // store new key at insert position + *(*uint32)(insertk) = key + + h.count++ + +done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*4+inserti*uintptr(t.valuesize)) + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting + return val } -func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { +func mapassign_fast32ptr(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { if h == nil { panic(plainError("assignment to entry in nil map")) } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast32)) } if h.flags&hashWriting != 0 { @@ -436,38 +450,35 @@ func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { h.flags |= hashWriting if h.buckets == nil { - h.buckets = newarray(t.bucket, 1) + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) } again: - bucket := hash & (uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - var inserti *uint8 + var insertb *bmap + var inserti uintptr var insertk unsafe.Pointer - var val unsafe.Pointer + for { for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { - if b.tophash[i] == empty && inserti == nil { - inserti = &b.tophash[i] - insertk = add(unsafe.Pointer(b), dataOffset+i*4) - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) + if b.tophash[i] == empty { + if insertb == nil { + inserti = i + insertb = b } continue } - k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) + k := *((*unsafe.Pointer)(add(unsafe.Pointer(b), dataOffset+i*4))) if k != key { continue } - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) + inserti = i + insertb = b goto done } ovf := b.overflow(t) @@ -481,25 +492,26 @@ again: // If we hit the max load factor or we have too many overflow buckets, // and we're not already in the middle of growing, start growing. - if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { hashGrow(t, h) goto again // Growing the table invalidates everything, so try again } - if inserti == nil { + if insertb == nil { // all current buckets are full, allocate a new one. 
- newb := h.newoverflow(t, b) - inserti = &newb.tophash[0] - insertk = add(unsafe.Pointer(newb), dataOffset) - val = add(insertk, bucketCnt*4) + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*4) + // store new key at insert position + *(*unsafe.Pointer)(insertk) = key - // store new key/value at insert position - typedmemmove(t.key, insertk, unsafe.Pointer(&key)) - *inserti = top h.count++ done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*4+inserti*uintptr(t.valuesize)) if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -512,7 +524,7 @@ func mapassign_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { panic(plainError("assignment to entry in nil map")) } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast64)) } if h.flags&hashWriting != 0 { @@ -524,30 +536,26 @@ func mapassign_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { h.flags |= hashWriting if h.buckets == nil { - h.buckets = newarray(t.bucket, 1) + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) } again: - bucket := hash & (uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - var inserti *uint8 + var insertb *bmap + var inserti uintptr var insertk unsafe.Pointer - var val unsafe.Pointer + for { for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { - if b.tophash[i] == empty && inserti == nil { - inserti = &b.tophash[i] - insertk = add(unsafe.Pointer(b), dataOffset+i*8) - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) + if b.tophash[i] == empty { + if insertb == nil { + insertb = b + inserti = i } continue } @@ -555,7 +563,8 @@ again: if k != key { continue } - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) + insertb = b + inserti = i goto done } ovf := b.overflow(t) @@ -569,25 +578,26 @@ again: // If we hit the max load factor or we have too many overflow buckets, // and we're not already in the middle of growing, start growing. - if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { hashGrow(t, h) goto again // Growing the table invalidates everything, so try again } - if inserti == nil { + if insertb == nil { // all current buckets are full, allocate a new one. 
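
The stores of the form tophash[inserti&(bucketCnt-1)] in these assign paths come with a comment about avoiding bounds checks: the index is already known to be below bucketCnt, but masking it with the power-of-two-minus-one is what lets the compiler prove that and drop the check on the hot path. A standalone sketch of the pattern, using the same bucketCnt of 8:

package main

import "fmt"

const bucketCnt = 8 // must stay a power of two for the mask trick to work

// setTop mirrors the masked tophash stores in the assign fast paths: the
// mask pins the index into [0, bucketCnt), so the compiled store needs no
// bounds check even though the index was produced by a loop the compiler
// cannot see through at this point.
func setTop(tophash *[bucketCnt]uint8, i uintptr, top uint8) {
	tophash[i&(bucketCnt-1)] = top
}

func main() {
	var th [bucketCnt]uint8
	setTop(&th, 5, 42)
	fmt.Println(th) // [0 0 0 0 0 42 0 0]
}
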
- newb := h.newoverflow(t, b) - inserti = &newb.tophash[0] - insertk = add(unsafe.Pointer(newb), dataOffset) - val = add(insertk, bucketCnt*8) + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*8) + // store new key at insert position + *(*uint64)(insertk) = key - // store new key/value at insert position - typedmemmove(t.key, insertk, unsafe.Pointer(&key)) - *inserti = top h.count++ done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*8+inserti*uintptr(t.valuesize)) if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -595,48 +605,131 @@ done: return val } -func mapassign_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { +func mapassign_fast64ptr(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { if h == nil { panic(plainError("assignment to entry in nil map")) } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&t)) - racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_faststr)) + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast64)) } if h.flags&hashWriting != 0 { throw("concurrent map writes") } - key := stringStructOf(&ky) - hash := t.key.hashfn(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) + hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. h.flags |= hashWriting if h.buckets == nil { - h.buckets = newarray(t.bucket, 1) + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) } again: - bucket := hash & (uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash + + var insertb *bmap + var inserti uintptr + var insertk unsafe.Pointer + + for { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] == empty { + if insertb == nil { + insertb = b + inserti = i + } + continue + } + k := *((*unsafe.Pointer)(add(unsafe.Pointer(b), dataOffset+i*8))) + if k != key { + continue + } + insertb = b + inserti = i + goto done + } + ovf := b.overflow(t) + if ovf == nil { + break + } + b = ovf + } + + // Did not find mapping for key. Allocate new cell & add entry. + + // If we hit the max load factor or we have too many overflow buckets, + // and we're not already in the middle of growing, start growing. + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again } - var inserti *uint8 + if insertb == nil { + // all current buckets are full, allocate a new one. 
+ insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti + } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*8) + // store new key at insert position + *(*unsafe.Pointer)(insertk) = key + + h.count++ + +done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*8+inserti*uintptr(t.valuesize)) + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting + return val +} + +func mapassign_faststr(t *maptype, h *hmap, s string) unsafe.Pointer { + if h == nil { + panic(plainError("assignment to entry in nil map")) + } + if raceenabled { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_faststr)) + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + key := stringStructOf(&s) + hash := t.key.hashfn(noescape(unsafe.Pointer(&s)), uintptr(h.hash0)) + + // Set hashWriting after calling alg.hash for consistency with mapassign. + h.flags |= hashWriting + + if h.buckets == nil { + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) + } + +again: + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_faststr(t, h, bucket) + } + b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + top := tophash(hash) + + var insertb *bmap + var inserti uintptr var insertk unsafe.Pointer - var val unsafe.Pointer + for { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { - if b.tophash[i] == empty && inserti == nil { - inserti = &b.tophash[i] - insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize)) + if b.tophash[i] == empty && insertb == nil { + insertb = b + inserti = i } continue } @@ -648,7 +741,8 @@ again: continue } // already have a mapping for key. Update it. - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)) + inserti = i + insertb = b goto done } ovf := b.overflow(t) @@ -662,25 +756,25 @@ again: // If we hit the max load factor or we have too many overflow buckets, // and we're not already in the middle of growing, start growing. - if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { hashGrow(t, h) goto again // Growing the table invalidates everything, so try again } - if inserti == nil { + if insertb == nil { // all current buckets are full, allocate a new one. 
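
mapassign_fast32ptr and mapassign_fast64ptr are new in this patch: when the key happens to be a single pointer, the key must be written with a pointer store the garbage collector can observe, whereas the plain _fast32/_fast64 paths store raw integers. Whether a particular map uses the ptr variant is up to the compiler; as an assumption for illustration, a pointer key on a 64-bit target is expected to take the fast64ptr path and a uint64 key the plain fast64 path:

package main

import "fmt"

type node struct{ id int }

func main() {
	byPtr := map[*node]string{} // pointer-shaped key: expected to use the *ptr assign variants
	byID := map[uint64]string{} // 8-byte scalar key: expected to use mapassign_fast64

	n := &node{id: 1}
	byPtr[n] = "keyed by pointer"
	byID[1] = "keyed by integer"
	fmt.Println(byPtr[n], byID[1])
}
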
- newb := h.newoverflow(t, b) - inserti = &newb.tophash[0] - insertk = add(unsafe.Pointer(newb), dataOffset) - val = add(insertk, bucketCnt*2*sys.PtrSize) + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti } + insertb.tophash[inserti&(bucketCnt-1)] = top // mask inserti to avoid bounds checks - // store new key/value at insert position + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*2*sys.PtrSize) + // store new key at insert position *((*stringStruct)(insertk)) = *key - *inserti = top h.count++ done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*2*sys.PtrSize+inserti*uintptr(t.valuesize)) if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -690,7 +784,7 @@ done: func mapdelete_fast32(t *maptype, h *hmap, key uint32) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_fast32)) } if h == nil || h.count == 0 { @@ -705,38 +799,32 @@ func mapdelete_fast32(t *maptype, h *hmap, key uint32) { // Set hashWriting after calling alg.hash for consistency with mapdelete h.flags |= hashWriting - bucket := hash & (uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash + growWork_fast32(t, h, bucket) } - for { - for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) +search: + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { + if key != *(*uint32)(k) || b.tophash[i] == empty { continue } - k := (*uint32)(add(unsafe.Pointer(b), dataOffset+i*4)) - if key != *k { - continue + // Only clear key if there are pointers in it. + if t.key.kind&kindNoPointers == 0 { + memclrHasPointers(k, t.key.size) + } + // Only clear value if there are pointers in it. + if t.elem.kind&kindNoPointers == 0 { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) + memclrHasPointers(v, t.elem.size) } - typedmemclr(t.key, unsafe.Pointer(k)) - v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*4 + i*uintptr(t.valuesize)) - typedmemclr(t.elem, v) b.tophash[i] = empty h.count-- - goto done - } - b = b.overflow(t) - if b == nil { - goto done + break search } } -done: if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -745,7 +833,7 @@ done: func mapdelete_fast64(t *maptype, h *hmap, key uint64) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_fast64)) } if h == nil || h.count == 0 { @@ -760,38 +848,32 @@ func mapdelete_fast64(t *maptype, h *hmap, key uint64) { // Set hashWriting after calling alg.hash for consistency with mapdelete h.flags |= hashWriting - bucket := hash & (uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { - for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) +search: + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { + if key != *(*uint64)(k) || b.tophash[i] == empty { continue } - k := (*uint64)(add(unsafe.Pointer(b), dataOffset+i*8)) - if key != *k { - continue + // Only clear key if there are pointers in it. 
+ if t.key.kind&kindNoPointers == 0 { + memclrHasPointers(k, t.key.size) + } + // Only clear value if there are pointers in it. + if t.elem.kind&kindNoPointers == 0 { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) + memclrHasPointers(v, t.elem.size) } - typedmemclr(t.key, unsafe.Pointer(k)) - v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*8 + i*uintptr(t.valuesize)) - typedmemclr(t.elem, v) b.tophash[i] = empty h.count-- - goto done - } - b = b.overflow(t) - if b == nil { - goto done + break search } } -done: if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -800,7 +882,7 @@ done: func mapdelete_faststr(t *maptype, h *hmap, ky string) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_faststr)) } if h == nil || h.count == 0 { @@ -816,43 +898,340 @@ func mapdelete_faststr(t *maptype, h *hmap, ky string) { // Set hashWriting after calling alg.hash for consistency with mapdelete h.flags |= hashWriting - bucket := hash & (uintptr(1)<> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { - for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + growWork_faststr(t, h, bucket) + } + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + top := tophash(hash) +search: + for ; b != nil; b = b.overflow(t) { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] != top { continue } if k.str != key.str && !memequal(k.str, key.str, uintptr(key.len)) { continue } - typedmemclr(t.key, unsafe.Pointer(k)) - v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*2*sys.PtrSize + i*uintptr(t.valuesize)) - typedmemclr(t.elem, v) + // Clear key's pointer. + k.str = nil + // Only clear value if there are pointers in it. + if t.elem.kind&kindNoPointers == 0 { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)) + memclrHasPointers(v, t.elem.size) + } b.tophash[i] = empty h.count-- - goto done - } - b = b.overflow(t) - if b == nil { - goto done + break search } } -done: if h.flags&hashWriting == 0 { throw("concurrent map writes") } h.flags &^= hashWriting } + +func growWork_fast32(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate_fast32(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate_fast32(t, h, h.nevacuate) + } +} + +func evacuate_fast32(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.v = add(x.k, bucketCnt*4) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. 
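
In all three delete fast paths the store b.tophash[i] = empty is what actually deletes the entry; key and value bytes are only scrubbed when they contain pointers, purely so the garbage collector does not keep the referents alive. mapdelete_faststr above goes one step further and nils just the key's data pointer, leaving the stale length behind. A toy model of that slot layout; the two-word stringStruct shape is an assumption mirrored from how k.str and k.len are used in the hunk:

package main

import (
	"fmt"
	"unsafe"
)

// Assumed runtime string header layout: data pointer followed by length.
type stringStruct struct {
	str unsafe.Pointer
	len int
}

// One key/value cell of a string-keyed bucket, reduced to what matters here.
type slot struct {
	tophash uint8
	key     stringStruct
	value   int
}

const empty = 0 // the tophash marker meaning "this cell is unused"

// deleteSlot mirrors mapdelete_faststr: drop the key's data pointer so the
// GC can free the string bytes, leave the length alone, and let the empty
// tophash byte be the thing that marks the cell as deleted.
func deleteSlot(s *slot) {
	s.key.str = nil
	s.tophash = empty
}

func main() {
	k := "hello"
	s := slot{tophash: 9, key: *(*stringStruct)(unsafe.Pointer(&k)), value: 1}
	deleteSlot(&s)
	fmt.Println(s.tophash, s.key.str, s.key.len) // 0 <nil> 5: the stale length is harmless
}
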
+ y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.v = add(y.k, bucketCnt*4) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + v := add(k, bucketCnt*4) + for i := 0; i < bucketCnt; i, k, v = i+1, add(k, 4), add(v, uintptr(t.valuesize)) { + top := b.tophash[i] + if top == empty { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/value to bucket x or bucket y). + hash := t.key.hashfn(k, uintptr(h.hash0)) + if hash&newbit != 0 { + useY = 1 + } + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY, enforced in makemap + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.v = add(dst.k, bucketCnt*4) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + + // Copy key. + if sys.PtrSize == 4 && t.key.kind&kindNoPointers == 0 && writeBarrier.enabled { + writebarrierptr((*uintptr)(dst.k), *(*uintptr)(k)) + } else { + *(*uint32)(dst.k) = *(*uint32)(k) + } + + typedmemmove(t.elem, dst.v, v) + dst.i++ + // These updates might push these pointers past the end of the + // key or value arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, 4) + dst.v = add(dst.v, uintptr(t.valuesize)) + } + } + // Unlink the overflow buckets & clear key/value to help GC. + if h.flags&oldIterator == 0 && t.bucket.kind&kindNoPointers == 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} + +func growWork_fast64(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate_fast64(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate_fast64(t, h, h.nevacuate) + } +} + +func evacuate_fast64(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.v = add(x.k, bucketCnt*8) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. 
+ y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.v = add(y.k, bucketCnt*8) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + v := add(k, bucketCnt*8) + for i := 0; i < bucketCnt; i, k, v = i+1, add(k, 8), add(v, uintptr(t.valuesize)) { + top := b.tophash[i] + if top == empty { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/value to bucket x or bucket y). + hash := t.key.hashfn(k, uintptr(h.hash0)) + if hash&newbit != 0 { + useY = 1 + } + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY, enforced in makemap + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.v = add(dst.k, bucketCnt*8) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + + // Copy key. + if t.key.kind&kindNoPointers == 0 && writeBarrier.enabled { + if sys.PtrSize == 8 { + writebarrierptr((*uintptr)(dst.k), *(*uintptr)(k)) + } else { + // There are three ways to squeeze at least one 32 bit pointer into 64 bits. + // Give up and call typedmemmove. + typedmemmove(t.key, dst.k, k) + } + } else { + *(*uint64)(dst.k) = *(*uint64)(k) + } + + typedmemmove(t.elem, dst.v, v) + dst.i++ + // These updates might push these pointers past the end of the + // key or value arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, 8) + dst.v = add(dst.v, uintptr(t.valuesize)) + } + } + // Unlink the overflow buckets & clear key/value to help GC. + if h.flags&oldIterator == 0 && t.bucket.kind&kindNoPointers == 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} + +func growWork_faststr(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate_faststr(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate_faststr(t, h, h.nevacuate) + } +} + +func evacuate_faststr(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.v = add(x.k, bucketCnt*2*sys.PtrSize) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. 
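
Every evacuate variant recomputes the full hash before choosing between the x and y destinations: the cached tophash byte only holds the top 8 bits of the hash, while the deciding bit is hash & newbit, the new highest bucket bit. A standalone sketch of that decision, with constants picked purely for illustration:

package main

import "fmt"

func bucketShift(b uint8) uintptr { return uintptr(1) << b }

func main() {
	// Growing from B to B+1 doubles the bucket array. An entry that lived in
	// old bucket i ends up in new bucket i (the x half) or i+newbit (the y
	// half), depending on the one hash bit that the old mask ignored.
	const oldB = 3
	newbit := bucketShift(oldB) // 8 old buckets, so newbit == 8
	hash := uintptr(0xDEADBEEF)

	oldBucket := hash & (bucketShift(oldB) - 1)
	useY := hash&newbit != 0
	newBucket := oldBucket
	if useY {
		newBucket += newbit
	}
	fmt.Printf("old bucket %d -> new bucket %d (useY=%v)\n", oldBucket, newBucket, useY)
}
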
+ y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.v = add(y.k, bucketCnt*2*sys.PtrSize) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + v := add(k, bucketCnt*2*sys.PtrSize) + for i := 0; i < bucketCnt; i, k, v = i+1, add(k, 2*sys.PtrSize), add(v, uintptr(t.valuesize)) { + top := b.tophash[i] + if top == empty { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/value to bucket x or bucket y). + hash := t.key.hashfn(k, uintptr(h.hash0)) + if hash&newbit != 0 { + useY = 1 + } + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY, enforced in makemap + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.v = add(dst.k, bucketCnt*2*sys.PtrSize) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + + // Copy key. + *(*string)(dst.k) = *(*string)(k) + + typedmemmove(t.elem, dst.v, v) + dst.i++ + // These updates might push these pointers past the end of the + // key or value arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, 2*sys.PtrSize) + dst.v = add(dst.v, uintptr(t.valuesize)) + } + } + // Unlink the overflow buckets & clear key/value to help GC. + // Unlink the overflow buckets & clear key/value to help GC. + if h.flags&oldIterator == 0 && t.bucket.kind&kindNoPointers == 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} diff --git a/libgo/go/runtime/heapdump.go b/libgo/go/runtime/heapdump.go index 166199b..a4b168d 100644 --- a/libgo/go/runtime/heapdump.go +++ b/libgo/go/runtime/heapdump.go @@ -200,7 +200,6 @@ func dumptype(t *_type) { // dump an object func dumpobj(obj unsafe.Pointer, size uintptr, bv bitvector) { - dumpbvtypes(&bv, obj) dumpint(tagObject) dumpint(uint64(uintptr(obj))) dumpmemrange(obj, size) @@ -539,16 +538,6 @@ func dumpfields(bv bitvector) { dumpint(fieldKindEol) } -// The heap dump reader needs to be able to disambiguate -// Eface entries. So it needs to know every type that might -// appear in such an entry. The following routine accomplishes that. -// TODO(rsc, khr): Delete - no longer possible. - -// Dump all the types that appear in the type field of -// any Eface described by this bit vector. -func dumpbvtypes(bv *bitvector, base unsafe.Pointer) { -} - func makeheapobjbv(p uintptr, size uintptr) bitvector { // Extend the temp buffer if necessary. nptr := size / sys.PtrSize diff --git a/libgo/go/runtime/internal/atomic/atomic_test.go b/libgo/go/runtime/internal/atomic/atomic_test.go index 879a82f..b697aa8 100644 --- a/libgo/go/runtime/internal/atomic/atomic_test.go +++ b/libgo/go/runtime/internal/atomic/atomic_test.go @@ -52,7 +52,7 @@ func TestXadduintptr(t *testing.T) { // Tests that xadduintptr correctly updates 64-bit values. 
The place where // we actually do so is mstats.go, functions mSysStat{Inc,Dec}. func TestXadduintptrOnUint64(t *testing.T) { - if sys.BigEndian != 0 { + if sys.BigEndian { // On big endian architectures, we never use xadduintptr to update // 64-bit values and hence we skip the test. (Note that functions // mSysStat{Inc,Dec} in mstats.go have explicit checks for diff --git a/libgo/go/runtime/internal/sys/sys.go b/libgo/go/runtime/internal/sys/sys.go index 586a763..9d9ac45 100644 --- a/libgo/go/runtime/internal/sys/sys.go +++ b/libgo/go/runtime/internal/sys/sys.go @@ -6,9 +6,9 @@ // constants used by the runtime. package sys -// The next line makes 'go generate' write the zgen_*.go files with +// The next line makes 'go generate' write the zgo*.go files with // per-OS and per-arch information, including constants -// named goos_$GOOS and goarch_$GOARCH for every +// named Goos$GOOS and Goarch$GOARCH for every // known GOOS and GOARCH. The constant is 1 on the // current system, 0 otherwise; multiplying by them is // useful for defining GOOS- or GOARCH-specific constants. diff --git a/libgo/go/runtime/lock_sema.go b/libgo/go/runtime/lock_sema.go index 52a2376..d000b11 100644 --- a/libgo/go/runtime/lock_sema.go +++ b/libgo/go/runtime/lock_sema.go @@ -83,7 +83,7 @@ Loop: // for this lock, chained through m->nextwaitm. // Queue this M. for { - gp.m.nextwaitm = v &^ mutex_locked + gp.m.nextwaitm = muintptr(v &^ mutex_locked) if atomic.Casuintptr(&l.key, v, uintptr(unsafe.Pointer(gp.m))|mutex_locked) { break } @@ -115,8 +115,8 @@ func unlock(l *mutex) { } else { // Other M's are waiting for the lock. // Dequeue an M. - mp = (*m)(unsafe.Pointer(v &^ mutex_locked)) - if atomic.Casuintptr(&l.key, v, mp.nextwaitm) { + mp = muintptr(v &^ mutex_locked).ptr() + if atomic.Casuintptr(&l.key, v, uintptr(mp.nextwaitm)) { // Dequeued an M. Wake it. semawakeup(mp) break @@ -152,7 +152,7 @@ func notewakeup(n *note) { case v == 0: // Nothing was waiting. Done. case v == mutex_locked: - // Two notewakeups! Not allowed. + // Two notewakeups! Not allowed. throw("notewakeup - double wakeup") default: // Must be the waiting m. Wake it up. diff --git a/libgo/go/runtime/malloc.go b/libgo/go/runtime/malloc.go index 796cd8a..88e4ba3 100644 --- a/libgo/go/runtime/malloc.go +++ b/libgo/go/runtime/malloc.go @@ -546,9 +546,8 @@ func nextFreeFast(s *mspan) gclinkptr { } s.allocCache >>= uint(theBit + 1) s.freeindex = freeidx - v := gclinkptr(result*s.elemsize + s.base()) s.allocCount++ - return v + return gclinkptr(result*s.elemsize + s.base()) } } return 0 @@ -877,6 +876,9 @@ func reflect_unsafe_New(typ *_type) unsafe.Pointer { // newarray allocates an array of n elements of type typ. func newarray(typ *_type, n int) unsafe.Pointer { + if n == 1 { + return mallocgc(typ.size, typ, true) + } if n < 0 || uintptr(n) > maxSliceCap(typ.size) { panic(plainError("runtime: allocation size out of range")) } @@ -893,11 +895,13 @@ func profilealloc(mp *m, x unsafe.Pointer, size uintptr) { mProf_Malloc(x, size) } -// nextSample returns the next sampling point for heap profiling. -// It produces a random variable with a geometric distribution and -// mean MemProfileRate. This is done by generating a uniformly -// distributed random number and applying the cumulative distribution -// function for an exponential. +// nextSample returns the next sampling point for heap profiling. 
The goal is +// to sample allocations on average every MemProfileRate bytes, but with a +// completely random distribution over the allocation timeline; this +// corresponds to a Poisson process with parameter MemProfileRate. In Poisson +// processes, the distance between two samples follows the exponential +// distribution (exp(MemProfileRate)), so the best return value is a random +// number taken from an exponential distribution whose mean is MemProfileRate. func nextSample() int32 { if GOOS == "plan9" { // Plan 9 doesn't support floating point in note handler. @@ -906,25 +910,29 @@ func nextSample() int32 { } } - period := MemProfileRate + return fastexprand(MemProfileRate) +} - // make nextSample not overflow. Maximum possible step is - // -ln(1/(1< 0x7000000: - period = 0x7000000 - case period == 0: + case mean > 0x7000000: + mean = 0x7000000 + case mean == 0: return 0 } - // Let m be the sample rate, - // the probability distribution function is m*exp(-mx), so the CDF is - // p = 1 - exp(-mx), so - // q = 1 - p == exp(-mx) - // log_e(q) = -mx - // -log_e(q)/m = x - // x = -log_e(q) * period - // x = log_2(q) * (-log_e(2)) * period ; Using log_2 for efficiency + // Take a random sample of the exponential distribution exp(-mean*x). + // The probability distribution function is mean*exp(-mean*x), so the CDF is + // p = 1 - exp(-mean*x), so + // q = 1 - p == exp(-mean*x) + // log_e(q) = -mean*x + // -log_e(q)/mean = x + // x = -log_e(q) * mean + // x = log_2(q) * (-log_e(2)) * mean ; Using log_2 for efficiency const randomBitCount = 26 q := fastrand()%(1<= maxBlock { - return sysAlloc(size, sysStat) + return (*notInHeap)(sysAlloc(size, sysStat)) } mp := acquirem() @@ -1011,7 +1019,7 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { } persistent.off = round(persistent.off, align) if persistent.off+size > chunk || persistent.base == nil { - persistent.base = sysAlloc(chunk, &memstats.other_sys) + persistent.base = (*notInHeap)(sysAlloc(chunk, &memstats.other_sys)) if persistent.base == nil { if persistent == &globalAlloc.persistentAlloc { unlock(&globalAlloc.mutex) @@ -1020,7 +1028,7 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { } persistent.off = 0 } - p := add(persistent.base, persistent.off) + p := persistent.base.add(persistent.off) persistent.off += size releasem(mp) if persistent == &globalAlloc.persistentAlloc { @@ -1033,3 +1041,19 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { } return p } + +// notInHeap is off-heap memory allocated by a lower-level allocator +// like sysAlloc or persistentAlloc. +// +// In general, it's better to use real types marked as go:notinheap, +// but this serves as a generic type for situations where that isn't +// possible (like in the allocators). +// +// TODO: Use this as the return type of sysAlloc, persistentAlloc, etc? +// +//go:notinheap +type notInHeap struct{} + +func (p *notInHeap) add(bytes uintptr) *notInHeap { + return (*notInHeap)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + bytes)) +} diff --git a/libgo/go/runtime/malloc_test.go b/libgo/go/runtime/malloc_test.go index 0d43cf6..ab580f8 100644 --- a/libgo/go/runtime/malloc_test.go +++ b/libgo/go/runtime/malloc_test.go @@ -48,9 +48,6 @@ func TestMemStats(t *testing.T) { } // Of the uint fields, HeapReleased, HeapIdle can be 0. // PauseTotalNs can be 0 if timer resolution is poor. - // - // TODO: Test that GCCPUFraction is <= 0.99. This currently - // fails on windows/386. 
(Issue #19319) fields := map[string][]func(interface{}) error{ "Alloc": {nz, le(1e10)}, "TotalAlloc": {nz, le(1e11)}, "Sys": {nz, le(1e10)}, "Lookups": {nz, le(1e10)}, "Mallocs": {nz, le(1e10)}, "Frees": {nz, le(1e10)}, @@ -63,7 +60,7 @@ func TestMemStats(t *testing.T) { "NextGC": {nz, le(1e10)}, "LastGC": {nz}, "PauseTotalNs": {le(1e11)}, "PauseNs": nil, "PauseEnd": nil, "NumGC": {nz, le(1e9)}, "NumForcedGC": {nz, le(1e9)}, - "GCCPUFraction": nil, "EnableGC": {eq(true)}, "DebugGC": {eq(false)}, + "GCCPUFraction": {le(0.99)}, "EnableGC": {eq(true)}, "DebugGC": {eq(false)}, "BySize": nil, } diff --git a/libgo/go/runtime/map_test.go b/libgo/go/runtime/map_test.go index 37c959f..6d7097e 100644 --- a/libgo/go/runtime/map_test.go +++ b/libgo/go/runtime/map_test.go @@ -249,7 +249,7 @@ func testConcurrentReadsAfterGrowth(t *testing.T, useReflect bool) { numGrowStep := 250 numReader := 16 if testing.Short() { - numLoop, numGrowStep = 2, 500 + numLoop, numGrowStep = 2, 100 } for i := 0; i < numLoop; i++ { m := make(map[int]int, 0) @@ -603,6 +603,142 @@ func TestIgnoreBogusMapHint(t *testing.T) { } } +var mapSink map[int]int + +var mapBucketTests = [...]struct { + n int // n is the number of map elements + noescape int // number of expected buckets for non-escaping map + escape int // number of expected buckets for escaping map +}{ + {-(1 << 30), 1, 1}, + {-1, 1, 1}, + {0, 1, 1}, + {1, 1, 1}, + {8, 1, 1}, + {9, 2, 2}, + {13, 2, 2}, + {14, 4, 4}, + {26, 4, 4}, +} + +func TestMapBuckets(t *testing.T) { + // Test that maps of different sizes have the right number of buckets. + // Non-escaping maps with small buckets (like map[int]int) never + // have a nil bucket pointer due to starting with preallocated buckets + // on the stack. Escaping maps start with a non-nil bucket pointer if + // hint size is above bucketCnt and thereby have more than one bucket. + // These tests depend on bucketCnt and loadFactor* in hashmap.go. + t.Run("mapliteral", func(t *testing.T) { + for _, tt := range mapBucketTests { + localMap := map[int]int{} + // Skip test on gccgo until escape analysis is + // turned on. + if runtime.MapBucketsPointerIsNil(localMap) && runtime.Compiler != "gccgo" { + t.Errorf("no escape: buckets pointer is nil for non-escaping map") + } + for i := 0; i < tt.n; i++ { + localMap[i] = i + } + if got := runtime.MapBucketsCount(localMap); got != tt.noescape { + t.Errorf("no escape: n=%d want %d buckets, got %d", tt.n, tt.noescape, got) + } + escapingMap := map[int]int{} + if count := runtime.MapBucketsCount(escapingMap); count > 1 && runtime.MapBucketsPointerIsNil(escapingMap) { + t.Errorf("escape: buckets pointer is nil for n=%d buckets", count) + } + for i := 0; i < tt.n; i++ { + escapingMap[i] = i + } + if got := runtime.MapBucketsCount(escapingMap); got != tt.escape { + t.Errorf("escape n=%d want %d buckets, got %d", tt.n, tt.escape, got) + } + mapSink = escapingMap + } + }) + t.Run("nohint", func(t *testing.T) { + for _, tt := range mapBucketTests { + localMap := make(map[int]int) + // Skip test on gccgo until escape analysis is + // turned on. 
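A rough, standalone check of where the expected bucket counts in mapBucketTests come from, assuming bucketCnt = 8 and the 6.5 load factor from hashmap.go (this sketch is not part of the patch; the constants are copied by hand):

package main

import "fmt"

// expectedBuckets mirrors the sizing rule: keep doubling the bucket count
// (1<<B) while the map holds more than 8 entries and averages more than 6.5
// entries per bucket.
func expectedBuckets(n int) int {
	b := uint(0)
	for n > 8 && float64(n) > 6.5*float64(int(1)<<b) {
		b++
	}
	return 1 << b
}

func main() {
	for _, n := range []int{0, 1, 8, 9, 13, 14, 26} {
		fmt.Println(n, expectedBuckets(n)) // matches the noescape/escape columns in the table above
	}
}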
+ if runtime.MapBucketsPointerIsNil(localMap) && runtime.Compiler != "gccgo" { + t.Errorf("no escape: buckets pointer is nil for non-escaping map") + } + for i := 0; i < tt.n; i++ { + localMap[i] = i + } + if got := runtime.MapBucketsCount(localMap); got != tt.noescape { + t.Errorf("no escape: n=%d want %d buckets, got %d", tt.n, tt.noescape, got) + } + escapingMap := make(map[int]int) + if count := runtime.MapBucketsCount(escapingMap); count > 1 && runtime.MapBucketsPointerIsNil(escapingMap) { + t.Errorf("escape: buckets pointer is nil for n=%d buckets", count) + } + for i := 0; i < tt.n; i++ { + escapingMap[i] = i + } + if got := runtime.MapBucketsCount(escapingMap); got != tt.escape { + t.Errorf("escape: n=%d want %d buckets, got %d", tt.n, tt.escape, got) + } + mapSink = escapingMap + } + }) + t.Run("makemap", func(t *testing.T) { + for _, tt := range mapBucketTests { + localMap := make(map[int]int, tt.n) + // Skip test on gccgo until escape analysis is + // turned on. + if runtime.MapBucketsPointerIsNil(localMap) && runtime.Compiler != "gccgo" { + t.Errorf("no escape: buckets pointer is nil for non-escaping map") + } + for i := 0; i < tt.n; i++ { + localMap[i] = i + } + if got := runtime.MapBucketsCount(localMap); got != tt.noescape { + t.Errorf("no escape: n=%d want %d buckets, got %d", tt.n, tt.noescape, got) + } + escapingMap := make(map[int]int, tt.n) + if count := runtime.MapBucketsCount(escapingMap); count > 1 && runtime.MapBucketsPointerIsNil(escapingMap) { + t.Errorf("escape: buckets pointer is nil for n=%d buckets", count) + } + for i := 0; i < tt.n; i++ { + escapingMap[i] = i + } + if got := runtime.MapBucketsCount(escapingMap); got != tt.escape { + t.Errorf("escape: n=%d want %d buckets, got %d", tt.n, tt.escape, got) + } + mapSink = escapingMap + } + }) + t.Run("makemap64", func(t *testing.T) { + for _, tt := range mapBucketTests { + localMap := make(map[int]int, int64(tt.n)) + // Skip test on gccgo until escape analysis is + // turned on. 
+ if runtime.MapBucketsPointerIsNil(localMap) && runtime.Compiler != "gccgo" { + t.Errorf("no escape: buckets pointer is nil for non-escaping map") + } + for i := 0; i < tt.n; i++ { + localMap[i] = i + } + if got := runtime.MapBucketsCount(localMap); got != tt.noescape { + t.Errorf("no escape: n=%d want %d buckets, got %d", tt.n, tt.noescape, got) + } + escapingMap := make(map[int]int, tt.n) + if count := runtime.MapBucketsCount(escapingMap); count > 1 && runtime.MapBucketsPointerIsNil(escapingMap) { + t.Errorf("escape: buckets pointer is nil for n=%d buckets", count) + } + for i := 0; i < tt.n; i++ { + escapingMap[i] = i + } + if got := runtime.MapBucketsCount(escapingMap); got != tt.escape { + t.Errorf("escape: n=%d want %d buckets, got %d", tt.n, tt.escape, got) + } + mapSink = escapingMap + } + }) + +} + func benchmarkMapPop(b *testing.B, n int) { m := map[int]int{} for i := 0; i < b.N; i++ { @@ -624,15 +760,39 @@ func BenchmarkMapPop100(b *testing.B) { benchmarkMapPop(b, 100) } func BenchmarkMapPop1000(b *testing.B) { benchmarkMapPop(b, 1000) } func BenchmarkMapPop10000(b *testing.B) { benchmarkMapPop(b, 10000) } +var testNonEscapingMapVariable int = 8 + func TestNonEscapingMap(t *testing.T) { t.Skip("does not work on gccgo without better escape analysis") n := testing.AllocsPerRun(1000, func() { + m := map[int]int{} + m[0] = 0 + }) + if n != 0 { + t.Fatalf("mapliteral: want 0 allocs, got %v", n) + } + n = testing.AllocsPerRun(1000, func() { m := make(map[int]int) m[0] = 0 }) if n != 0 { - t.Fatalf("want 0 allocs, got %v", n) + t.Fatalf("no hint: want 0 allocs, got %v", n) + } + n = testing.AllocsPerRun(1000, func() { + m := make(map[int]int, 8) + m[0] = 0 + }) + if n != 0 { + t.Fatalf("with small hint: want 0 allocs, got %v", n) + } + n = testing.AllocsPerRun(1000, func() { + m := make(map[int]int, testNonEscapingMapVariable) + m[0] = 0 + }) + if n != 0 { + t.Fatalf("with variable hint: want 0 allocs, got %v", n) } + } func benchmarkMapAssignInt32(b *testing.B, n int) { @@ -643,12 +803,16 @@ func benchmarkMapAssignInt32(b *testing.B, n int) { } func benchmarkMapDeleteInt32(b *testing.B, n int) { - a := make(map[int32]int) - for i := 0; i < n*b.N; i++ { - a[int32(i)] = i - } + a := make(map[int32]int, n) b.ResetTimer() - for i := 0; i < n*b.N; i = i + n { + for i := 0; i < b.N; i++ { + if len(a) == 0 { + b.StopTimer() + for j := i; j < i+n; j++ { + a[int32(j)] = j + } + b.StartTimer() + } delete(a, int32(i)) } } @@ -661,12 +825,16 @@ func benchmarkMapAssignInt64(b *testing.B, n int) { } func benchmarkMapDeleteInt64(b *testing.B, n int) { - a := make(map[int64]int) - for i := 0; i < n*b.N; i++ { - a[int64(i)] = i - } + a := make(map[int64]int, n) b.ResetTimer() - for i := 0; i < n*b.N; i = i + n { + for i := 0; i < b.N; i++ { + if len(a) == 0 { + b.StopTimer() + for j := i; j < i+n; j++ { + a[int64(j)] = j + } + b.StartTimer() + } delete(a, int64(i)) } } @@ -684,17 +852,23 @@ func benchmarkMapAssignStr(b *testing.B, n int) { } func benchmarkMapDeleteStr(b *testing.B, n int) { - k := make([]string, n*b.N) - for i := 0; i < n*b.N; i++ { - k[i] = strconv.Itoa(i) - } - a := make(map[string]int) - for i := 0; i < n*b.N; i++ { - a[k[i]] = i + i2s := make([]string, n) + for i := 0; i < n; i++ { + i2s[i] = strconv.Itoa(i) } + a := make(map[string]int, n) b.ResetTimer() - for i := 0; i < n*b.N; i = i + n { - delete(a, k[i]) + k := 0 + for i := 0; i < b.N; i++ { + if len(a) == 0 { + b.StopTimer() + for j := 0; j < n; j++ { + a[i2s[j]] = j + } + k = i + b.StartTimer() + } + delete(a, i2s[i-k]) } 
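The rewritten delete benchmarks keep the map at a steady size n by stopping the timer and refilling whenever the map drains, so the b.N timed iterations measure only deletes rather than the old pre-fill of n*b.N entries. A self-contained sketch of that idiom (hypothetical benchmark, not from the patch):

package bench

import "testing"

// BenchmarkSetPop times only the delete; refills happen with the timer stopped.
func BenchmarkSetPop(b *testing.B) {
	const n = 1000
	set := make(map[int]struct{}, n)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if len(set) == 0 {
			b.StopTimer()
			for j := i; j < i+n; j++ {
				set[j] = struct{}{}
			}
			b.StartTimer()
		}
		delete(set, i)
	}
}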
} @@ -713,7 +887,7 @@ func BenchmarkMapAssign(b *testing.B) { } func BenchmarkMapDelete(b *testing.B) { - b.Run("Int32", runWith(benchmarkMapDeleteInt32, 1, 2, 4)) - b.Run("Int64", runWith(benchmarkMapDeleteInt64, 1, 2, 4)) - b.Run("Str", runWith(benchmarkMapDeleteStr, 1, 2, 4)) + b.Run("Int32", runWith(benchmarkMapDeleteInt32, 100, 1000, 10000)) + b.Run("Int64", runWith(benchmarkMapDeleteInt64, 100, 1000, 10000)) + b.Run("Str", runWith(benchmarkMapDeleteStr, 100, 1000, 10000)) } diff --git a/libgo/go/runtime/mbarrier.go b/libgo/go/runtime/mbarrier.go index d54016f0..3b8f714 100644 --- a/libgo/go/runtime/mbarrier.go +++ b/libgo/go/runtime/mbarrier.go @@ -189,6 +189,8 @@ func gcmarkwb_m(slot *uintptr, ptr uintptr) { func writebarrierptr_prewrite1(dst *uintptr, src uintptr) { mp := acquirem() if mp.inwb || mp.dying > 0 { + // We explicitly allow write barriers in startpanic_m, + // since we're going down anyway. Ignore them here. releasem(mp) return } @@ -244,6 +246,10 @@ func writebarrierptr_prewrite(dst *uintptr, src uintptr) { // typedmemmove copies a value of type t to dst from src. // Must be nosplit, see #16026. +// +// TODO: Perfect for go:nosplitrec since we can't have a safe point +// anywhere in the bulk barrier or memmove. +// //go:nosplit func typedmemmove(typ *_type, dst, src unsafe.Pointer) { if typ.kind&kindNoPointers == 0 { @@ -265,8 +271,8 @@ func typedmemmove(typ *_type, dst, src unsafe.Pointer) { //go:linkname reflect_typedmemmove reflect.typedmemmove func reflect_typedmemmove(typ *_type, dst, src unsafe.Pointer) { if raceenabled { - raceWriteObjectPC(typ, dst, getcallerpc(unsafe.Pointer(&typ)), funcPC(reflect_typedmemmove)) - raceReadObjectPC(typ, src, getcallerpc(unsafe.Pointer(&typ)), funcPC(reflect_typedmemmove)) + raceWriteObjectPC(typ, dst, getcallerpc(), funcPC(reflect_typedmemmove)) + raceReadObjectPC(typ, src, getcallerpc(), funcPC(reflect_typedmemmove)) } if msanenabled { msanwrite(dst, typ.size) @@ -310,8 +316,12 @@ func typedslicecopy(typ *_type, dst, src slice) int { dstp := dst.array srcp := src.array + // The compiler emits calls to typedslicecopy before + // instrumentation runs, so unlike the other copying and + // assignment operations, it's not instrumented in the calling + // code and needs its own instrumentation. if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&typ)) + callerpc := getcallerpc() pc := funcPC(slicecopy) racewriterangepc(dstp, uintptr(n)*typ.size, callerpc, pc) racereadrangepc(srcp, uintptr(n)*typ.size, callerpc, pc) @@ -329,41 +339,13 @@ func typedslicecopy(typ *_type, dst, src slice) int { // compiler only emits calls to typedslicecopy for types with pointers, // and growslice and reflect_typedslicecopy check for pointers // before calling typedslicecopy. - if !writeBarrier.needed { - memmove(dstp, srcp, uintptr(n)*typ.size) - return n + size := uintptr(n) * typ.size + if writeBarrier.needed { + bulkBarrierPreWrite(uintptr(dstp), uintptr(srcp), size) } - - systemstack(func() { - if uintptr(srcp) < uintptr(dstp) && uintptr(srcp)+uintptr(n)*typ.size > uintptr(dstp) { - // Overlap with src before dst. - // Copy backward, being careful not to move dstp/srcp - // out of the array they point into. - dstp = add(dstp, uintptr(n-1)*typ.size) - srcp = add(srcp, uintptr(n-1)*typ.size) - i := 0 - for { - typedmemmove(typ, dstp, srcp) - if i++; i >= n { - break - } - dstp = add(dstp, -typ.size) - srcp = add(srcp, -typ.size) - } - } else { - // Copy forward, being careful not to move dstp/srcp - // out of the array they point into. 
- i := 0 - for { - typedmemmove(typ, dstp, srcp) - if i++; i >= n { - break - } - dstp = add(dstp, typ.size) - srcp = add(srcp, typ.size) - } - } - }) + // See typedmemmove for a discussion of the race between the + // barrier and memmove. + memmove(dstp, srcp, size) return n } @@ -380,7 +362,7 @@ func reflect_typedslicecopy(elemType *_type, dst, src slice) int { size := uintptr(n) * elemType.size if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&elemType)) + callerpc := getcallerpc() pc := funcPC(reflect_typedslicecopy) racewriterangepc(dst.array, size, callerpc, pc) racereadrangepc(src.array, size, callerpc, pc) diff --git a/libgo/go/runtime/mbitmap.go b/libgo/go/runtime/mbitmap.go index d1a5820..a775b57 100644 --- a/libgo/go/runtime/mbitmap.go +++ b/libgo/go/runtime/mbitmap.go @@ -463,11 +463,6 @@ func heapBitsForObject(p, refBase, refOff uintptr, forStack bool) (base uintptr, return } -// prefetch the bits. -func (h heapBits) prefetch() { - prefetchnta(uintptr(unsafe.Pointer((h.bitp)))) -} - // next returns the heapBits describing the next pointer-sized word in memory. // That is, if h describes address p, h.next() describes p+ptrSize. // Note that next does not modify h. The caller must record the result. @@ -542,12 +537,13 @@ func (h heapBits) setCheckmarked(size uintptr) { atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift)) } -// bulkBarrierPreWrite executes writebarrierptr_prewrite1 +// bulkBarrierPreWrite executes a write barrier // for every pointer slot in the memory range [src, src+size), // using pointer/scalar information from [dst, dst+size). // This executes the write barriers necessary before a memmove. // src, dst, and size must be pointer-aligned. // The range [dst, dst+size) must lie within a single object. +// It does not perform the actual writes. // // As a special case, src == 0 indicates that this is being used for a // memclr. 
bulkBarrierPreWrite will pass 0 for the src of each write @@ -593,12 +589,15 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { return } + buf := &getg().m.p.ptr().wbBuf h := heapBitsForAddr(dst) if src == 0 { for i := uintptr(0); i < size; i += sys.PtrSize { if h.isPointer() { dstx := (*uintptr)(unsafe.Pointer(dst + i)) - writebarrierptr_prewrite1(dstx, 0) + if !buf.putFast(*dstx, 0) { + wbBufFlush(nil, 0) + } } h = h.next() } @@ -607,7 +606,9 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { if h.isPointer() { dstx := (*uintptr)(unsafe.Pointer(dst + i)) srcx := (*uintptr)(unsafe.Pointer(src + i)) - writebarrierptr_prewrite1(dstx, *srcx) + if !buf.putFast(*dstx, *srcx) { + wbBufFlush(nil, 0) + } } h = h.next() } @@ -627,6 +628,7 @@ func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) { bits = addb(bits, word/8) mask := uint8(1) << (word % 8) + buf := &getg().m.p.ptr().wbBuf for i := uintptr(0); i < size; i += sys.PtrSize { if mask == 0 { bits = addb(bits, 1) @@ -640,10 +642,14 @@ func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) { if *bits&mask != 0 { dstx := (*uintptr)(unsafe.Pointer(dst + i)) if src == 0 { - writebarrierptr_prewrite1(dstx, 0) + if !buf.putFast(*dstx, 0) { + wbBufFlush(nil, 0) + } } else { srcx := (*uintptr)(unsafe.Pointer(src + i)) - writebarrierptr_prewrite1(dstx, *srcx) + if !buf.putFast(*dstx, *srcx) { + wbBufFlush(nil, 0) + } } } mask <<= 1 diff --git a/libgo/go/runtime/mcache.go b/libgo/go/runtime/mcache.go index 71a2f22..766cfd1 100644 --- a/libgo/go/runtime/mcache.go +++ b/libgo/go/runtime/mcache.go @@ -96,7 +96,7 @@ func freemcache(c *mcache) { // Gets a span that has a free object in it and assigns it // to be the cached span for the given sizeclass. Returns this span. -func (c *mcache) refill(spc spanClass) *mspan { +func (c *mcache) refill(spc spanClass) { _g_ := getg() _g_.m.locks++ @@ -123,7 +123,6 @@ func (c *mcache) refill(spc spanClass) *mspan { c.alloc[spc] = s _g_.m.locks-- - return s } func (c *mcache) releaseAll() { diff --git a/libgo/go/runtime/mem_gccgo.go b/libgo/go/runtime/mem_gccgo.go index ea3e5eb..a087945 100644 --- a/libgo/go/runtime/mem_gccgo.go +++ b/libgo/go/runtime/mem_gccgo.go @@ -13,9 +13,10 @@ import ( // Functions called by C code. //go:linkname sysAlloc runtime.sysAlloc +//go:linkname sysFree runtime.sysFree //extern mmap -func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uintptr) unsafe.Pointer +func sysMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uintptr) unsafe.Pointer //extern munmap func munmap(addr unsafe.Pointer, length uintptr) int32 @@ -40,6 +41,14 @@ func init() { } } +func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uintptr) (unsafe.Pointer, int) { + p := sysMmap(addr, n, prot, flags, fd, off) + if uintptr(p) == _MAP_FAILED { + return nil, errno() + } + return p, 0 +} + // NOTE: vec must be just 1 byte long here. // Mincore returns ENOMEM if any of the pages are unmapped, // but we want to know that all of the pages are unmapped. @@ -75,31 +84,30 @@ func addrspace_free(v unsafe.Pointer, n uintptr) bool { return true } -func mmap_fixed(v unsafe.Pointer, n uintptr, prot, flags, fd int32, offset uintptr) unsafe.Pointer { - p := mmap(v, n, prot, flags, fd, offset) +func mmap_fixed(v unsafe.Pointer, n uintptr, prot, flags, fd int32, offset uintptr) (unsafe.Pointer, int) { + p, err := mmap(v, n, prot, flags, fd, offset) // On some systems, mmap ignores v without // MAP_FIXED, so retry if the address space is free. 
if p != v && addrspace_free(v, n) { - if uintptr(p) != _MAP_FAILED { + if err == 0 { munmap(p, n) } - p = mmap(v, n, prot, flags|_MAP_FIXED, fd, offset) + p, err = mmap(v, n, prot, flags|_MAP_FIXED, fd, offset) } - return p + return p, err } // Don't split the stack as this method may be invoked without a valid G, which // prevents us from allocating more stack. //go:nosplit func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { - p := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0) - if uintptr(p) == _MAP_FAILED { - errval := errno() - if errval == _EACCES { + p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0) + if err != 0 { + if err == _EACCES { print("runtime: mmap: access denied\n") exit(2) } - if errval == _EAGAIN { + if err == _EAGAIN { print("runtime: mmap: too much locked memory (check 'ulimit -l').\n") exit(2) } @@ -225,9 +233,9 @@ func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer { // if we can reserve at least 64K and check the assumption in SysMap. // Only user-mode Linux (UML) rejects these requests. if sys.PtrSize == 8 && uint64(n) > 1<<32 { - p := mmap_fixed(v, 64<<10, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0) - if p != v { - if uintptr(p) != _MAP_FAILED { + p, err := mmap_fixed(v, 64<<10, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0) + if p != v || err != 0 { + if err == 0 { munmap(p, 64<<10) } return nil @@ -237,8 +245,8 @@ func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer { return v } - p := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0) - if uintptr(p) == _MAP_FAILED { + p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0) + if err != 0 { return nil } *reserved = true @@ -259,12 +267,12 @@ func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) { // to do this - we do not on other platforms. flags |= _MAP_FIXED } - p := mmap_fixed(v, n, _PROT_READ|_PROT_WRITE, flags, mmapFD, 0) - if uintptr(p) == _MAP_FAILED && errno() == _ENOMEM { + p, err := mmap_fixed(v, n, _PROT_READ|_PROT_WRITE, flags, mmapFD, 0) + if err == _ENOMEM { throw("runtime: out of memory") } - if p != v { - print("runtime: address space conflict: map(", v, ") = ", p, "\n") + if p != v || err != 0 { + print("runtime: address space conflict: map(", v, ") = ", p, " (err ", err, ")\n") throw("runtime: address space conflict") } return @@ -275,11 +283,11 @@ func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) { // So always unmap first even if it is already unmapped. munmap(v, n) } - p := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, mmapFD, 0) - if uintptr(p) == _MAP_FAILED && errno() == _ENOMEM { + p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, mmapFD, 0) + if err == _ENOMEM { throw("runtime: out of memory") } - if p != v { + if p != v || err != 0 { throw("runtime: cannot map pages in arena address space") } } diff --git a/libgo/go/runtime/memmove_test.go b/libgo/go/runtime/memmove_test.go index 74b8753..62de604 100644 --- a/libgo/go/runtime/memmove_test.go +++ b/libgo/go/runtime/memmove_test.go @@ -9,6 +9,7 @@ import ( "encoding/binary" "fmt" "internal/race" + "internal/testenv" . 
"runtime" "testing" ) @@ -88,6 +89,10 @@ func TestMemmoveAlias(t *testing.T) { } func TestMemmoveLarge0x180000(t *testing.T) { + if testing.Short() && testenv.Builder() == "" { + t.Skip("-short") + } + t.Parallel() if race.Enabled { t.Skip("skipping large memmove test under race detector") @@ -96,6 +101,10 @@ func TestMemmoveLarge0x180000(t *testing.T) { } func TestMemmoveOverlapLarge0x120000(t *testing.T) { + if testing.Short() && testenv.Builder() == "" { + t.Skip("-short") + } + t.Parallel() if race.Enabled { t.Skip("skipping large memmove test under race detector") diff --git a/libgo/go/runtime/mfinal.go b/libgo/go/runtime/mfinal.go index 4353ee5..19573d8 100644 --- a/libgo/go/runtime/mfinal.go +++ b/libgo/go/runtime/mfinal.go @@ -419,11 +419,7 @@ func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) { return } -// Mark KeepAlive as noinline so that the current compiler will ensure -// that the argument is alive at the point of the function call. -// If it were inlined, it would disappear, and there would be nothing -// keeping the argument alive. Perhaps a future compiler will recognize -// runtime.KeepAlive specially and do something more efficient. +// Mark KeepAlive as noinline so that it is easily detectable as an intrinsic. //go:noinline // KeepAlive marks its argument as currently reachable. @@ -445,4 +441,11 @@ func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) { // Without the KeepAlive call, the finalizer could run at the start of // syscall.Read, closing the file descriptor before syscall.Read makes // the actual system call. -func KeepAlive(interface{}) {} +func KeepAlive(x interface{}) { + // Introduce a use of x that the compiler can't eliminate. + // This makes sure x is alive on entry. We need x to be alive + // on entry for "defer runtime.KeepAlive(x)"; see issue 21402. + if cgoAlwaysFalse { + println(x) + } +} diff --git a/libgo/go/runtime/mfinal_test.go b/libgo/go/runtime/mfinal_test.go index 38c2623..2086e42 100644 --- a/libgo/go/runtime/mfinal_test.go +++ b/libgo/go/runtime/mfinal_test.go @@ -254,3 +254,24 @@ var ( Foo2 = &Object2{} Foo1 = &Object1{} ) + +func TestDeferKeepAlive(t *testing.T) { + if *flagQuick { + t.Skip("-quick") + } + + // See issue 21402. + t.Parallel() + type T *int // needs to be a pointer base type to avoid tinyalloc and its never-finalized behavior. + x := new(T) + finRun := false + runtime.SetFinalizer(x, func(x *T) { + finRun = true + }) + defer runtime.KeepAlive(x) + runtime.GC() + time.Sleep(time.Second) + if finRun { + t.Errorf("finalizer ran prematurely") + } +} diff --git a/libgo/go/runtime/mgc.go b/libgo/go/runtime/mgc.go index 31c4be8..626f088 100644 --- a/libgo/go/runtime/mgc.go +++ b/libgo/go/runtime/mgc.go @@ -231,6 +231,24 @@ func setGCPercent(in int32) (out int32) { // Update pacing in response to gcpercent change. gcSetTriggerRatio(memstats.triggerRatio) unlock(&mheap_.lock) + + // If we just disabled GC, wait for any concurrent GC to + // finish so we always return with no GC running. + if in < 0 { + // Disable phase transitions. + lock(&work.sweepWaiters.lock) + if gcphase == _GCmark { + // GC is active. Wait until we reach sweeping. + gp := getg() + gp.schedlink = work.sweepWaiters.head + work.sweepWaiters.head.set(gp) + goparkunlock(&work.sweepWaiters.lock, "wait for GC cycle", traceEvGoBlock, 1) + } else { + // GC isn't active. 
+ unlock(&work.sweepWaiters.lock) + } + } + return out } @@ -300,10 +318,10 @@ const ( // gcMarkWorkerFractionalMode indicates that a P is currently // running the "fractional" mark worker. The fractional worker - // is necessary when GOMAXPROCS*gcGoalUtilization is not an - // integer. The fractional worker should run until it is + // is necessary when GOMAXPROCS*gcBackgroundUtilization is not + // an integer. The fractional worker should run until it is // preempted and will be scheduled to pick up the fractional - // part of GOMAXPROCS*gcGoalUtilization. + // part of GOMAXPROCS*gcBackgroundUtilization. gcMarkWorkerFractionalMode // gcMarkWorkerIdleMode indicates that a P is running the mark @@ -397,23 +415,18 @@ type gcControllerState struct { assistBytesPerWork float64 // fractionalUtilizationGoal is the fraction of wall clock - // time that should be spent in the fractional mark worker. - // For example, if the overall mark utilization goal is 25% - // and GOMAXPROCS is 6, one P will be a dedicated mark worker - // and this will be set to 0.5 so that 50% of the time some P - // is in a fractional mark worker. This is computed at the - // beginning of each cycle. + // time that should be spent in the fractional mark worker on + // each P that isn't running a dedicated worker. + // + // For example, if the utilization goal is 25% and there are + // no dedicated workers, this will be 0.25. If there goal is + // 25%, there is one dedicated worker, and GOMAXPROCS is 5, + // this will be 0.05 to make up the missing 5%. + // + // If this is zero, no fractional workers are needed. fractionalUtilizationGoal float64 _ [sys.CacheLineSize]byte - - // fractionalMarkWorkersNeeded is the number of fractional - // mark workers that need to be started. This is either 0 or - // 1. This is potentially updated atomically at every - // scheduling point (hence it gets its own cache line). - fractionalMarkWorkersNeeded int64 - - _ [sys.CacheLineSize]byte } // startCycle resets the GC controller's state and computes estimates @@ -454,23 +467,33 @@ func (c *gcControllerState) startCycle() { memstats.next_gc = memstats.heap_live + 1024*1024 } - // Compute the total mark utilization goal and divide it among - // dedicated and fractional workers. - totalUtilizationGoal := float64(gomaxprocs) * gcGoalUtilization - c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal) - c.fractionalUtilizationGoal = totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded) - if c.fractionalUtilizationGoal > 0 { - c.fractionalMarkWorkersNeeded = 1 + // Compute the background mark utilization goal. In general, + // this may not come out exactly. We round the number of + // dedicated workers so that the utilization is closest to + // 25%. For small GOMAXPROCS, this would introduce too much + // error, so we add fractional workers in that case. + totalUtilizationGoal := float64(gomaxprocs) * gcBackgroundUtilization + c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal + 0.5) + utilError := float64(c.dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1 + const maxUtilError = 0.3 + if utilError < -maxUtilError || utilError > maxUtilError { + // Rounding put us more than 30% off our goal. With + // gcBackgroundUtilization of 25%, this happens for + // GOMAXPROCS<=3 or GOMAXPROCS=6. Enable fractional + // workers to compensate. + if float64(c.dedicatedMarkWorkersNeeded) > totalUtilizationGoal { + // Too many dedicated workers. 
+ c.dedicatedMarkWorkersNeeded-- + } + c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)) / float64(gomaxprocs) } else { - c.fractionalMarkWorkersNeeded = 0 + c.fractionalUtilizationGoal = 0 } // Clear per-P state - for _, p := range &allp { - if p == nil { - break - } + for _, p := range allp { p.gcAssistTime = 0 + p.gcFractionalMarkTime = 0 } // Compute initial values for controls that are updated @@ -483,7 +506,7 @@ func (c *gcControllerState) startCycle() { work.initialHeapLive>>20, "->", memstats.next_gc>>20, " MB)", " workers=", c.dedicatedMarkWorkersNeeded, - "+", c.fractionalMarkWorkersNeeded, "\n") + "+", c.fractionalUtilizationGoal, "\n") } } @@ -496,47 +519,73 @@ func (c *gcControllerState) startCycle() { // is when assists are enabled and the necessary statistics are // available). func (c *gcControllerState) revise() { - // Compute the expected scan work remaining. + gcpercent := gcpercent + if gcpercent < 0 { + // If GC is disabled but we're running a forced GC, + // act like GOGC is huge for the below calculations. + gcpercent = 100000 + } + live := atomic.Load64(&memstats.heap_live) + + var heapGoal, scanWorkExpected int64 + if live <= memstats.next_gc { + // We're under the soft goal. Pace GC to complete at + // next_gc assuming the heap is in steady-state. + heapGoal = int64(memstats.next_gc) + + // Compute the expected scan work remaining. + // + // This is estimated based on the expected + // steady-state scannable heap. For example, with + // GOGC=100, only half of the scannable heap is + // expected to be live, so that's what we target. + // + // (This is a float calculation to avoid overflowing on + // 100*heap_scan.) + scanWorkExpected = int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent)) + } else { + // We're past the soft goal. Pace GC so that in the + // worst case it will complete by the hard goal. + const maxOvershoot = 1.1 + heapGoal = int64(float64(memstats.next_gc) * maxOvershoot) + + // Compute the upper bound on the scan work remaining. + scanWorkExpected = int64(memstats.heap_scan) + } + + // Compute the remaining scan work estimate. // // Note that we currently count allocations during GC as both // scannable heap (heap_scan) and scan work completed - // (scanWork), so this difference won't be changed by - // allocations during GC. - // - // This particular estimate is a strict upper bound on the - // possible remaining scan work for the current heap. - // You might consider dividing this by 2 (or by - // (100+GOGC)/100) to counter this over-estimation, but - // benchmarks show that this has almost no effect on mean - // mutator utilization, heap size, or assist time and it - // introduces the danger of under-estimating and letting the - // mutator outpace the garbage collector. - scanWorkExpected := int64(memstats.heap_scan) - c.scanWork - if scanWorkExpected < 1000 { + // (scanWork), so allocation will change this difference will + // slowly in the soft regime and not at all in the hard + // regime. + scanWorkRemaining := scanWorkExpected - c.scanWork + if scanWorkRemaining < 1000 { // We set a somewhat arbitrary lower bound on // remaining scan work since if we aim a little high, // we can miss by a little. // // We *do* need to enforce that this is at least 1, // since marking is racy and double-scanning objects - // may legitimately make the expected scan work - // negative. - scanWorkExpected = 1000 + // may legitimately make the remaining scan work + // negative, even in the hard goal regime. 
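As a rough illustration of the pacing arithmetic in revise(), with invented numbers and ignoring the 1000-unit floor applied just below:

package main

import "fmt"

func main() {
	const gcpercent = 100
	heapScan, heapLive := 400.0, 150.0 // MB of scannable heap / live heap
	nextGC := 200.0                    // MB, soft goal (next_gc)
	scanDone := 160.0                  // MB of scan work already credited

	// Soft regime (live <= next_gc): in steady state only 100/(100+GOGC)
	// of the scannable heap is expected to be live.
	scanWorkExpected := heapScan * 100 / (100 + gcpercent) // 200 MB
	heapGoal := nextGC                                     // 200 MB

	scanWorkRemaining := scanWorkExpected - scanDone // 40 MB
	heapRemaining := heapGoal - heapLive             // 50 MB

	// Each byte the mutator allocates must pay for this much scan work.
	fmt.Println(scanWorkRemaining / heapRemaining) // assistWorkPerByte = 0.8
}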
+ scanWorkRemaining = 1000 } // Compute the heap distance remaining. - heapDistance := int64(memstats.next_gc) - int64(atomic.Load64(&memstats.heap_live)) - if heapDistance <= 0 { + heapRemaining := heapGoal - int64(live) + if heapRemaining <= 0 { // This shouldn't happen, but if it does, avoid // dividing by zero or setting the assist negative. - heapDistance = 1 + heapRemaining = 1 } // Compute the mutator assist ratio so by the time the mutator // allocates the remaining heap bytes up to next_gc, it will // have done (or stolen) the remaining amount of scan work. - c.assistWorkPerByte = float64(scanWorkExpected) / float64(heapDistance) - c.assistBytesPerWork = float64(heapDistance) / float64(scanWorkExpected) + c.assistWorkPerByte = float64(scanWorkRemaining) / float64(heapRemaining) + c.assistBytesPerWork = float64(heapRemaining) / float64(scanWorkRemaining) } // endCycle computes the trigger ratio for the next cycle. @@ -570,7 +619,7 @@ func (c *gcControllerState) endCycle() float64 { assistDuration := nanotime() - c.markStartTime // Assume background mark hit its utilization goal. - utilization := gcGoalUtilization + utilization := gcBackgroundUtilization // Add assist utilization; avoid divide by zero. if assistDuration > 0 { utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs)) @@ -689,51 +738,20 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { // This P is now dedicated to marking until the end of // the concurrent mark phase. _p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode + } else if c.fractionalUtilizationGoal == 0 { + // No need for fractional workers. + return nil } else { - if !decIfPositive(&c.fractionalMarkWorkersNeeded) { - // No more workers are need right now. - return nil - } - - // This P has picked the token for the fractional worker. - // Is the GC currently under or at the utilization goal? - // If so, do more work. - // - // We used to check whether doing one time slice of work - // would remain under the utilization goal, but that has the - // effect of delaying work until the mutator has run for - // enough time slices to pay for the work. During those time - // slices, write barriers are enabled, so the mutator is running slower. - // Now instead we do the work whenever we're under or at the - // utilization work and pay for it by letting the mutator run later. - // This doesn't change the overall utilization averages, but it - // front loads the GC work so that the GC finishes earlier and - // write barriers can be turned off sooner, effectively giving - // the mutator a faster machine. - // - // The old, slower behavior can be restored by setting - // gcForcePreemptNS = forcePreemptNS. - const gcForcePreemptNS = 0 - - // TODO(austin): We could fast path this and basically - // eliminate contention on c.fractionalMarkWorkersNeeded by - // precomputing the minimum time at which it's worth - // next scheduling the fractional worker. Then Ps - // don't have to fight in the window where we've - // passed that deadline and no one has started the - // worker yet. + // Is this P behind on the fractional utilization + // goal? // - // TODO(austin): Shorter preemption interval for mark - // worker to improve fairness and give this - // finer-grained control over schedule? 
- now := nanotime() - gcController.markStartTime - then := now + gcForcePreemptNS - timeUsed := c.fractionalMarkTime + gcForcePreemptNS - if then > 0 && float64(timeUsed)/float64(then) > c.fractionalUtilizationGoal { - // Nope, we'd overshoot the utilization goal - atomic.Xaddint64(&c.fractionalMarkWorkersNeeded, +1) + // This should be kept in sync with pollFractionalWorkerExit. + delta := nanotime() - gcController.markStartTime + if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal { + // Nope. No need to run a fractional worker. return nil } + // Run a fractional worker. _p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode } @@ -746,6 +764,24 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { return gp } +// pollFractionalWorkerExit returns true if a fractional mark worker +// should self-preempt. It assumes it is called from the fractional +// worker. +func pollFractionalWorkerExit() bool { + // This should be kept in sync with the fractional worker + // scheduler logic in findRunnableGCWorker. + now := nanotime() + delta := now - gcController.markStartTime + if delta <= 0 { + return true + } + p := getg().m.p.ptr() + selfTime := p.gcFractionalMarkTime + (now - p.gcMarkWorkerStartTime) + // Add some slack to the utilization goal so that the + // fractional worker isn't behind again the instant it exits. + return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal +} + // gcSetTriggerRatio sets the trigger ratio and updates everything // derived from it: the absolute trigger, the heap goal, mark pacing, // and sweep pacing. @@ -860,9 +896,22 @@ func gcSetTriggerRatio(triggerRatio float64) { } } -// gcGoalUtilization is the goal CPU utilization for background +// gcGoalUtilization is the goal CPU utilization for // marking as a fraction of GOMAXPROCS. -const gcGoalUtilization = 0.25 +const gcGoalUtilization = 0.30 + +// gcBackgroundUtilization is the fixed CPU utilization for background +// marking. It must be <= gcGoalUtilization. The difference between +// gcGoalUtilization and gcBackgroundUtilization will be made up by +// mark assists. The scheduler will aim to use within 50% of this +// goal. +// +// Setting this to < gcGoalUtilization avoids saturating the trigger +// feedback controller when there are no assists, which allows it to +// better control CPU and heap growth. However, the larger the gap, +// the more mutator assists are expected to happen, which impact +// mutator latency. +const gcBackgroundUtilization = 0.25 // gcCreditSlack is the amount of scan work credit that can can // accumulate locally before updating gcController.scanWork and, @@ -1159,7 +1208,7 @@ func (t gcTrigger) test() bool { if t.kind == gcTriggerAlways { return true } - if gcphase != _GCoff || gcpercent < 0 { + if gcphase != _GCoff { return false } switch t.kind { @@ -1170,6 +1219,9 @@ func (t gcTrigger) test() bool { // own write. return memstats.heap_live >= memstats.gc_trigger case gcTriggerTime: + if gcpercent < 0 { + return false + } lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime)) return lastgc != 0 && t.now-lastgc > forcegcperiod case gcTriggerCycle: @@ -1236,7 +1288,7 @@ func gcStart(mode gcMode, trigger gcTrigger) { } } - // Ok, we're doing it! Stop everybody else + // Ok, we're doing it! 
Stop everybody else semacquire(&worldsema) if trace.enabled { @@ -1249,7 +1301,12 @@ func gcStart(mode gcMode, trigger gcTrigger) { gcResetMarkState() - work.stwprocs, work.maxprocs = gcprocs(), gomaxprocs + work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs + if work.stwprocs > ncpu { + // This is used to compute CPU time of the STW phases, + // so it can't be more than ncpu, even if GOMAXPROCS is. + work.stwprocs = ncpu + } work.heap0 = atomic.Load64(&memstats.heap_live) work.pauseNS = 0 work.mode = mode @@ -1257,6 +1314,9 @@ func gcStart(mode gcMode, trigger gcTrigger) { now := nanotime() work.tSweepTerm = now work.pauseStart = now + if trace.enabled { + traceGCSTWStart(1) + } systemstack(stopTheWorldWithSema) // Finish sweep before we start concurrent scan. systemstack(func() { @@ -1309,11 +1369,17 @@ func gcStart(mode gcMode, trigger gcTrigger) { gcController.markStartTime = now // Concurrent mark. - systemstack(startTheWorldWithSema) - now = nanotime() + systemstack(func() { + now = startTheWorldWithSema(trace.enabled) + }) work.pauseNS += now - work.pauseStart work.tMark = now } else { + if trace.enabled { + // Switch to mark termination STW. + traceGCSTWDone() + traceGCSTWStart(0) + } t := nanotime() work.tMark, work.tMarkTerm = t, t work.heapGoal = work.heap0 @@ -1356,7 +1422,8 @@ top: // TODO(austin): Should dedicated workers keep an eye on this // and exit gcDrain promptly? atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, -0xffffffff) - atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, -0xffffffff) + prevFractionalGoal := gcController.fractionalUtilizationGoal + gcController.fractionalUtilizationGoal = 0 if !gcBlackenPromptly { // Transition from mark 1 to mark 2. @@ -1383,6 +1450,7 @@ top: // workers have exited their loop so we can // start new mark 2 workers. forEachP(func(_p_ *p) { + wbBufFlush1(_p_) _p_.gcw.dispose() }) }) @@ -1399,7 +1467,7 @@ top: // Now we can start up mark 2 workers. atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 0xffffffff) - atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 0xffffffff) + gcController.fractionalUtilizationGoal = prevFractionalGoal incnwait := atomic.Xadd(&work.nwait, +1) if incnwait == work.nproc && !gcMarkWorkAvailable(nil) { @@ -1414,6 +1482,9 @@ top: work.tMarkTerm = now work.pauseStart = now getg().m.preemptoff = "gcing" + if trace.enabled { + traceGCSTWStart(0) + } systemstack(stopTheWorldWithSema) // The gcphase is _GCmark, it will transition to _GCmarktermination // below. The important thing is that the wb remains active until @@ -1574,7 +1645,7 @@ func gcMarkTermination(nextTriggerRatio float64) { // so events don't leak into the wrong cycle. mProf_NextCycle() - systemstack(startTheWorldWithSema) + systemstack(func() { startTheWorldWithSema(true) }) // Flush the heap profile so we can start a new cycle next GC. // This is relatively expensive, so we don't do it with the @@ -1645,10 +1716,7 @@ func gcMarkTermination(nextTriggerRatio float64) { func gcBgMarkStartWorkers() { // Background marking is performed by per-P G's. Ensure that // each P has a background GC G. 
- for _, p := range &allp { - if p == nil || p.status == _Pdead { - break - } + for _, p := range allp { if p.gcBgMarkWorker == 0 { expectSystemGoroutine() go gcBgMarkWorker(p) @@ -1751,6 +1819,7 @@ func gcBgMarkWorker(_p_ *p) { } startTime := nanotime() + _p_.gcMarkWorkerStartTime = startTime decnwait := atomic.Xadd(&work.nwait, -1) if decnwait == work.nproc { @@ -1792,7 +1861,7 @@ func gcBgMarkWorker(_p_ *p) { // without preemption. gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit) case gcMarkWorkerFractionalMode: - gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit) + gcDrain(&_p_.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit) case gcMarkWorkerIdleMode: gcDrain(&_p_.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit) } @@ -1817,7 +1886,7 @@ func gcBgMarkWorker(_p_ *p) { atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1) case gcMarkWorkerFractionalMode: atomic.Xaddint64(&gcController.fractionalMarkTime, duration) - atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 1) + atomic.Xaddint64(&_p_.gcFractionalMarkTime, duration) case gcMarkWorkerIdleMode: atomic.Xaddint64(&gcController.idleMarkTime, duration) } @@ -1915,10 +1984,6 @@ func gcMark(start_time int64) { work.helperDrainBlock = true } - if trace.enabled { - traceGCScanStart() - } - if work.nproc > 1 { noteclear(&work.alldone) helpgc(int32(work.nproc)) @@ -1952,8 +2017,8 @@ func gcMark(start_time int64) { // Double-check that all gcWork caches are empty. This should // be ensured by mark 2 before we enter mark termination. - for i := 0; i < int(gomaxprocs); i++ { - gcw := &allp[i].gcw + for _, p := range allp { + gcw := &p.gcw if !gcw.empty() { throw("P has cached GC work at end of mark termination") } @@ -1962,10 +2027,6 @@ func gcMark(start_time int64) { } } - if trace.enabled { - traceGCScanDone() - } - cachestats() // Update the marked heap stat. @@ -2093,18 +2154,19 @@ func clearpools() { unlock(&sched.deferlock) } -// Timing - -//go:nowritebarrier +// gchelper runs mark termination tasks on Ps other than the P +// coordinating mark termination. +// +// The caller is responsible for ensuring that this has a P to run on, +// even though it's running during STW. Because of this, it's allowed +// to have write barriers. +// +//go:yeswritebarrierrec func gchelper() { _g_ := getg() _g_.m.traceback = 2 gchelperstart() - if trace.enabled { - traceGCScanStart() - } - // Parallel mark over GC roots and heap if gcphase == _GCmarktermination { gcw := &_g_.m.p.ptr().gcw @@ -2116,10 +2178,6 @@ func gchelper() { gcw.dispose() } - if trace.enabled { - traceGCScanDone() - } - nproc := atomic.Load(&work.nproc) // work.nproc can change right after we increment work.ndone if atomic.Xadd(&work.ndone, +1) == nproc-1 { notewakeup(&work.alldone) @@ -2138,6 +2196,8 @@ func gchelperstart() { } } +// Timing + // itoaDiv formats val/(10**dec) into buf. func itoaDiv(buf []byte, val uint64, dec int) []byte { i := len(buf) - 1 diff --git a/libgo/go/runtime/mgc_gccgo.go b/libgo/go/runtime/mgc_gccgo.go index c1fa154..107a70a 100644 --- a/libgo/go/runtime/mgc_gccgo.go +++ b/libgo/go/runtime/mgc_gccgo.go @@ -6,7 +6,10 @@ package runtime -import "unsafe" +import ( + "runtime/internal/sys" + "unsafe" +) // gcRoot is a single GC root: a variable plus a ptrmask. type gcRoot struct { @@ -85,3 +88,21 @@ func checkPreempt() { gp.scanningself = false mcall(gopreempt_m) } + +// gcWriteBarrier implements a write barrier. 
This is implemented in +// assembly in the gc library, but there is no special advantage to +// doing so with gccgo. +//go:nosplit +//go:nowritebarrier +func gcWriteBarrier(dst *uintptr, src uintptr) { + buf := &getg().m.p.ptr().wbBuf + next := buf.next + np := next + 2*sys.PtrSize + buf.next = np + *(*uintptr)(unsafe.Pointer(next)) = src + *(*uintptr)(unsafe.Pointer(next + sys.PtrSize)) = *dst + if np >= buf.end { + wbBufFlush(dst, src) + } + *dst = src +} diff --git a/libgo/go/runtime/mgclarge.go b/libgo/go/runtime/mgclarge.go index 757e88d..fe437bf 100644 --- a/libgo/go/runtime/mgclarge.go +++ b/libgo/go/runtime/mgclarge.go @@ -164,11 +164,10 @@ func (root *mTreap) insert(span *mspan) { } } -func (root *mTreap) removeNode(t *treapNode) *mspan { +func (root *mTreap) removeNode(t *treapNode) { if t.spanKey.npages != t.npagesKey { throw("span and treap node npages do not match") } - result := t.spanKey // Rotate t down to be leaf of tree for removal, respecting priorities. for t.right != nil || t.left != nil { @@ -192,7 +191,6 @@ func (root *mTreap) removeNode(t *treapNode) *mspan { t.spanKey = nil t.npagesKey = 0 mheap_.treapalloc.free(unsafe.Pointer(t)) - return result } // remove searches for, finds, removes from the treap, and returns the smallest diff --git a/libgo/go/runtime/mgcmark.go b/libgo/go/runtime/mgcmark.go index 998a830..7297fcb 100644 --- a/libgo/go/runtime/mgcmark.go +++ b/libgo/go/runtime/mgcmark.go @@ -34,13 +34,13 @@ const ( // span base. maxObletBytes = 128 << 10 - // idleCheckThreshold specifies how many units of work to do - // between run queue checks in an idle worker. Assuming a scan + // drainCheckThreshold specifies how many units of work to do + // between self-preemption checks in gcDrain. Assuming a scan // rate of 1 MB/ms, this is ~100 µs. Lower values have higher // overhead in the scan loop (the scheduler check may perform // a syscall, so its overhead is nontrivial). Higher values // make the system less responsive to incoming work. - idleCheckThreshold = 100000 + drainCheckThreshold = 100000 ) // gcMarkRootPrepare queues root scanning jobs (stacks, globals, and @@ -717,6 +717,7 @@ const ( gcDrainNoBlock gcDrainFlushBgCredit gcDrainIdle + gcDrainFractional // gcDrainBlock means neither gcDrainUntilPreempt or // gcDrainNoBlock. It is the default, but callers should use @@ -733,6 +734,10 @@ const ( // If flags&gcDrainIdle != 0, gcDrain returns when there is other work // to do. This implies gcDrainNoBlock. // +// If flags&gcDrainFractional != 0, gcDrain self-preempts when +// pollFractionalWorkerExit() returns true. This implies +// gcDrainNoBlock. +// // If flags&gcDrainNoBlock != 0, gcDrain returns as soon as it is // unable to get more work. Otherwise, it will block until all // blocking calls are blocked in gcDrain. @@ -749,14 +754,24 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { gp := getg().m.curg preemptible := flags&gcDrainUntilPreempt != 0 - blocking := flags&(gcDrainUntilPreempt|gcDrainIdle|gcDrainNoBlock) == 0 + blocking := flags&(gcDrainUntilPreempt|gcDrainIdle|gcDrainFractional|gcDrainNoBlock) == 0 flushBgCredit := flags&gcDrainFlushBgCredit != 0 idle := flags&gcDrainIdle != 0 initScanWork := gcw.scanWork - // idleCheck is the scan work at which to perform the next - // idle check with the scheduler. - idleCheck := initScanWork + idleCheckThreshold + + // checkWork is the scan work before performing the next + // self-preempt check. 
+ checkWork := int64(1<<63 - 1) + var check func() bool + if flags&(gcDrainIdle|gcDrainFractional) != 0 { + checkWork = initScanWork + drainCheckThreshold + if idle { + check = pollWork + } else if flags&gcDrainFractional != 0 { + check = pollFractionalWorkerExit + } + } // Drain root marking jobs. if work.markrootNext < work.markrootJobs { @@ -766,7 +781,7 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { break } markroot(gcw, job) - if idle && pollWork() { + if check != nil && check() { goto done } } @@ -807,12 +822,12 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { gcFlushBgCredit(gcw.scanWork - initScanWork) initScanWork = 0 } - idleCheck -= gcw.scanWork + checkWork -= gcw.scanWork gcw.scanWork = 0 - if idle && idleCheck <= 0 { - idleCheck += idleCheckThreshold - if pollWork() { + if checkWork <= 0 { + checkWork += drainCheckThreshold + if check != nil && check() { break } } @@ -1091,6 +1106,9 @@ func shade(b uintptr) { // obj is the start of an object with mark mbits. // If it isn't already marked, mark it and enqueue into gcw. // base and off are for debugging only and could be removed. +// +// See also wbBufFlush1, which partially duplicates this logic. +// //go:nowritebarrierrec func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork, objIndex uintptr, forStack bool) { // obj should be start of allocation, and so must be at least pointer-aligned. @@ -1249,10 +1267,7 @@ func gcmarknewobject(obj, size, scanSize uintptr) { // // The world must be stopped. func gcMarkTinyAllocs() { - for _, p := range &allp { - if p == nil || p.status == _Pdead { - break - } + for _, p := range allp { c := p.mcache if c == nil || c.tiny == 0 { continue diff --git a/libgo/go/runtime/mgcwork.go b/libgo/go/runtime/mgcwork.go index 461679b..c6634fc 100644 --- a/libgo/go/runtime/mgcwork.go +++ b/libgo/go/runtime/mgcwork.go @@ -85,6 +85,13 @@ type gcWork struct { scanWork int64 } +// Most of the methods of gcWork are go:nowritebarrierrec because the +// write barrier itself can invoke gcWork methods but the methods are +// not generally re-entrant. Hence, if a gcWork method invoked the +// write barrier while the gcWork was in an inconsistent state, and +// the write barrier in turn invoked a gcWork method, it could +// permanently corrupt the gcWork. + func (w *gcWork) init() { w.wbuf1 = getempty() wbuf2 := trygetfull() @@ -96,7 +103,7 @@ func (w *gcWork) init() { // put enqueues a pointer for the garbage collector to trace. // obj must point to the beginning of a heap object or an oblet. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) put(obj uintptr) { flushed := false wbuf := w.wbuf1 @@ -129,7 +136,7 @@ func (w *gcWork) put(obj uintptr) { // putFast does a put and returns true if it can be done quickly // otherwise it returns false and the caller needs to call put. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) putFast(obj uintptr) bool { wbuf := w.wbuf1 if wbuf == nil { @@ -143,12 +150,45 @@ func (w *gcWork) putFast(obj uintptr) bool { return true } +// putBatch performs a put on every pointer in obj. See put for +// constraints on these pointers. 
+// +//go:nowritebarrierrec +func (w *gcWork) putBatch(obj []uintptr) { + if len(obj) == 0 { + return + } + + flushed := false + wbuf := w.wbuf1 + if wbuf == nil { + w.init() + wbuf = w.wbuf1 + } + + for len(obj) > 0 { + for wbuf.nobj == len(wbuf.obj) { + putfull(wbuf) + w.wbuf1, w.wbuf2 = w.wbuf2, getempty() + wbuf = w.wbuf1 + flushed = true + } + n := copy(wbuf.obj[wbuf.nobj:], obj) + wbuf.nobj += n + obj = obj[n:] + } + + if flushed && gcphase == _GCmark { + gcController.enlistWorker() + } +} + // tryGet dequeues a pointer for the garbage collector to trace. // // If there are no pointers remaining in this gcWork or in the global // queue, tryGet returns 0. Note that there may still be pointers in // other gcWork instances or other caches. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) tryGet() uintptr { wbuf := w.wbuf1 if wbuf == nil { @@ -177,7 +217,7 @@ func (w *gcWork) tryGet() uintptr { // tryGetFast dequeues a pointer for the garbage collector to trace // if one is readily available. Otherwise it returns 0 and // the caller is expected to call tryGet(). -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) tryGetFast() uintptr { wbuf := w.wbuf1 if wbuf == nil { @@ -194,7 +234,7 @@ func (w *gcWork) tryGetFast() uintptr { // get dequeues a pointer for the garbage collector to trace, blocking // if necessary to ensure all pointers from all queues and caches have // been retrieved. get returns 0 if there are no pointers remaining. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) get() uintptr { wbuf := w.wbuf1 if wbuf == nil { @@ -228,7 +268,7 @@ func (w *gcWork) get() uintptr { // GC can inspect them. This helps reduce the mutator's // ability to hide pointers during the concurrent mark phase. // -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) dispose() { if wbuf := w.wbuf1; wbuf != nil { if wbuf.nobj == 0 { @@ -262,7 +302,7 @@ func (w *gcWork) dispose() { // balance moves some work that's cached in this gcWork back on the // global queue. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) balance() { if w.wbuf1 == nil { return @@ -282,7 +322,7 @@ func (w *gcWork) balance() { } // empty returns true if w has no mark work available. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) empty() bool { return w.wbuf1 == nil || (w.wbuf1.nobj == 0 && w.wbuf2.nobj == 0) } diff --git a/libgo/go/runtime/mheap.go b/libgo/go/runtime/mheap.go index 8749f97..d971bfe 100644 --- a/libgo/go/runtime/mheap.go +++ b/libgo/go/runtime/mheap.go @@ -56,6 +56,12 @@ type mheap struct { // Internal pages map to an arbitrary span. // For pages that have never been allocated, spans entries are nil. // + // Modifications are protected by mheap.lock. Reads can be + // performed without locking, but ONLY from indexes that are + // known to contain in-use or stack spans. This means there + // must not be a safe-point between establishing that an + // address is live and looking it up in the spans array. + // // This is backed by a reserved region of the address space so // it can grow without moving. The memory up to len(spans) is // mapped. cap(spans) indicates the total reserved memory. @@ -154,6 +160,8 @@ type mheap struct { specialfinalizeralloc fixalloc // allocator for specialfinalizer* specialprofilealloc fixalloc // allocator for specialprofile* speciallock mutex // lock for special record allocators. 
+ + unused *specialfinalizer // never set, just here to force the specialfinalizer type into DWARF } var mheap_ mheap @@ -311,6 +319,17 @@ func (s *mspan) layout() (size, n, total uintptr) { return } +// recordspan adds a newly allocated span to h.allspans. +// +// This only happens the first time a span is allocated from +// mheap.spanalloc (it is not called when a span is reused). +// +// Write barriers are disallowed here because it can be called from +// gcWork when allocating new workbufs. However, because it's an +// indirect call from the fixalloc initializer, the compiler can't see +// this. +// +//go:nowritebarrierrec func recordspan(vh unsafe.Pointer, p unsafe.Pointer) { h := (*mheap)(vh) s := (*mspan)(p) @@ -320,8 +339,8 @@ func recordspan(vh unsafe.Pointer, p unsafe.Pointer) { n = cap(h.allspans) * 3 / 2 } var new []*mspan - sp := (*slice)(unsafe.Pointer(&new)) - sp.array = sysAlloc(uintptr(n)*sys.PtrSize, &memstats.other_sys) + sp := (*notInHeapSlice)(unsafe.Pointer(&new)) + sp.array = (*notInHeap)(sysAlloc(uintptr(n)*sys.PtrSize, &memstats.other_sys)) if sp.array == nil { throw("runtime: cannot allocate memory") } @@ -331,12 +350,13 @@ func recordspan(vh unsafe.Pointer, p unsafe.Pointer) { copy(new, h.allspans) } oldAllspans := h.allspans - h.allspans = new + *(*notInHeapSlice)(unsafe.Pointer(&h.allspans)) = *(*notInHeapSlice)(unsafe.Pointer(&new)) if len(oldAllspans) != 0 { sysFree(unsafe.Pointer(&oldAllspans[0]), uintptr(cap(oldAllspans))*unsafe.Sizeof(oldAllspans[0]), &memstats.other_sys) } } - h.allspans = append(h.allspans, s) + h.allspans = h.allspans[:len(h.allspans)+1] + h.allspans[len(h.allspans)-1] = s } // A spanClass represents the size class and noscan-ness of a span. @@ -854,7 +874,7 @@ HaveSpan: // Large spans have a minimum size of 1MByte. The maximum number of large spans to support // 1TBytes is 1 million, experimentation using random sizes indicates that the depth of // the tree is less that 2x that of a perfectly balanced tree. For 1TByte can be referenced -// by a perfectly balanced tree with a a depth of 20. Twice that is an acceptable 40. +// by a perfectly balanced tree with a depth of 20. Twice that is an acceptable 40. func (h *mheap) isLargeSpan(npages uintptr) bool { return npages >= uintptr(len(h.free)) } @@ -1120,34 +1140,35 @@ func scavengelist(list *mSpanList, now, limit uint64) uintptr { var sumreleased uintptr for s := list.first; s != nil; s = s.next { - if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages { - start := s.base() - end := start + s.npages<<_PageShift - if physPageSize > _PageSize { - // We can only release pages in - // physPageSize blocks, so round start - // and end in. (Otherwise, madvise - // will round them *out* and release - // more memory than we want.) - start = (start + physPageSize - 1) &^ (physPageSize - 1) - end &^= physPageSize - 1 - if end <= start { - // start and end don't span a - // whole physical page. - continue - } - } - len := end - start - - released := len - (s.npreleased << _PageShift) - if physPageSize > _PageSize && released == 0 { + if (now-uint64(s.unusedsince)) <= limit || s.npreleased == s.npages { + continue + } + start := s.base() + end := start + s.npages<<_PageShift + if physPageSize > _PageSize { + // We can only release pages in + // physPageSize blocks, so round start + // and end in. (Otherwise, madvise + // will round them *out* and release + // more memory than we want.) 
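A standalone check of the rounding done just below, with invented addresses, a 64 KiB physical page size, and the usual 8 KiB runtime page size:

package main

import "fmt"

func main() {
	const physPageSize = uintptr(64 << 10)
	start := uintptr(0x1232000) // span start
	end := uintptr(0x1272000)   // span end (a 256 KiB span)

	start = (start + physPageSize - 1) &^ (physPageSize - 1) // rounds up to 0x1240000
	end &^= physPageSize - 1                                 // rounds down to 0x1270000

	fmt.Printf("release [%#x, %#x): %d KiB of 256 KiB\n", start, end, (end-start)>>10)
}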
+ start = (start + physPageSize - 1) &^ (physPageSize - 1) + end &^= physPageSize - 1 + if end <= start { + // start and end don't span a + // whole physical page. continue } - memstats.heap_released += uint64(released) - sumreleased += released - s.npreleased = len >> _PageShift - sysUnused(unsafe.Pointer(start), len) } + len := end - start + + released := len - (s.npreleased << _PageShift) + if physPageSize > _PageSize && released == 0 { + continue + } + memstats.heap_released += uint64(released) + sumreleased += released + s.npreleased = len >> _PageShift + sysUnused(unsafe.Pointer(start), len) } return sumreleased } diff --git a/libgo/go/runtime/mksizeclasses.go b/libgo/go/runtime/mksizeclasses.go index 0cb2b33..b146dbc 100644 --- a/libgo/go/runtime/mksizeclasses.go +++ b/libgo/go/runtime/mksizeclasses.go @@ -24,8 +24,8 @@ // In practice, only one of the wastes comes into play for a // given size (sizes < 512 waste mainly on the round-up, // sizes > 512 waste mainly on the page chopping). -// -// TODO(rsc): Compute max waste for any given size. +// For really small sizes, alignment constraints force the +// overhead higher. package main @@ -242,15 +242,18 @@ nextk: } func printComment(w io.Writer, classes []class) { - fmt.Fprintf(w, "// %-5s %-9s %-10s %-7s %-11s\n", "class", "bytes/obj", "bytes/span", "objects", "waste bytes") + fmt.Fprintf(w, "// %-5s %-9s %-10s %-7s %-10s %-9s\n", "class", "bytes/obj", "bytes/span", "objects", "tail waste", "max waste") + prevSize := 0 for i, c := range classes { if i == 0 { continue } spanSize := c.npages * pageSize objects := spanSize / c.size - waste := spanSize - c.size*(spanSize/c.size) - fmt.Fprintf(w, "// %5d %9d %10d %7d %11d\n", i, c.size, spanSize, objects, waste) + tailWaste := spanSize - c.size*(spanSize/c.size) + maxWaste := float64((c.size-prevSize-1)*objects+tailWaste) / float64(spanSize) + prevSize = c.size + fmt.Fprintf(w, "// %5d %9d %10d %7d %10d %8.2f%%\n", i, c.size, spanSize, objects, tailWaste, 100*maxWaste) } fmt.Fprintf(w, "\n") } diff --git a/libgo/go/runtime/mstats.go b/libgo/go/runtime/mstats.go index 71dc223..22f5195 100644 --- a/libgo/go/runtime/mstats.go +++ b/libgo/go/runtime/mstats.go @@ -589,12 +589,13 @@ func updatememstats() { memstats.heap_objects = memstats.nmalloc - memstats.nfree } +// cachestats flushes all mcache stats. +// +// The world must be stopped. +// //go:nowritebarrier func cachestats() { - for _, p := range &allp { - if p == nil { - break - } + for _, p := range allp { c := p.mcache if c == nil { continue @@ -610,9 +611,6 @@ func cachestats() { //go:nowritebarrier func flushmcache(i int) { p := allp[i] - if p == nil { - return - } c := p.mcache if c == nil { return @@ -665,7 +663,7 @@ func purgecachedstats(c *mcache) { // overflow errors. //go:nosplit func mSysStatInc(sysStat *uint64, n uintptr) { - if sys.BigEndian != 0 { + if sys.BigEndian { atomic.Xadd64(sysStat, int64(n)) return } @@ -679,7 +677,7 @@ func mSysStatInc(sysStat *uint64, n uintptr) { // mSysStatInc apply. //go:nosplit func mSysStatDec(sysStat *uint64, n uintptr) { - if sys.BigEndian != 0 { + if sys.BigEndian { atomic.Xadd64(sysStat, -int64(n)) return } diff --git a/libgo/go/runtime/mwbbuf.go b/libgo/go/runtime/mwbbuf.go new file mode 100644 index 0000000..a060df8b --- /dev/null +++ b/libgo/go/runtime/mwbbuf.go @@ -0,0 +1,248 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// This implements the write barrier buffer. The write barrier itself +// is gcWriteBarrier and is implemented in assembly. +// +// The write barrier has a fast path and a slow path. The fast path +// simply enqueues to a per-P write barrier buffer. It's written in +// assembly and doesn't clobber any general purpose registers, so it +// doesn't have the usual overheads of a Go call. +// +// When the buffer fills up, the write barrier invokes the slow path +// (wbBufFlush) to flush the buffer to the GC work queues. In this +// path, since the compiler didn't spill registers, we spill *all* +// registers and disallow any GC safe points that could observe the +// stack frame (since we don't know the types of the spilled +// registers). + +package runtime + +import ( + "runtime/internal/sys" + "unsafe" +) + +// testSmallBuf forces a small write barrier buffer to stress write +// barrier flushing. +const testSmallBuf = false + +// wbBuf is a per-P buffer of pointers queued by the write barrier. +// This buffer is flushed to the GC workbufs when it fills up and on +// various GC transitions. +// +// This is closely related to a "sequential store buffer" (SSB), +// except that SSBs are usually used for maintaining remembered sets, +// while this is used for marking. +type wbBuf struct { + // next points to the next slot in buf. It must not be a + // pointer type because it can point past the end of buf and + // must be updated without write barriers. + // + // This is a pointer rather than an index to optimize the + // write barrier assembly. + next uintptr + + // end points to just past the end of buf. It must not be a + // pointer type because it points past the end of buf and must + // be updated without write barriers. + end uintptr + + // buf stores a series of pointers to execute write barriers + // on. This must be a multiple of wbBufEntryPointers because + // the write barrier only checks for overflow once per entry. + buf [wbBufEntryPointers * wbBufEntries]uintptr +} + +const ( + // wbBufEntries is the number of write barriers between + // flushes of the write barrier buffer. + // + // This trades latency for throughput amortization. Higher + // values amortize flushing overhead more, but increase the + // latency of flushing. Higher values also increase the cache + // footprint of the buffer. + // + // TODO: What is the latency cost of this? Tune this value. + wbBufEntries = 256 + + // wbBufEntryPointers is the number of pointers added to the + // buffer by each write barrier. + wbBufEntryPointers = 2 +) + +// reset empties b by resetting its next and end pointers. +func (b *wbBuf) reset() { + start := uintptr(unsafe.Pointer(&b.buf[0])) + b.next = start + if gcBlackenPromptly || writeBarrier.cgo { + // Effectively disable the buffer by forcing a flush + // on every barrier. + b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers])) + } else if testSmallBuf { + // For testing, allow two barriers in the buffer. If + // we only did one, then barriers of non-heap pointers + // would be no-ops. This lets us combine a buffered + // barrier with a flush at a later time. + b.end = uintptr(unsafe.Pointer(&b.buf[2*wbBufEntryPointers])) + } else { + b.end = start + uintptr(len(b.buf))*unsafe.Sizeof(b.buf[0]) + } + + if (b.end-b.next)%(wbBufEntryPointers*unsafe.Sizeof(b.buf[0])) != 0 { + throw("bad write barrier buffer bounds") + } +} + +// putFast adds old and new to the write barrier buffer and returns +// false if a flush is necessary. 
Callers should use this as: +// +// buf := &getg().m.p.ptr().wbBuf +// if !buf.putFast(old, new) { +// wbBufFlush(...) +// } +// +// The arguments to wbBufFlush depend on whether the caller is doing +// its own cgo pointer checks. If it is, then this can be +// wbBufFlush(nil, 0). Otherwise, it must pass the slot address and +// new. +// +// Since buf is a per-P resource, the caller must ensure there are no +// preemption points while buf is in use. +// +// It must be nowritebarrierrec to because write barriers here would +// corrupt the write barrier buffer. It (and everything it calls, if +// it called anything) has to be nosplit to avoid scheduling on to a +// different P and a different buffer. +// +//go:nowritebarrierrec +//go:nosplit +func (b *wbBuf) putFast(old, new uintptr) bool { + p := (*[2]uintptr)(unsafe.Pointer(b.next)) + p[0] = old + p[1] = new + b.next += 2 * sys.PtrSize + return b.next != b.end +} + +// wbBufFlush flushes the current P's write barrier buffer to the GC +// workbufs. It is passed the slot and value of the write barrier that +// caused the flush so that it can implement cgocheck. +// +// This must not have write barriers because it is part of the write +// barrier implementation. +// +// This and everything it calls must be nosplit because 1) the stack +// contains untyped slots from gcWriteBarrier and 2) there must not be +// a GC safe point between the write barrier test in the caller and +// flushing the buffer. +// +// TODO: A "go:nosplitrec" annotation would be perfect for this. +// +//go:nowritebarrierrec +//go:nosplit +func wbBufFlush(dst *uintptr, src uintptr) { + if getg().m.dying > 0 { + // We're going down. Not much point in write barriers + // and this way we can allow write barriers in the + // panic path. + return + } + + if writeBarrier.cgo && dst != nil { + // This must be called from the stack that did the + // write. It's nosplit all the way down. + cgoCheckWriteBarrier(dst, src) + if !writeBarrier.needed { + // We were only called for cgocheck. + b := &getg().m.p.ptr().wbBuf + b.next = uintptr(unsafe.Pointer(&b.buf[0])) + return + } + } + + // Switch to the system stack so we don't have to worry about + // the untyped stack slots or safe points. + systemstack(func() { + wbBufFlush1(getg().m.p.ptr()) + }) +} + +// wbBufFlush1 flushes p's write barrier buffer to the GC work queue. +// +// This must not have write barriers because it is part of the write +// barrier implementation, so this may lead to infinite loops or +// buffer corruption. +// +// This must be non-preemptible because it uses the P's workbuf. +// +//go:nowritebarrierrec +//go:systemstack +func wbBufFlush1(_p_ *p) { + // Get the buffered pointers. + start := uintptr(unsafe.Pointer(&_p_.wbBuf.buf[0])) + n := (_p_.wbBuf.next - start) / unsafe.Sizeof(_p_.wbBuf.buf[0]) + ptrs := _p_.wbBuf.buf[:n] + + // Reset the buffer. + _p_.wbBuf.reset() + + if useCheckmark { + // Slow path for checkmark mode. + for _, ptr := range ptrs { + shade(ptr) + } + return + } + + // Mark all of the pointers in the buffer and record only the + // pointers we greyed. We use the buffer itself to temporarily + // record greyed pointers. + // + // TODO: Should scanobject/scanblock just stuff pointers into + // the wbBuf? Then this would become the sole greying path. + gcw := &_p_.gcw + pos := 0 + arenaStart := mheap_.arena_start + for _, ptr := range ptrs { + if ptr < arenaStart { + // nil pointers are very common, especially + // for the "old" values. 
Filter out these and + // other "obvious" non-heap pointers ASAP. + // + // TODO: Should we filter out nils in the fast + // path to reduce the rate of flushes? + continue + } + // TODO: This doesn't use hbits, so calling + // heapBitsForObject seems a little silly. We could + // easily separate this out since heapBitsForObject + // just calls heapBitsForAddr(obj) to get hbits. + obj, _, span, objIndex := heapBitsForObject(ptr, 0, 0, false) + if obj == 0 { + continue + } + // TODO: Consider making two passes where the first + // just prefetches the mark bits. + mbits := span.markBitsForIndex(objIndex) + if mbits.isMarked() { + continue + } + mbits.setMarked() + if span.spanclass.noscan() { + gcw.bytesMarked += uint64(span.elemsize) + continue + } + ptrs[pos] = obj + pos++ + } + + // Enqueue the greyed objects. + gcw.putBatch(ptrs[:pos]) + if gcphase == _GCmarktermination || gcBlackenPromptly { + // Ps aren't allowed to cache work during mark + // termination. + gcw.dispose() + } +} diff --git a/libgo/go/runtime/netpoll_kqueue.go b/libgo/go/runtime/netpoll_kqueue.go index 47927fe..1f68eff 100644 --- a/libgo/go/runtime/netpoll_kqueue.go +++ b/libgo/go/runtime/netpoll_kqueue.go @@ -97,10 +97,23 @@ retry: for i := 0; i < int(n); i++ { ev := &events[i] var mode int32 - if ev.filter == _EVFILT_READ { + switch ev.filter { + case _EVFILT_READ: mode += 'r' - } - if ev.filter == _EVFILT_WRITE { + + // On some systems when the read end of a pipe + // is closed the write end will not get a + // _EVFILT_WRITE event, but will get a + // _EVFILT_READ event with EV_EOF set. + // Note that setting 'w' here just means that we + // will wake up a goroutine waiting to write; + // that goroutine will try the write again, + // and the appropriate thing will happen based + // on what that write returns (success, EPIPE, EAGAIN). + if ev.flags&_EV_EOF != 0 { + mode += 'w' + } + case _EVFILT_WRITE: mode += 'w' } if mode != 0 { diff --git a/libgo/go/runtime/netpoll_windows.go b/libgo/go/runtime/netpoll_windows.go index 79dafb0..134071f 100644 --- a/libgo/go/runtime/netpoll_windows.go +++ b/libgo/go/runtime/netpoll_windows.go @@ -47,7 +47,7 @@ func netpolldescriptor() uintptr { func netpollopen(fd uintptr, pd *pollDesc) int32 { if stdcall4(_CreateIoCompletionPort, fd, iocphandle, 0, 0) == 0 { - return -int32(getlasterror()) + return int32(getlasterror()) } return 0 } diff --git a/libgo/go/runtime/os_freebsd.go b/libgo/go/runtime/os_freebsd.go index a4d2886..8c3535b 100644 --- a/libgo/go/runtime/os_freebsd.go +++ b/libgo/go/runtime/os_freebsd.go @@ -16,6 +16,17 @@ type mOS struct { //extern _umtx_op func sys_umtx_op(addr *uint32, mode int32, val uint32, uaddr1 uinptr, ts *umtx_time) int32 +func getPageSize() uintptr { + mib := [2]uint32{_CTL_HW, _HW_PAGESIZE} + out := uint32(0) + nout := unsafe.Sizeof(out) + ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0) + if ret >= 0 { + return uintptr(out) + } + return 0 +} + // FreeBSD's umtx_op syscall is effectively the same as Linux's futex, and // thus the code is largely similar. See Linux implementation // and lock_futex.go for comments. 
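[Editorial aside, not part of the patch] A note on the scavengelist rewrite in mheap.go above: when the physical page size is larger than the runtime's 8KB page, the released range must be rounded inward to physical-page boundaries, because madvise would otherwise round outward and give back memory the runtime still considers in use; a range that no longer covers a whole physical page after rounding is skipped, which is the `continue` in the rewritten loop. The following standalone sketch shows just that rounding arithmetic — the constant and function name are illustrative, not runtime identifiers, and a 16KB physical page is assumed for the example.

// scavenge_round_sketch.go -- illustrative only, not runtime code.
package main

import "fmt"

// Assume a 16KB physical page for the example; the real value comes from the OS.
const physPageSize uintptr = 16 << 10

// roundToPhysPages rounds [start, end) inward to physical page boundaries,
// mirroring the rounding done in scavengelist. ok reports whether the range
// still covers at least one whole physical page after rounding.
func roundToPhysPages(start, end uintptr) (s, e uintptr, ok bool) {
	s = (start + physPageSize - 1) &^ (physPageSize - 1) // round start up
	e = end &^ (physPageSize - 1)                        // round end down
	return s, e, e > s
}

func main() {
	// A span that does not cover a whole 16KB physical page: nothing to release.
	s, e, ok := roundToPhysPages(0x5000, 0x7000)
	fmt.Printf("%#x %#x %v\n", s, e, ok) // 0x8000 0x4000 false

	// A larger span: only the fully covered physical pages are released.
	s, e, ok = roundToPhysPages(0x5000, 0x1a000)
	fmt.Printf("%#x %#x %v\n", s, e, ok) // 0x8000 0x18000 true
}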
diff --git a/libgo/go/runtime/os_linux.go b/libgo/go/runtime/os_linux.go index e1a6a30..816327e 100644 --- a/libgo/go/runtime/os_linux.go +++ b/libgo/go/runtime/os_linux.go @@ -106,45 +106,46 @@ func sysargs(argc int32, argv **byte) { // now argv+n is auxv auxv := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize)) - if sysauxv(auxv[:]) == 0 { - // In some situations we don't get a loader-provided - // auxv, such as when loaded as a library on Android. - // Fall back to /proc/self/auxv. - fd := open(&procAuxv[0], 0 /* O_RDONLY */, 0) - if fd < 0 { - // On Android, /proc/self/auxv might be unreadable (issue 9229), so we fallback to - // try using mincore to detect the physical page size. - // mincore should return EINVAL when address is not a multiple of system page size. - const size = 256 << 10 // size of memory region to allocate - p := mmap(nil, size, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) - if uintptr(p) < 4096 { - return - } - var n uintptr - for n = 4 << 10; n < size; n <<= 1 { - err := mincore(unsafe.Pointer(uintptr(p)+n), 1, &addrspace_vec[0]) - if err == 0 { - physPageSize = n - break - } - } - if physPageSize == 0 { - physPageSize = size - } - munmap(p, size) + if sysauxv(auxv[:]) != 0 { + return + } + // In some situations we don't get a loader-provided + // auxv, such as when loaded as a library on Android. + // Fall back to /proc/self/auxv. + fd := open(&procAuxv[0], 0 /* O_RDONLY */, 0) + if fd < 0 { + // On Android, /proc/self/auxv might be unreadable (issue 9229), so we fallback to + // try using mincore to detect the physical page size. + // mincore should return EINVAL when address is not a multiple of system page size. + const size = 256 << 10 // size of memory region to allocate + p, err := mmap(nil, size, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { return } - var buf [128]uintptr - n := read(fd, noescape(unsafe.Pointer(&buf[0])), int32(unsafe.Sizeof(buf))) - closefd(fd) - if n < 0 { - return + var n uintptr + for n = 4 << 10; n < size; n <<= 1 { + err := mincore(unsafe.Pointer(uintptr(p)+n), 1, &addrspace_vec[0]) + if err == 0 { + physPageSize = n + break + } + } + if physPageSize == 0 { + physPageSize = size } - // Make sure buf is terminated, even if we didn't read - // the whole file. - buf[len(buf)-2] = _AT_NULL - sysauxv(buf[:]) + munmap(p, size) + return + } + var buf [128]uintptr + n = read(fd, noescape(unsafe.Pointer(&buf[0])), int32(unsafe.Sizeof(buf))) + closefd(fd) + if n < 0 { + return } + // Make sure buf is terminated, even if we didn't read + // the whole file. + buf[len(buf)-2] = _AT_NULL + sysauxv(buf[:]) } func sysauxv(auxv []uintptr) int { diff --git a/libgo/go/runtime/os_linux_ppc64x.go b/libgo/go/runtime/os_linux_ppc64x.go index b324344..d27902d 100644 --- a/libgo/go/runtime/os_linux_ppc64x.go +++ b/libgo/go/runtime/os_linux_ppc64x.go @@ -7,55 +7,22 @@ package runtime -import ( - "runtime/internal/sys" -) +// For go:linkname +import _ "unsafe" -const ( - // ISA level - // Go currently requires POWER5 as a minimum for ppc64, so we need - // to check for ISA 2.03 and beyond. 
- _PPC_FEATURE_POWER5_PLUS = 0x00020000 // ISA 2.03 (POWER5+) - _PPC_FEATURE_ARCH_2_05 = 0x00001000 // ISA 2.05 (POWER6) - _PPC_FEATURE_POWER6_EXT = 0x00000200 // mffgpr/mftgpr extension (POWER6x) - _PPC_FEATURE_ARCH_2_06 = 0x00000100 // ISA 2.06 (POWER7) - _PPC_FEATURE2_ARCH_2_07 = 0x80000000 // ISA 2.07 (POWER8) +// ppc64x doesn't have a 'cpuid' instruction equivalent and relies on +// HWCAP/HWCAP2 bits for hardware capabilities. - // Standalone capabilities - _PPC_FEATURE_HAS_ALTIVEC = 0x10000000 // SIMD/Vector unit - _PPC_FEATURE_HAS_VSX = 0x00000080 // Vector scalar unit -) - -type facilities struct { - _ [sys.CacheLineSize]byte - isPOWER5x bool // ISA 2.03 - isPOWER6 bool // ISA 2.05 - isPOWER6x bool // ISA 2.05 + mffgpr/mftgpr extension - isPOWER7 bool // ISA 2.06 - isPOWER8 bool // ISA 2.07 - hasVMX bool // Vector unit - hasVSX bool // Vector scalar unit - _ [sys.CacheLineSize]byte -} - -// cpu can be tested at runtime in go assembler code to check for -// a certain ISA level or hardware capability, for example: -// ·cpu+facilities_hasVSX(SB) for checking the availability of VSX -// or -// ·cpu+facilities_isPOWER7(SB) for checking if the processor implements -// ISA 2.06 instructions. -var cpu facilities +//go:linkname cpu_hwcap internal/cpu.ppc64x_hwcap +//go:linkname cpu_hwcap2 internal/cpu.ppc64x_hwcap2 +var cpu_hwcap uint +var cpu_hwcap2 uint func archauxv(tag, val uintptr) { switch tag { case _AT_HWCAP: - cpu.isPOWER5x = val&_PPC_FEATURE_POWER5_PLUS != 0 - cpu.isPOWER6 = val&_PPC_FEATURE_ARCH_2_05 != 0 - cpu.isPOWER6x = val&_PPC_FEATURE_POWER6_EXT != 0 - cpu.isPOWER7 = val&_PPC_FEATURE_ARCH_2_06 != 0 - cpu.hasVMX = val&_PPC_FEATURE_HAS_ALTIVEC != 0 - cpu.hasVSX = val&_PPC_FEATURE_HAS_VSX != 0 + cpu_hwcap = uint(val) case _AT_HWCAP2: - cpu.isPOWER8 = val&_PPC_FEATURE2_ARCH_2_07 != 0 + cpu_hwcap2 = uint(val) } } diff --git a/libgo/go/runtime/os_netbsd.go b/libgo/go/runtime/os_netbsd.go index 464ce88..81ebe76 100644 --- a/libgo/go/runtime/os_netbsd.go +++ b/libgo/go/runtime/os_netbsd.go @@ -15,7 +15,7 @@ type mOS struct { //go:noescape //extern lwp_park -func lwp_park(abstime *timespec, unpark int32, hint, unparkhint unsafe.Pointer) int32 +func lwp_park(ts int32, rel int32, abstime *timespec, unpark int32, hint, unparkhint unsafe.Pointer) int32 //go:noescape //extern lwp_unpark @@ -31,10 +31,9 @@ func semasleep(ns int64) int32 { // Compute sleep deadline. var tsp *timespec + var ts timespec if ns >= 0 { - var ts timespec var nsec int32 - ns += nanotime() ts.set_sec(int64(timediv(ns, 1000000000, &nsec))) ts.set_nsec(nsec) tsp = &ts @@ -50,9 +49,18 @@ func semasleep(ns int64) int32 { } // Sleep until unparked by semawakeup or timeout. - ret := lwp_park(tsp, 0, unsafe.Pointer(&_g_.m.mos.waitsemacount), nil) + ret := lwp_park(_CLOCK_MONOTONIC, _TIMER_RELTIME, tsp, 0, unsafe.Pointer(&_g_.m.waitsemacount), nil) if ret == _ETIMEDOUT { return -1 + } else if ret == _EINTR && ns >= 0 { + // Avoid sleeping forever if we keep getting + // interrupted (for example by the profiling + // timer). It would be if tsp upon return had the + // remaining time to sleep, but this is good enough. 
+ var nsec int32 + ns /= 2 + ts.set_sec(timediv(ns, 1000000000, &nsec)) + ts.set_nsec(nsec) } } } diff --git a/libgo/go/runtime/panic.go b/libgo/go/runtime/panic.go index c39a58d..5cc325f 100644 --- a/libgo/go/runtime/panic.go +++ b/libgo/go/runtime/panic.go @@ -170,7 +170,18 @@ func freedefer(d *_defer) { unlock(&sched.deferlock) }) } - *d = _defer{} + + // These lines used to be simply `*d = _defer{}` but that + // started causing a nosplit stack overflow via typedmemmove. + d.link = nil + d.frame = nil + d.panicStack = nil + d._panic = nil + d.pfn = 0 + d.arg = nil + d.retaddr = 0 + d.makefunccanrecover = false + pp.deferpool = append(pp.deferpool, d) } @@ -327,7 +338,7 @@ func unwindStack() { // Goexit terminates the goroutine that calls it. No other goroutine is affected. // Goexit runs all deferred calls before terminating the goroutine. Because Goexit -// is not panic, however, any recover calls in those deferred functions will return nil. +// is not a panic, any recover calls in those deferred functions will return nil. // // Calling Goexit from the main goroutine terminates that goroutine // without func main returning. Since func main has not returned, @@ -599,7 +610,7 @@ func canrecover(retaddr uintptr) bool { // caller starts with "runtime.", then we are permitted to // call recover. var locs [16]location - if callers(2, locs[:2]) < 2 { + if callers(1, locs[:2]) < 2 { return false } @@ -619,7 +630,7 @@ func canrecover(retaddr uintptr) bool { // reflect.makeFuncStub or reflect.ffi_callback called by FFI // functions. Then we check the caller of that function. - n := callers(3, locs[:]) + n := callers(2, locs[:]) foundFFICallback := false i := 0 for ; i < n; i++ { @@ -822,6 +833,12 @@ var panicking uint32 // so that two concurrent panics don't overlap their output. var paniclk mutex +// startpanic_m implements unrecoverable panic. +// +// It can have write barriers because the write barrier explicitly +// ignores writes once dying > 0. +// +//go:yeswritebarrierrec func startpanic() { _g_ := getg() // Uncomment when mheap_ is in Go. @@ -860,7 +877,7 @@ func startpanic() { exit(4) fallthrough default: - // Can't even print! Just exit. + // Can't even print! Just exit. exit(5) } } diff --git a/libgo/go/runtime/pprof/pprof.go b/libgo/go/runtime/pprof/pprof.go index a57b69d..8a562e2 100644 --- a/libgo/go/runtime/pprof/pprof.go +++ b/libgo/go/runtime/pprof/pprof.go @@ -18,7 +18,7 @@ // To add equivalent profiling support to a standalone program, add // code like the following to your main function: // -// var cpuprofile = flag.String("cpuprofile", "", "write cpu profile `file`") +// var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to `file`") // var memprofile = flag.String("memprofile", "", "write memory profile to `file`") // // func main() { @@ -319,7 +319,15 @@ func (p *Profile) WriteTo(w io.Writer, debug int) error { p.mu.Unlock() // Map order is non-deterministic; make output deterministic. 
- sort.Sort(stackProfile(all)) + sort.Slice(all, func(i, j int) bool { + t, u := all[i], all[j] + for k := 0; k < len(t) && k < len(u); k++ { + if t[k] != u[k] { + return t[k] < u[k] + } + } + return len(t) < len(u) + }) return printCountProfile(w, debug, p.name, stackProfile(all)) } @@ -328,16 +336,6 @@ type stackProfile [][]uintptr func (x stackProfile) Len() int { return len(x) } func (x stackProfile) Stack(i int) []uintptr { return x[i] } -func (x stackProfile) Swap(i, j int) { x[i], x[j] = x[j], x[i] } -func (x stackProfile) Less(i, j int) bool { - t, u := x[i], x[j] - for k := 0; k < len(t) && k < len(u); k++ { - if t[k] != u[k] { - return t[k] < u[k] - } - } - return len(t) < len(u) -} // A countProfile is a set of stack traces to be printed as counts // grouped by stack trace. There are multiple implementations: @@ -348,6 +346,41 @@ type countProfile interface { Stack(i int) []uintptr } +// printCountCycleProfile outputs block profile records (for block or mutex profiles) +// as the pprof-proto format output. Translations from cycle count to time duration +// are done because The proto expects count and time (nanoseconds) instead of count +// and the number of cycles for block, contention profiles. +func printCountCycleProfile(w io.Writer, countName, cycleName string, records []runtime.BlockProfileRecord) error { + // Output profile in protobuf form. + b := newProfileBuilder(w) + b.pbValueType(tagProfile_PeriodType, countName, "count") + b.pb.int64Opt(tagProfile_Period, 1) + b.pbValueType(tagProfile_SampleType, countName, "count") + b.pbValueType(tagProfile_SampleType, cycleName, "nanoseconds") + + cpuGHz := float64(runtime_cyclesPerSecond()) / 1e9 + + values := []int64{0, 0} + var locs []uint64 + for _, r := range records { + values[0] = int64(r.Count) + values[1] = int64(float64(r.Cycles) / cpuGHz) // to nanoseconds + locs = locs[:0] + for _, addr := range r.Stack() { + // For count profiles, all stack addresses are + // return PCs, which is what locForPC expects. + l := b.locForPC(addr) + if l == 0 { // runtime.goexit + continue + } + locs = append(locs, l) + } + b.pbSample(values, locs, nil) + } + b.build() + return nil +} + // printCountProfile prints a countProfile at the specified debug level. // The profile will be in compressed proto format unless debug is nonzero. func printCountProfile(w io.Writer, debug int, name string, p countProfile) error { @@ -441,7 +474,7 @@ func printStackRecord(w io.Writer, stk []uintptr, allFrames bool) { // Hide runtime.goexit and any runtime functions at the beginning. // This is useful mainly for allocation traces. - skip := name == "runtime.goexit" + skip := name == "runtime.goexit" || name == "runtime.kickoff" if !show { switch { case strings.HasPrefix(name, "runtime."): @@ -490,6 +523,14 @@ func countHeap() int { // writeHeap writes the current runtime heap profile to w. func writeHeap(w io.Writer, debug int) error { + var memStats *runtime.MemStats + if debug != 0 { + // Read mem stats first, so that our other allocations + // do not appear in the statistics. + memStats = new(runtime.MemStats) + runtime.ReadMemStats(memStats) + } + // Find out how many records there are (MemProfile(nil, true)), // allocate that many records, and get the data. // There's a race—more records might be added between @@ -552,8 +593,7 @@ func writeHeap(w io.Writer, debug int) error { // Print memstats information too. 
// Pprof will ignore, but useful for people - s := new(runtime.MemStats) - runtime.ReadMemStats(s) + s := memStats fmt.Fprintf(w, "\n# runtime.MemStats\n") fmt.Fprintf(w, "# Alloc = %d\n", s.Alloc) fmt.Fprintf(w, "# TotalAlloc = %d\n", s.TotalAlloc) @@ -779,14 +819,14 @@ func writeBlock(w io.Writer, debug int) error { sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles }) - b := bufio.NewWriter(w) - var tw *tabwriter.Writer - w = b - if debug > 0 { - tw = tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) - w = tw + if debug <= 0 { + return printCountCycleProfile(w, "contentions", "delay", p) } + b := bufio.NewWriter(w) + tw := tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) + w = tw + fmt.Fprintf(w, "--- contention:\n") fmt.Fprintf(w, "cycles/second=%v\n", runtime_cyclesPerSecond()) for i := range p { @@ -823,14 +863,14 @@ func writeMutex(w io.Writer, debug int) error { sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles }) - b := bufio.NewWriter(w) - var tw *tabwriter.Writer - w = b - if debug > 0 { - tw = tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) - w = tw + if debug <= 0 { + return printCountCycleProfile(w, "contentions", "delay", p) } + b := bufio.NewWriter(w) + tw := tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) + w = tw + fmt.Fprintf(w, "--- mutex:\n") fmt.Fprintf(w, "cycles/second=%v\n", runtime_cyclesPerSecond()) fmt.Fprintf(w, "sampling period=%d\n", runtime.SetMutexProfileFraction(-1)) diff --git a/libgo/go/runtime/pprof/pprof_test.go b/libgo/go/runtime/pprof/pprof_test.go index 9e5e403..08a4f96 100644 --- a/libgo/go/runtime/pprof/pprof_test.go +++ b/libgo/go/runtime/pprof/pprof_test.go @@ -26,16 +26,18 @@ import ( "time" ) -func cpuHogger(f func() int, dur time.Duration) { +func cpuHogger(f func(x int) int, y *int, dur time.Duration) { // We only need to get one 100 Hz clock tick, so we've got // a large safety buffer. // But do at least 500 iterations (which should take about 100ms), // otherwise TestCPUProfileMultithreaded can fail if only one // thread is scheduled during the testing period. t0 := time.Now() + accum := *y for i := 0; i < 500 || time.Since(t0) < dur; i++ { - f() + accum = f(accum) } + *y = accum } var ( @@ -46,8 +48,8 @@ var ( // The actual CPU hogging function. // Must not call other functions nor access heap/globals in the loop, // otherwise under race detector the samples will be in the race runtime. 
-func cpuHog1() int { - foo := salt1 +func cpuHog1(x int) int { + foo := x for i := 0; i < 1e5; i++ { if foo > 0 { foo *= foo @@ -58,8 +60,8 @@ func cpuHog1() int { return foo } -func cpuHog2() int { - foo := salt2 +func cpuHog2(x int) int { + foo := x for i := 0; i < 1e5; i++ { if foo > 0 { foo *= foo @@ -72,7 +74,7 @@ func cpuHog2() int { func TestCPUProfile(t *testing.T) { testCPUProfile(t, []string{"pprof.cpuHog1"}, func(dur time.Duration) { - cpuHogger(cpuHog1, dur) + cpuHogger(cpuHog1, &salt1, dur) }) } @@ -81,29 +83,29 @@ func TestCPUProfileMultithreaded(t *testing.T) { testCPUProfile(t, []string{"pprof.cpuHog1", "pprof.cpuHog2"}, func(dur time.Duration) { c := make(chan int) go func() { - cpuHogger(cpuHog1, dur) + cpuHogger(cpuHog1, &salt1, dur) c <- 1 }() - cpuHogger(cpuHog2, dur) + cpuHogger(cpuHog2, &salt2, dur) <-c }) } func TestCPUProfileInlining(t *testing.T) { testCPUProfile(t, []string{"pprof.inlinedCallee", "pprof.inlinedCaller"}, func(dur time.Duration) { - cpuHogger(inlinedCaller, dur) + cpuHogger(inlinedCaller, &salt1, dur) }) } -func inlinedCaller() int { - inlinedCallee() - return 0 +func inlinedCaller(x int) int { + x = inlinedCallee(x) + return x } -func inlinedCallee() { +func inlinedCallee(x int) int { // We could just use cpuHog1, but for loops prevent inlining // right now. :( - foo := salt1 + foo := x i := 0 loop: if foo > 0 { @@ -114,7 +116,7 @@ loop: if i++; i < 1e5 { goto loop } - salt1 = foo + return foo } func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) { @@ -177,9 +179,9 @@ func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) { } } - if badOS[runtime.GOOS] { + switch runtime.GOOS { + case "darwin", "dragonfly", "netbsd", "solaris": t.Skipf("ignoring failure on %s; see golang.org/issue/13841", runtime.GOOS) - return } // Ignore the failure if the tests are running in a QEMU-based emulator, // QEMU is not perfect at emulating everything. @@ -187,7 +189,6 @@ func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) { // IN_QEMU=1 indicates that the tests are running in QEMU. See issue 9605. if os.Getenv("IN_QEMU") == "1" { t.Skip("ignore the failure in QEMU; see golang.org/issue/9605") - return } t.FailNow() } @@ -394,60 +395,108 @@ func TestMathBigDivide(t *testing.T) { }) } -// Operating systems that are expected to fail the tests. See issue 13841. 
-var badOS = map[string]bool{ - "darwin": true, - "netbsd": true, - "plan9": true, - "dragonfly": true, - "solaris": true, -} - func TestBlockProfile(t *testing.T) { t.Skip("lots of details are different for gccgo; FIXME") type TestCase struct { name string f func() + stk []string re string } tests := [...]TestCase{ - {"chan recv", blockChanRecv, ` + { + name: "chan recv", + f: blockChanRecv, + stk: []string{ + "runtime.chanrecv1", + "runtime/pprof.blockChanRecv", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.chanrecv1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockChanRecv\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"chan send", blockChanSend, ` + { + name: "chan send", + f: blockChanSend, + stk: []string{ + "runtime.chansend1", + "runtime/pprof.blockChanSend", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.chansend1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockChanSend\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"chan close", blockChanClose, ` + { + name: "chan close", + f: blockChanClose, + stk: []string{ + "runtime.chanrecv1", + "runtime/pprof.blockChanClose", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.chanrecv1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockChanClose\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"select recv async", blockSelectRecvAsync, ` + { + name: "select recv async", + f: blockSelectRecvAsync, + stk: []string{ + "runtime.selectgo", + "runtime/pprof.blockSelectRecvAsync", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.selectgo\+0x[0-9a-f]+ .*/src/runtime/select.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockSelectRecvAsync\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"select send sync", blockSelectSendSync, ` + { + name: "select send sync", + f: blockSelectSendSync, + stk: []string{ + "runtime.selectgo", + "runtime/pprof.blockSelectSendSync", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.selectgo\+0x[0-9a-f]+ .*/src/runtime/select.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockSelectSendSync\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"mutex", blockMutex, ` + { + name: "mutex", + f: blockMutex, + stk: []string{ + "sync.(*Mutex).Lock", + "runtime/pprof.blockMutex", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ sync\.\(\*Mutex\)\.Lock\+0x[0-9a-f]+ .*/src/sync/mutex\.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockMutex\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"cond", blockCond, ` + { + name: "cond", + f: blockCond, + stk: []string{ + 
"sync.(*Cond).Wait", + "runtime/pprof.blockCond", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ sync\.\(\*Cond\)\.Wait\+0x[0-9a-f]+ .*/src/sync/cond\.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockCond\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ @@ -455,28 +504,84 @@ func TestBlockProfile(t *testing.T) { `}, } + // Generate block profile runtime.SetBlockProfileRate(1) defer runtime.SetBlockProfileRate(0) for _, test := range tests { test.f() } - var w bytes.Buffer - Lookup("block").WriteTo(&w, 1) - prof := w.String() - if !strings.HasPrefix(prof, "--- contention:\ncycles/second=") { - t.Fatalf("Bad profile header:\n%v", prof) - } + t.Run("debug=1", func(t *testing.T) { + var w bytes.Buffer + Lookup("block").WriteTo(&w, 1) + prof := w.String() - if strings.HasSuffix(prof, "#\t0x0\n\n") { - t.Errorf("Useless 0 suffix:\n%v", prof) + if !strings.HasPrefix(prof, "--- contention:\ncycles/second=") { + t.Fatalf("Bad profile header:\n%v", prof) + } + + if strings.HasSuffix(prof, "#\t0x0\n\n") { + t.Errorf("Useless 0 suffix:\n%v", prof) + } + + for _, test := range tests { + if !regexp.MustCompile(strings.Replace(test.re, "\t", "\t+", -1)).MatchString(prof) { + t.Errorf("Bad %v entry, expect:\n%v\ngot:\n%v", test.name, test.re, prof) + } + } + }) + + t.Run("proto", func(t *testing.T) { + // proto format + var w bytes.Buffer + Lookup("block").WriteTo(&w, 0) + p, err := profile.Parse(&w) + if err != nil { + t.Fatalf("failed to parse profile: %v", err) + } + t.Logf("parsed proto: %s", p) + if err := p.CheckValid(); err != nil { + t.Fatalf("invalid profile: %v", err) + } + + stks := stacks(p) + for _, test := range tests { + if !containsStack(stks, test.stk) { + t.Errorf("No matching stack entry for %v, want %+v", test.name, test.stk) + } + } + }) + +} + +func stacks(p *profile.Profile) (res [][]string) { + for _, s := range p.Sample { + var stk []string + for _, l := range s.Location { + for _, line := range l.Line { + stk = append(stk, line.Function.Name) + } + } + res = append(res, stk) } + return res +} - for _, test := range tests { - if !regexp.MustCompile(strings.Replace(test.re, "\t", "\t+", -1)).MatchString(prof) { - t.Fatalf("Bad %v entry, expect:\n%v\ngot:\n%v", test.name, test.re, prof) +func containsStack(got [][]string, want []string) bool { + for _, stk := range got { + if len(stk) < len(want) { + continue + } + for i, f := range want { + if f != stk[i] { + break + } + if i == len(want)-1 { + return true + } } } + return false } const blockDelay = 10 * time.Millisecond @@ -568,6 +673,8 @@ func blockCond() { } func TestMutexProfile(t *testing.T) { + // Generate mutex profile + old := runtime.SetMutexProfileFraction(1) defer runtime.SetMutexProfileFraction(old) if old != 0 { @@ -576,39 +683,60 @@ func TestMutexProfile(t *testing.T) { blockMutex() - var w bytes.Buffer - Lookup("mutex").WriteTo(&w, 1) - prof := w.String() + t.Run("debug=1", func(t *testing.T) { + var w bytes.Buffer + Lookup("mutex").WriteTo(&w, 1) + prof := w.String() + t.Logf("received profile: %v", prof) - if !strings.HasPrefix(prof, "--- mutex:\ncycles/second=") { - t.Errorf("Bad profile header:\n%v", prof) - } - prof = strings.Trim(prof, "\n") - lines := strings.Split(prof, "\n") - // gccgo adds an extra line in the stack trace, not sure why. 
- if len(lines) < 6 { - t.Errorf("expected 6 lines, got %d %q\n%s", len(lines), prof, prof) - } - if len(lines) < 6 { - return - } - // checking that the line is like "35258904 1 @ 0x48288d 0x47cd28 0x458931" - r2 := `^\d+ 1 @(?: 0x[[:xdigit:]]+)+` - //r2 := "^[0-9]+ 1 @ 0x[0-9a-f x]+$" - if ok, err := regexp.MatchString(r2, lines[3]); err != nil || !ok { - t.Errorf("%q didn't match %q", lines[3], r2) - } - r3 := "^#.*pprof.\\$nested.*$" - match := false - for _, i := range []int{5, 6} { - if ok, _ := regexp.MatchString(r3, lines[i]); ok { - match = true - break + if !strings.HasPrefix(prof, "--- mutex:\ncycles/second=") { + t.Errorf("Bad profile header:\n%v", prof) } - } - if !match { - t.Errorf("neither %q nor %q matched %q", lines[5], lines[6], r3) - } + prof = strings.Trim(prof, "\n") + lines := strings.Split(prof, "\n") + if len(lines) != 6 { + t.Errorf("expected 6 lines, got %d %q\n%s", len(lines), prof, prof) + } + if len(lines) < 6 { + return + } + // checking that the line is like "35258904 1 @ 0x48288d 0x47cd28 0x458931" + r2 := `^\d+ 1 @(?: 0x[[:xdigit:]]+)+` + //r2 := "^[0-9]+ 1 @ 0x[0-9a-f x]+$" + if ok, err := regexp.MatchString(r2, lines[3]); err != nil || !ok { + t.Errorf("%q didn't match %q", lines[3], r2) + } + if runtime.Compiler != "gccgo" { + r3 := "^#.*pprof.blockMutex.*$" + if ok, err := regexp.MatchString(r3, lines[5]); err != nil || !ok { + t.Errorf("%q didn't match %q", lines[5], r3) + } + } + t.Logf(prof) + }) + t.Run("proto", func(t *testing.T) { + // proto format + var w bytes.Buffer + Lookup("mutex").WriteTo(&w, 0) + p, err := profile.Parse(&w) + if err != nil { + t.Fatalf("failed to parse profile: %v", err) + } + t.Logf("parsed proto: %s", p) + if err := p.CheckValid(); err != nil { + t.Fatalf("invalid profile: %v", err) + } + + stks := stacks(p) + for _, want := range [][]string{ + // {"sync.(*Mutex).Unlock", "pprof.blockMutex.func1"}, + {"sync.Unlock.pN10_sync.Mutex", "pprof.$nested17"}, + } { + if !containsStack(stks, want) { + t.Errorf("No matching stack entry for %+v", want) + } + } + }) } func func1(c chan int) { <-c } @@ -725,7 +853,7 @@ func TestEmptyCallStack(t *testing.T) { func TestCPUProfileLabel(t *testing.T) { testCPUProfile(t, []string{"pprof.cpuHogger;key=value"}, func(dur time.Duration) { Do(context.Background(), Labels("key", "value"), func(context.Context) { - cpuHogger(cpuHog1, dur) + cpuHogger(cpuHog1, &salt1, dur) }) }) } @@ -738,14 +866,15 @@ func TestLabelRace(t *testing.T) { start := time.Now() var wg sync.WaitGroup for time.Since(start) < dur { + var salts [10]int for i := 0; i < 10; i++ { wg.Add(1) - go func() { + go func(j int) { Do(context.Background(), Labels("key", "value"), func(context.Context) { - cpuHogger(cpuHog1, time.Millisecond) + cpuHogger(cpuHog1, &salts[j], time.Millisecond) }) wg.Done() - }() + }(i) } wg.Wait() } diff --git a/libgo/go/runtime/pprof/proto.go b/libgo/go/runtime/pprof/proto.go index 5e1d71c..793be44 100644 --- a/libgo/go/runtime/pprof/proto.go +++ b/libgo/go/runtime/pprof/proto.go @@ -202,7 +202,7 @@ func (b *profileBuilder) locForPC(addr uintptr) uint64 { // the stack and we have return PCs anyway. frames := runtime.CallersFrames([]uintptr{addr}) frame, more := frames.Next() - if frame.Function == "runtime.goexit" { + if frame.Function == "runtime.goexit" || frame.Function == "runtime.kickoff" { // Short-circuit if we see runtime.goexit so the loop // below doesn't allocate a useless empty location. 
return 0 @@ -228,7 +228,7 @@ func (b *profileBuilder) locForPC(addr uintptr) uint64 { start := b.pb.startMessage() b.pb.uint64Opt(tagLocation_ID, id) b.pb.uint64Opt(tagLocation_Address, uint64(frame.PC)) - for frame.Function != "runtime.goexit" { + for frame.Function != "runtime.goexit" && frame.Function != "runtime.kickoff" { // Write out each line in frame expansion. funcID := uint64(b.funcs[frame.Function]) if funcID == 0 { diff --git a/libgo/go/runtime/print.go b/libgo/go/runtime/print.go index 4db726a..3da05ad 100644 --- a/libgo/go/runtime/print.go +++ b/libgo/go/runtime/print.go @@ -78,7 +78,7 @@ var debuglock mutex // The compiler emits calls to printlock and printunlock around // the multiple calls that implement a single Go print or println -// statement. Some of the print helpers (printsp, for example) +// statement. Some of the print helpers (printslice, for example) // call print recursively. There is also the problem of a crash // happening during the print routines and needing to acquire // the print lock to print information about the crash. @@ -120,31 +120,31 @@ func gwrite(b []byte) { } func printsp() { - print(" ") + printstring(" ") } func printnl() { - print("\n") + printstring("\n") } func printbool(v bool) { if v { - print("true") + printstring("true") } else { - print("false") + printstring("false") } } func printfloat(v float64) { switch { case v != v: - print("NaN") + printstring("NaN") return case v+v == v && v > 0: - print("+Inf") + printstring("+Inf") return case v+v == v && v < 0: - print("-Inf") + printstring("-Inf") return } @@ -226,7 +226,7 @@ func printuint(v uint64) { func printint(v int64) { if v < 0 { - print("-") + printstring("-") v = -v } printuint(uint64(v)) diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go index 345f57b..1ea4152 100644 --- a/libgo/go/runtime/proc.go +++ b/libgo/go/runtime/proc.go @@ -34,6 +34,7 @@ import ( //go:linkname helpgc runtime.helpgc //go:linkname kickoff runtime.kickoff //go:linkname mstart1 runtime.mstart1 +//go:linkname mexit runtime.mexit //go:linkname globrunqput runtime.globrunqput //go:linkname pidleget runtime.pidleget @@ -54,6 +55,7 @@ func getTraceback(me, gp *g) func gtraceback(*g) func _cgo_notify_runtime_init_done() func alreadyInCallers() bool +func stackfree(*g) // Functions created by the compiler. //extern __go_init_main @@ -138,6 +140,9 @@ var ( // it is closed, meaning cgocallbackg can reliably receive from it. var main_init_done chan bool +// mainStarted indicates that the main M has started. +var mainStarted bool + // runtimeInitTime is the nanotime() at which the runtime started. var runtimeInitTime int64 @@ -157,8 +162,8 @@ func main() { maxstacksize = 250000000 } - // Record when the world started. - runtimeInitTime = nanotime() + // Allow newproc to start new Ms. + mainStarted = true systemstack(func() { newm(sysmon, nil) @@ -184,8 +189,15 @@ func main() { } }() + // Record when the world started. Must be after runtime_init + // because nanotime on some platforms depends on startNano. + runtimeInitTime = nanotime() + main_init_done = make(chan bool) if iscgo { + // Start the template thread in case we enter Go from + // a C-created thread and need to create a new thread. + startTemplateThread() _cgo_notify_runtime_init_done() } @@ -269,9 +281,10 @@ func forcegchelper() { } } +//go:nosplit + // Gosched yields the processor, allowing other goroutines to run. It does not // suspend the current goroutine, so execution resumes automatically. 
-//go:nosplit func Gosched() { mcall(gosched_m) } @@ -359,8 +372,8 @@ func releaseSudog(s *sudog) { if s.elem != nil { throw("runtime: sudog with non-nil elem") } - if s.selectdone != nil { - throw("runtime: sudog with non-nil selectdone") + if s.isSelect { + throw("runtime: sudog with non-false isSelect") } if s.next != nil { throw("runtime: sudog with non-nil next") @@ -419,7 +432,7 @@ func funcPC(f interface{}) uintptr { func lockedOSThread() bool { gp := getg() - return gp.lockedm != nil && gp.m.lockedg != nil + return gp.lockedm != 0 && gp.m.lockedg != 0 } var ( @@ -479,13 +492,21 @@ func schedinit() { if n, ok := atoi32(gogetenv("GOMAXPROCS")); ok && n > 0 { procs = n } - if procs > _MaxGomaxprocs { - procs = _MaxGomaxprocs - } if procresize(procs) != nil { throw("unknown runnable goroutine during bootstrap") } + // For cgocheck > 1, we turn on the write barrier at all times + // and check all pointer writes. We can't do this until after + // procresize because the write barrier needs a P. + if debug.cgocheck > 1 { + writeBarrier.cgo = true + writeBarrier.enabled = true + for _, p := range allp { + p.wbBuf.reset() + } + } + if buildVersion == "" { // Condition should never trigger. This code just serves // to ensure runtime·buildVersion is kept in the resulting binary. @@ -501,7 +522,7 @@ func dumpgstatus(gp *g) { func checkmcount() { // sched lock is held - if sched.mcount > sched.maxmcount { + if mcount() > sched.maxmcount { print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n") throw("thread exhaustion") } @@ -515,15 +536,20 @@ func mcommoninit(mp *m) { callers(1, mp.createstack[:]) } - mp.fastrand = 0x49f6428a + uint32(mp.id) + uint32(cputicks()) - if mp.fastrand == 0 { - mp.fastrand = 0x49f6428a - } - lock(&sched.lock) - mp.id = sched.mcount - sched.mcount++ + if sched.mnext+1 < sched.mnext { + throw("runtime: thread ID overflow") + } + mp.id = sched.mnext + sched.mnext++ checkmcount() + + mp.fastrand[0] = 1597334677 * uint32(mp.id) + mp.fastrand[1] = uint32(cputicks()) + if mp.fastrand[0]|mp.fastrand[1] == 0 { + mp.fastrand[1] = 1 + } + mpreinit(mp) // Add to allm so garbage collector doesn't free g->m @@ -735,8 +761,10 @@ func casgstatus(gp *g, oldval, newval uint32) { // _Grunning or _Grunning|_Gscan; either way, // we own gp.gcscanvalid, so it's safe to read. // gp.gcscanvalid must not be true when we are running. - print("runtime: casgstatus ", hex(oldval), "->", hex(newval), " gp.status=", hex(gp.atomicstatus), " gp.gcscanvalid=true\n") - throw("casgstatus") + systemstack(func() { + print("runtime: casgstatus ", hex(oldval), "->", hex(newval), " gp.status=", hex(gp.atomicstatus), " gp.gcscanvalid=true\n") + throw("casgstatus") + }) } // See http://golang.org/cl/21503 for justification of the yield delay. @@ -912,7 +940,7 @@ func stopTheWorld(reason string) { // startTheWorld undoes the effects of stopTheWorld. func startTheWorld() { - systemstack(startTheWorldWithSema) + systemstack(func() { startTheWorldWithSema(false) }) // worldsema must be held over startTheWorldWithSema to ensure // gomaxprocs cannot change while worldsema is held. semrelease(&worldsema) @@ -962,8 +990,7 @@ func stopTheWorldWithSema() { _g_.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic. 
sched.stopwait-- // try to retake all P's in Psyscall status - for i := 0; i < int(gomaxprocs); i++ { - p := allp[i] + for _, p := range allp { s := p.status if s == _Psyscall && atomic.Cas(&p.status, s, _Pgcstop) { if trace.enabled { @@ -1003,8 +1030,7 @@ func stopTheWorldWithSema() { if sched.stopwait != 0 { bad = "stopTheWorld: not stopped (stopwait != 0)" } else { - for i := 0; i < int(gomaxprocs); i++ { - p := allp[i] + for _, p := range allp { if p.status != _Pgcstop { bad = "stopTheWorld: not stopped (status != _Pgcstop)" } @@ -1028,12 +1054,14 @@ func mhelpgc() { _g_.m.helpgc = -1 } -func startTheWorldWithSema() { +func startTheWorldWithSema(emitTraceEvent bool) int64 { _g_ := getg() - _g_.m.locks++ // disable preemption because it can be holding p in a local var - gp := netpoll(false) // non-blocking - injectglist(gp) + _g_.m.locks++ // disable preemption because it can be holding p in a local var + if netpollinited() { + gp := netpoll(false) // non-blocking + injectglist(gp) + } add := needaddgcproc() lock(&sched.lock) @@ -1068,6 +1096,12 @@ func startTheWorldWithSema() { } } + // Capture start-the-world time before doing clean-up tasks. + startTime := nanotime() + if emitTraceEvent { + traceGCSTWDone() + } + // Wakeup an additional proc in case we have excessive runnable goroutines // in local queues or in the global queue. If we don't, the proc will park itself. // If we have lots of excessive work, resetspinning will unpark additional procs as necessary. @@ -1086,6 +1120,8 @@ func startTheWorldWithSema() { newm(mhelpgc, nil) } _g_.m.locks-- + + return startTime } // First function run by a new goroutine. @@ -1116,15 +1152,13 @@ func kickoff() { throw("no p in kickoff") } } - gp.param = nil fv(param) goexit1() } -// This is called from mstart. -func mstart1() { +func mstart1(dummy int32) { _g_ := getg() if _g_ != _g_.m.g0 { @@ -1137,12 +1171,7 @@ func mstart1() { // prepare the thread to be able to handle the signals. // For gccgo minit was called by C code. if _g_.m == &m0 { - // Create an extra M for callbacks on threads not created by Go. - if iscgo && !cgoHasExtraM { - cgoHasExtraM = true - newextram() - } - initsig(false) + mstartm0() } if fn := _g_.m.mstartfn; fn != nil { @@ -1159,6 +1188,114 @@ func mstart1() { schedule() } +// mstartm0 implements part of mstart1 that only runs on the m0. +// +// Write barriers are allowed here because we know the GC can't be +// running yet, so they'll be no-ops. +// +//go:yeswritebarrierrec +func mstartm0() { + // Create an extra M for callbacks on threads not created by Go. + if iscgo && !cgoHasExtraM { + cgoHasExtraM = true + newextram() + } + initsig(false) +} + +// mexit tears down and exits the current thread. +// +// Don't call this directly to exit the thread, since it must run at +// the top of the thread stack. Instead, use gogo(&_g_.m.g0.sched) to +// unwind the stack to the point that exits the thread. +// +// It is entered with m.p != nil, so write barriers are allowed. It +// will release the P before exiting. +// +//go:yeswritebarrierrec +func mexit(osStack bool) { + g := getg() + m := g.m + + if m == &m0 { + // This is the main thread. Just wedge it. + // + // On Linux, exiting the main thread puts the process + // into a non-waitable zombie state. On Plan 9, + // exiting the main thread unblocks wait even though + // other threads are still running. On Solaris we can + // neither exitThread nor return from mstart. Other + // bad things probably happen on other platforms. 
+ // + // We could try to clean up this M more before wedging + // it, but that complicates signal handling. + handoffp(releasep()) + lock(&sched.lock) + sched.nmfreed++ + checkdead() + unlock(&sched.lock) + notesleep(&m.park) + throw("locked m0 woke up") + } + + sigblock() + unminit() + + // Free the gsignal stack. + if m.gsignal != nil { + stackfree(m.gsignal) + } + + // Remove m from allm. + lock(&sched.lock) + for pprev := &allm; *pprev != nil; pprev = &(*pprev).alllink { + if *pprev == m { + *pprev = m.alllink + goto found + } + } + throw("m not found in allm") +found: + if !osStack { + // Delay reaping m until it's done with the stack. + // + // If this is using an OS stack, the OS will free it + // so there's no need for reaping. + atomic.Store(&m.freeWait, 1) + // Put m on the free list, though it will not be reaped until + // freeWait is 0. Note that the free list must not be linked + // through alllink because some functions walk allm without + // locking, so may be using alllink. + m.freelink = sched.freem + sched.freem = m + } + unlock(&sched.lock) + + // Release the P. + handoffp(releasep()) + // After this point we must not have write barriers. + + // Invoke the deadlock detector. This must happen after + // handoffp because it may have started a new M to take our + // P's work. + lock(&sched.lock) + sched.nmfreed++ + checkdead() + unlock(&sched.lock) + + if osStack { + // Return from mstart and let the system thread + // library free the g0 stack and terminate the thread. + return + } + + // mstart is the thread's entry point, so there's nothing to + // return to. Exit the thread directly. exitThread will clear + // m.freeWait when it's done with the stack and the m can be + // reaped. + exitThread(&m.freeWait) +} + // forEachP calls fn(p) for every P p when p reaches a GC safe point. // If a P is currently executing code, this will bring the P to a GC // safe point and execute fn on that P. If the P is not executing code @@ -1182,7 +1319,7 @@ func forEachP(fn func(*p)) { sched.safePointFn = fn // Ask all Ps to run the safe point function. - for _, p := range allp[:gomaxprocs] { + for _, p := range allp { if p != _p_ { atomic.Store(&p.runSafePointFn, 1) } @@ -1210,8 +1347,7 @@ func forEachP(fn func(*p)) { // Force Ps currently in _Psyscall into _Pidle and hand them // off to induce safe point function execution. - for i := 0; i < int(gomaxprocs); i++ { - p := allp[i] + for _, p := range allp { s := p.status if s == _Psyscall && p.runSafePointFn == 1 && atomic.Cas(&p.status, s, _Pidle) { if trace.enabled { @@ -1240,8 +1376,7 @@ func forEachP(fn func(*p)) { if sched.safePointWait != 0 { throw("forEachP: not done") } - for i := 0; i < int(gomaxprocs); i++ { - p := allp[i] + for _, p := range allp { if p.runSafePointFn != 0 { throw("forEachP: P did not run fn") } @@ -1295,6 +1430,27 @@ func allocm(_p_ *p, fn func(), allocatestack bool) (mp *m, g0Stack unsafe.Pointe if _g_.m.p == 0 { acquirep(_p_) // temporarily borrow p for mallocs in this function } + + // Release the free M list. We need to do this somewhere and + // this may free up a stack we can use. 
+ if sched.freem != nil { + lock(&sched.lock) + var newList *m + for freem := sched.freem; freem != nil; { + if freem.freeWait != 0 { + next := freem.freelink + freem.freelink = newList + newList = freem + freem = next + continue + } + stackfree(freem.g0) + freem = freem.freelink + } + sched.freem = newList + unlock(&sched.lock) + } + mp = new(m) mp.mstartfn = fn mcommoninit(mp) @@ -1431,9 +1587,9 @@ func oneNewExtraM() { casgstatus(gp, _Gidle, _Gdead) gp.m = mp mp.curg = gp - mp.locked = _LockInternal - mp.lockedg = gp - gp.lockedm = mp + mp.lockedInt++ + mp.lockedg.set(gp) + gp.lockedm.set(mp) gp.goid = int64(atomic.Xadd64(&sched.goidgen, 1)) // put on allg for garbage collector allgadd(gp) @@ -1574,6 +1730,27 @@ func unlockextra(mp *m) { // around exec'ing while creating/destroying threads. See issue #19546. var execLock rwmutex +// newmHandoff contains a list of m structures that need new OS threads. +// This is used by newm in situations where newm itself can't safely +// start an OS thread. +var newmHandoff struct { + lock mutex + + // newm points to a list of M structures that need new OS + // threads. The list is linked through m.schedlink. + newm muintptr + + // waiting indicates that wake needs to be notified when an m + // is put on the list. + waiting bool + wake note + + // haveTemplateThread indicates that the templateThread has + // been started. This is not protected by lock. Use cas to set + // to 1. + haveTemplateThread uint32 +} + // Create a new m. It will start off with a call to fn, or else the scheduler. // fn needs to be static and not a heap allocated closure. // May run with m.p==nil, so write barriers are not allowed. @@ -1582,11 +1759,90 @@ func newm(fn func(), _p_ *p) { mp, _, _ := allocm(_p_, fn, false) mp.nextp.set(_p_) mp.sigmask = initSigmask + if gp := getg(); gp != nil && gp.m != nil && (gp.m.lockedExt != 0 || gp.m.incgo) && GOOS != "plan9" { + // We're on a locked M or a thread that may have been + // started by C. The kernel state of this thread may + // be strange (the user may have locked it for that + // purpose). We don't want to clone that into another + // thread. Instead, ask a known-good thread to create + // the thread for us. + // + // This is disabled on Plan 9. See golang.org/issue/22227. + // + // TODO: This may be unnecessary on Windows, which + // doesn't model thread creation off fork. + lock(&newmHandoff.lock) + if newmHandoff.haveTemplateThread == 0 { + throw("on a locked thread with no template thread") + } + mp.schedlink = newmHandoff.newm + newmHandoff.newm.set(mp) + if newmHandoff.waiting { + newmHandoff.waiting = false + notewakeup(&newmHandoff.wake) + } + unlock(&newmHandoff.lock) + return + } + newm1(mp) +} + +func newm1(mp *m) { execLock.rlock() // Prevent process clone. newosproc(mp) execLock.runlock() } +// startTemplateThread starts the template thread if it is not already +// running. +// +// The calling thread must itself be in a known-good state. +func startTemplateThread() { + if !atomic.Cas(&newmHandoff.haveTemplateThread, 0, 1) { + return + } + newm(templateThread, nil) +} + +// tmeplateThread is a thread in a known-good state that exists solely +// to start new threads in known-good states when the calling thread +// may not be a a good state. +// +// Many programs never need this, so templateThread is started lazily +// when we first enter a state that might lead to running on a thread +// in an unknown state. +// +// templateThread runs on an M without a P, so it must not have write +// barriers. 
+// +//go:nowritebarrierrec +func templateThread() { + lock(&sched.lock) + sched.nmsys++ + checkdead() + unlock(&sched.lock) + + for { + lock(&newmHandoff.lock) + for newmHandoff.newm != 0 { + newm := newmHandoff.newm.ptr() + newmHandoff.newm = 0 + unlock(&newmHandoff.lock) + for newm != nil { + next := newm.schedlink.ptr() + newm.schedlink = 0 + newm1(newm) + newm = next + } + lock(&newmHandoff.lock) + } + newmHandoff.waiting = true + noteclear(&newmHandoff.wake) + unlock(&newmHandoff.lock) + notesleep(&newmHandoff.wake) + } +} + // Stops execution of the current m until new work is available. // Returns with acquired P. func stopm() { @@ -1609,7 +1865,9 @@ retry: notesleep(&_g_.m.park) noteclear(&_g_.m.park) if _g_.m.helpgc != 0 { + // helpgc() set _g_.m.p and _g_.m.mcache, so we have a P. gchelper() + // Undo the effects of helpgc(). _g_.m.helpgc = 0 _g_.m.mcache = nil _g_.m.p = 0 @@ -1743,7 +2001,7 @@ func wakep() { func stoplockedm() { _g_ := getg() - if _g_.m.lockedg == nil || _g_.m.lockedg.lockedm != _g_.m { + if _g_.m.lockedg == 0 || _g_.m.lockedg.ptr().lockedm.ptr() != _g_.m { throw("stoplockedm: inconsistent locking") } if _g_.m.p != 0 { @@ -1755,7 +2013,7 @@ func stoplockedm() { // Wait until another thread schedules lockedg again. notesleep(&_g_.m.park) noteclear(&_g_.m.park) - status := readgstatus(_g_.m.lockedg) + status := readgstatus(_g_.m.lockedg.ptr()) if status&^_Gscan != _Grunnable { print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n") dumpgstatus(_g_) @@ -1771,7 +2029,7 @@ func stoplockedm() { func startlockedm(gp *g) { _g_ := getg() - mp := gp.lockedm + mp := gp.lockedm.ptr() if mp == _g_.m { throw("startlockedm: locked to me") } @@ -1896,11 +2154,12 @@ top: // Poll network. // This netpoll is only an optimization before we resort to stealing. - // We can safely skip it if there a thread blocked in netpoll already. - // If there is any kind of logical race with that blocked thread - // (e.g. it has already returned from netpoll, but does not set lastpoll yet), - // this thread will do blocking netpoll below anyway. - if netpollinited() && sched.lastpoll != 0 { + // We can safely skip it if there are no waiters or a thread is blocked + // in netpoll already. If there is any kind of logical race with that + // blocked thread (e.g. it has already returned from netpoll, but does + // not set lastpoll yet), this thread will do blocking netpoll below + // anyway. + if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Load64(&sched.lastpoll) != 0 { if gp := netpoll(false); gp != nil { // non-blocking // netpoll returns list of goroutines linked by schedlink. injectglist(gp.schedlink.ptr()) @@ -1996,9 +2255,8 @@ stop: } // check all runqueues once again - for i := 0; i < int(gomaxprocs); i++ { - _p_ := allp[i] - if _p_ != nil && !runqempty(_p_) { + for _, _p_ := range allp { + if !runqempty(_p_) { lock(&sched.lock) _p_ = pidleget() unlock(&sched.lock) @@ -2137,9 +2395,15 @@ func schedule() { throw("schedule: holding locks") } - if _g_.m.lockedg != nil { + if _g_.m.lockedg != 0 { stoplockedm() - execute(_g_.m.lockedg, false) // Never returns. + execute(_g_.m.lockedg.ptr(), false) // Never returns. + } + + // We should not schedule away from a g that is executing a cgo call, + // since the cgo call is using the m's g0 stack. + if _g_.m.incgo { + throw("schedule: in cgo") } top: @@ -2205,7 +2469,7 @@ top: resetspinning() } - if gp.lockedm != nil { + if gp.lockedm != 0 { // Hands off own p to the locked m, // then blocks waiting for a new p. 
startlockedm(gp) @@ -2322,8 +2586,9 @@ func goexit0(gp *g) { gp.isSystemGoroutine = false } gp.m = nil - gp.lockedm = nil - _g_.m.lockedg = nil + locked := gp.lockedm != 0 + gp.lockedm = 0 + _g_.m.lockedg = 0 gp.entry = nil gp.paniconfault = false gp._defer = nil // should be true already but just in case. @@ -2334,17 +2599,38 @@ func goexit0(gp *g) { gp.labels = nil gp.timer = nil + if gcBlackenEnabled != 0 && gp.gcAssistBytes > 0 { + // Flush assist credit to the global pool. This gives + // better information to pacing if the application is + // rapidly creating an exiting goroutines. + scanCredit := int64(gcController.assistWorkPerByte * float64(gp.gcAssistBytes)) + atomic.Xaddint64(&gcController.bgScanCredit, scanCredit) + gp.gcAssistBytes = 0 + } + // Note that gp's stack scan is now "valid" because it has no // stack. gp.gcscanvalid = true dropg() - if _g_.m.locked&^_LockExternal != 0 { - print("invalid m->locked = ", _g_.m.locked, "\n") + if _g_.m.lockedInt != 0 { + print("invalid m->lockedInt = ", _g_.m.lockedInt, "\n") throw("internal lockOSThread error") } - _g_.m.locked = 0 + _g_.m.lockedExt = 0 gfput(_g_.m.p.ptr(), gp) + if locked { + // The goroutine may have locked this thread because + // it put it in an unusual kernel state. Kill it + // rather than returning it to the thread pool. + + // Return to mstart, which will release the P and exit + // the thread. + if GOOS != "plan9" { // See golang.org/issue/22227. + _g_.m.exiting = true + gogo(_g_.m.g0) + } + } schedule() } @@ -2481,7 +2767,9 @@ func exitsyscall(dummy int32) { oldp := _g_.m.p.ptr() if exitsyscallfast() { if _g_.m.mcache == nil { - throw("lost mcache") + systemstack(func() { + throw("lost mcache") + }) } if trace.enabled { if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick { @@ -2519,7 +2807,9 @@ func exitsyscall(dummy int32) { mcall(exitsyscall0) if _g_.m.mcache == nil { - throw("lost mcache") + systemstack(func() { + throw("lost mcache") + }) } // Scheduler returned, so we're allowed to run now. @@ -2644,7 +2934,7 @@ func exitsyscall0(gp *g) { acquirep(_p_) execute(gp, false) // Never returns. } - if _g_.m.lockedg != nil { + if _g_.m.lockedg != 0 { // Wait until another thread schedules gp and so m again. stoplockedm() execute(gp, false) // Never returns. @@ -2798,7 +3088,7 @@ func newproc(fn uintptr, arg unsafe.Pointer) *g { newg.entry = entry newg.param = arg - newg.gopc = getcallerpc(unsafe.Pointer(&fn)) + newg.gopc = getcallerpc() newg.startpc = fn if _g_.m.curg != nil { newg.labels = _g_.m.curg.labels @@ -2827,7 +3117,7 @@ func newproc(fn uintptr, arg unsafe.Pointer) *g { runqput(_p_, newg, true) - if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 && runtimeInitTime != 0 { + if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 && mainStarted { wakep() } _g_.m.locks-- @@ -2947,23 +3237,41 @@ func Breakpoint() { //go:nosplit func dolockOSThread() { _g_ := getg() - _g_.m.lockedg = _g_ - _g_.lockedm = _g_.m + _g_.m.lockedg.set(_g_) + _g_.lockedm.set(_g_.m) } //go:nosplit // LockOSThread wires the calling goroutine to its current operating system thread. -// Until the calling goroutine exits or calls UnlockOSThread, it will always -// execute in that thread, and no other goroutine can. +// The calling goroutine will always execute in that thread, +// and no other goroutine will execute in it, +// until the calling goroutine has made as many calls to +// UnlockOSThread as to LockOSThread. 
+// If the calling goroutine exits without unlocking the thread, +// the thread will be terminated. +// +// A goroutine should call LockOSThread before calling OS services or +// non-Go library functions that depend on per-thread state. func LockOSThread() { - getg().m.locked |= _LockExternal + if atomic.Load(&newmHandoff.haveTemplateThread) == 0 && GOOS != "plan9" { + // If we need to start a new thread from the locked + // thread, we need the template thread. Start it now + // while we're in a known-good state. + startTemplateThread() + } + _g_ := getg() + _g_.m.lockedExt++ + if _g_.m.lockedExt == 0 { + _g_.m.lockedExt-- + panic("LockOSThread nesting overflow") + } dolockOSThread() } //go:nosplit func lockOSThread() { - getg().m.locked += _LockInternal + getg().m.lockedInt++ dolockOSThread() } @@ -2973,29 +3281,43 @@ func lockOSThread() { //go:nosplit func dounlockOSThread() { _g_ := getg() - if _g_.m.locked != 0 { + if _g_.m.lockedInt != 0 || _g_.m.lockedExt != 0 { return } - _g_.m.lockedg = nil - _g_.lockedm = nil + _g_.m.lockedg = 0 + _g_.lockedm = 0 } //go:nosplit -// UnlockOSThread unwires the calling goroutine from its fixed operating system thread. -// If the calling goroutine has not called LockOSThread, UnlockOSThread is a no-op. +// UnlockOSThread undoes an earlier call to LockOSThread. +// If this drops the number of active LockOSThread calls on the +// calling goroutine to zero, it unwires the calling goroutine from +// its fixed operating system thread. +// If there are no active LockOSThread calls, this is a no-op. +// +// Before calling UnlockOSThread, the caller must ensure that the OS +// thread is suitable for running other goroutines. If the caller made +// any permanent changes to the state of the thread that would affect +// other goroutines, it should not call this function and thus leave +// the goroutine locked to the OS thread until the goroutine (and +// hence the thread) exits. func UnlockOSThread() { - getg().m.locked &^= _LockExternal + _g_ := getg() + if _g_.m.lockedExt == 0 { + return + } + _g_.m.lockedExt-- dounlockOSThread() } //go:nosplit func unlockOSThread() { _g_ := getg() - if _g_.m.locked < _LockInternal { + if _g_.m.lockedInt == 0 { systemstack(badunlockosthread) } - _g_.m.locked -= _LockInternal + _g_.m.lockedInt-- dounlockOSThread() } @@ -3005,10 +3327,7 @@ func badunlockosthread() { func gcount() int32 { n := int32(allglen) - sched.ngfree - int32(atomic.Load(&sched.ngsys)) - for _, _p_ := range &allp { - if _p_ == nil { - break - } + for _, _p_ := range allp { n -= _p_.gfreecnt } @@ -3021,7 +3340,7 @@ func gcount() int32 { } func mcount() int32 { - return sched.mcount + return int32(sched.mnext - sched.nmfreed) } var prof struct { @@ -3190,7 +3509,7 @@ func setcpuprofilerate(hz int32) { // Returns list of Ps with local work, they need to be scheduled by the caller. func procresize(nprocs int32) *p { old := gomaxprocs - if old < 0 || old > _MaxGomaxprocs || nprocs <= 0 || nprocs > _MaxGomaxprocs { + if old < 0 || nprocs <= 0 { throw("procresize: invalid arg") } if trace.enabled { @@ -3204,6 +3523,23 @@ func procresize(nprocs int32) *p { } sched.procresizetime = now + // Grow allp if necessary. + if nprocs > int32(len(allp)) { + // Synchronize with retake, which could be running + // concurrently since it doesn't run on a P. + lock(&allpLock) + if nprocs <= int32(cap(allp)) { + allp = allp[:nprocs] + } else { + nallp := make([]*p, nprocs) + // Copy everything up to allp's cap so we + // never lose old allocated Ps. 
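allp is now an ordinary slice guarded by allpLock rather than a fixed-size array, so growing it means re-slicing up to spare capacity when possible and reallocating otherwise, while shrinking is a simple trim. The same grow-or-reallocate pattern on a plain slice (illustrative names only):

package main

import "fmt"

// resize grows or trims ps to n entries, reusing spare capacity when
// possible so previously allocated elements are never lost.
func resize(ps []*int, n int) []*int {
	if n > len(ps) {
		if n <= cap(ps) {
			ps = ps[:n]
		} else {
			nps := make([]*int, n)
			copy(nps, ps[:cap(ps)])
			ps = nps
		}
	} else {
		ps = ps[:n]
	}
	return ps
}

func main() {
	v := 42
	ps := []*int{&v}
	ps = resize(ps, 4)
	fmt.Println(len(ps), cap(ps))
	ps = resize(ps, 2)
	fmt.Println(len(ps), cap(ps))
}

Copying up to cap(ps) rather than len(ps) mirrors the comment here: entries allocated before an earlier trim are preserved rather than lost.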
+ copy(nallp, allp[:cap(allp)]) + allp = nallp + } + unlock(&allpLock) + } + // initialize new P's for i := int32(0); i < nprocs; i++ { pp := allp[i] @@ -3213,6 +3549,7 @@ func procresize(nprocs int32) *p { pp.status = _Pgcstop pp.sudogcache = pp.sudogbuf[:0] pp.deferpool = pp.deferpoolbuf[:0] + pp.wbBuf.reset() atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(pp)) } if pp.mcache == nil { @@ -3230,13 +3567,11 @@ func procresize(nprocs int32) *p { // free unused P's for i := nprocs; i < old; i++ { p := allp[i] - if trace.enabled { - if p == getg().m.p.ptr() { - // moving to p[0], pretend that we were descheduled - // and then scheduled again to keep the trace sane. - traceGoSched() - traceProcStop(p) - } + if trace.enabled && p == getg().m.p.ptr() { + // moving to p[0], pretend that we were descheduled + // and then scheduled again to keep the trace sane. + traceGoSched() + traceProcStop(p) } // move all runnable goroutines to the global queue for p.runqhead != p.runqtail { @@ -3262,6 +3597,11 @@ func procresize(nprocs int32) *p { // world is stopped. p.gcBgMarkWorker.set(nil) } + // Flush p's write barrier buffer. + if gcphase != _GCoff { + wbBufFlush1(p) + p.gcw.dispose() + } for i := range p.sudogbuf { p.sudogbuf[i] = nil } @@ -3274,10 +3614,18 @@ func procresize(nprocs int32) *p { p.mcache = nil gfpurge(p) traceProcFree(p) + p.gcAssistTime = 0 p.status = _Pdead // can't free P itself because it can be referenced by an M in syscall } + // Trim allp. + if int32(len(allp)) != nprocs { + lock(&allpLock) + allp = allp[:nprocs] + unlock(&allpLock) + } + _g_ := getg() if _g_.m.p != 0 && _g_.m.p.ptr().id < nprocs { // continue to use the current P @@ -3349,7 +3697,7 @@ func acquirep1(_p_ *p) { throw("acquirep: already in go") } if _p_.m != 0 || _p_.status != _Pidle { - id := int32(0) + id := int64(0) if _p_.m != 0 { id = _p_.m.ptr().id } @@ -3394,6 +3742,7 @@ func incidlelocked(v int32) { // Check for deadlock situation. // The check is based on number of running M's, if 0 -> deadlock. +// sched.lock must be held. func checkdead() { // For -buildmode=c-shared or -buildmode=c-archive it's OK if // there are no running goroutines. The calling program is @@ -3410,13 +3759,12 @@ func checkdead() { return } - // -1 for sysmon - run := sched.mcount - sched.nmidle - sched.nmidlelocked - 1 + run := mcount() - sched.nmidle - sched.nmidlelocked - sched.nmsys if run > 0 { return } if run < 0 { - print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", sched.mcount, "\n") + print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", mcount(), " nmsys=", sched.nmsys, "\n") throw("checkdead: inconsistent counts") } @@ -3479,6 +3827,11 @@ var forcegcperiod int64 = 2 * 60 * 1e9 // //go:nowritebarrierrec func sysmon() { + lock(&sched.lock) + sched.nmsys++ + checkdead() + unlock(&sched.lock) + // If a heap span goes unused for 5 minutes after a garbage collection, // we hand it back to the operating system. 
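Since M's can now be freed, the scheduler no longer stores a live count; mcount() above is derived from two monotonic counters, sched.mnext (IDs ever handed out) and sched.nmfreed (Ms released), which checkdead then folds into its accounting. A miniature version of that bookkeeping, with invented names:

package main

import "fmt"

type registry struct {
	next  int64 // next ID to hand out; also the number ever created
	freed int64 // cumulative number destroyed
}

func (r *registry) create() int64 {
	id := r.next
	r.next++
	return id
}

func (r *registry) destroy() { r.freed++ }

// live is the analogue of mcount(): created minus freed.
func (r *registry) live() int64 { return r.next - r.freed }

func main() {
	var r registry
	a := r.create()
	b := r.create()
	r.destroy()
	fmt.Println("ids:", a, b, "live:", r.live()) // ids: 0 1 live: 1
}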
scavengelimit := int64(5 * 60 * 1e9) @@ -3518,15 +3871,11 @@ func sysmon() { } shouldRelax := true if osRelaxMinNS > 0 { - lock(&timers.lock) - if timers.sleeping { - now := nanotime() - next := timers.sleepUntil - if next-now < osRelaxMinNS { - shouldRelax = false - } + next := timeSleepUntil() + now := nanotime() + if next-now < osRelaxMinNS { + shouldRelax = false } - unlock(&timers.lock) } if shouldRelax { osRelax(true) @@ -3550,7 +3899,7 @@ func sysmon() { // poll network if not polled for more than 10ms lastpoll := int64(atomic.Load64(&sched.lastpoll)) now := nanotime() - if lastpoll != 0 && lastpoll+10*1000*1000 < now { + if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now { atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now)) gp := netpoll(false) // non-blocking - returns list of goroutines if gp != nil { @@ -3607,9 +3956,17 @@ const forcePreemptNS = 10 * 1000 * 1000 // 10ms func retake(now int64) uint32 { n := 0 - for i := int32(0); i < gomaxprocs; i++ { + // Prevent allp slice changes. This lock will be completely + // uncontended unless we're already stopping the world. + lock(&allpLock) + // We can't use a range loop over allp because we may + // temporarily drop the allpLock. Hence, we need to re-fetch + // allp each time around the loop. + for i := 0; i < len(allp); i++ { _p_ := allp[i] if _p_ == nil { + // This can happen if procresize has grown + // allp but not yet created new Ps. continue } pd := &_p_.sysmontick @@ -3628,6 +3985,8 @@ func retake(now int64) uint32 { if runqempty(_p_) && atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now { continue } + // Drop allpLock so we can take sched.lock. + unlock(&allpLock) // Need to decrement number of idle locked M's // (pretending that one more is running) before the CAS. // Otherwise the M from which we retake can exit the syscall, @@ -3643,6 +4002,7 @@ func retake(now int64) uint32 { handoffp(_p_) } incidlelocked(1) + lock(&allpLock) } else if s == _Prunning { // Preempt G if it's running for too long. t := int64(_p_.schedtick) @@ -3657,6 +4017,7 @@ func retake(now int64) uint32 { preemptone(_p_) } } + unlock(&allpLock) return uint32(n) } @@ -3667,9 +4028,8 @@ func retake(now int64) uint32 { // Returns true if preemption request was issued to at least one goroutine. func preemptall() bool { res := false - for i := int32(0); i < gomaxprocs; i++ { - _p_ := allp[i] - if _p_ == nil || _p_.status != _Prunning { + for _, _p_ := range allp { + if _p_.status != _Prunning { continue } if preemptone(_p_) { @@ -3727,23 +4087,19 @@ func schedtrace(detailed bool) { } lock(&sched.lock) - print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", sched.mcount, " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize) + print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", mcount(), " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize) if detailed { print(" gcwaiting=", sched.gcwaiting, " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait, "\n") } // We must be careful while reading data from P's, M's and G's. // Even if we hold schedlock, most data can be changed concurrently. // E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil. 
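retake above walks allp by index and re-reads len(allp) on every pass because it drops allpLock in the middle of the loop; loops that never drop the lock can simply range over allp, as the schedtrace code below now does. A toy version of the drop-and-recheck pattern (names invented):

package main

import (
	"fmt"
	"sync"
)

var (
	mu    sync.Mutex
	items []int
)

// sweep visits every element while tolerating concurrent growth of
// items: the length is re-read each iteration and the lock is dropped
// for the slow per-element work, the way retake drops allpLock before
// taking sched.lock.
func sweep(work func(int)) {
	mu.Lock()
	for i := 0; i < len(items); i++ {
		x := items[i]
		mu.Unlock()
		work(x)
		mu.Lock()
	}
	mu.Unlock()
}

func main() {
	items = []int{1, 2, 3}
	sweep(func(x int) { fmt.Println("visited", x) })
}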
- for i := int32(0); i < gomaxprocs; i++ { - _p_ := allp[i] - if _p_ == nil { - continue - } + for i, _p_ := range allp { mp := _p_.m.ptr() h := atomic.Load(&_p_.runqhead) t := atomic.Load(&_p_.runqtail) if detailed { - id := int32(-1) + id := int64(-1) if mp != nil { id = mp.id } @@ -3756,7 +4112,7 @@ func schedtrace(detailed bool) { print("[") } print(t - h) - if i == gomaxprocs-1 { + if i == len(allp)-1 { print("]\n") } } @@ -3770,7 +4126,7 @@ func schedtrace(detailed bool) { for mp := allm; mp != nil; mp = mp.alllink { _p_ := mp.p.ptr() gp := mp.curg - lockedg := mp.lockedg + lockedg := mp.lockedg.ptr() id1 := int32(-1) if _p_ != nil { id1 = _p_.id @@ -3790,12 +4146,12 @@ func schedtrace(detailed bool) { for gi := 0; gi < len(allgs); gi++ { gp := allgs[gi] mp := gp.m - lockedm := gp.lockedm - id1 := int32(-1) + lockedm := gp.lockedm.ptr() + id1 := int64(-1) if mp != nil { id1 = mp.id } - id2 := int32(-1) + id2 := int64(-1) if lockedm != nil { id2 = lockedm.id } @@ -4077,22 +4433,25 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool if stealRunNextG { // Try to steal from _p_.runnext. if next := _p_.runnext; next != 0 { - // Sleep to ensure that _p_ isn't about to run the g we - // are about to steal. - // The important use case here is when the g running on _p_ - // ready()s another g and then almost immediately blocks. - // Instead of stealing runnext in this window, back off - // to give _p_ a chance to schedule runnext. This will avoid - // thrashing gs between different Ps. - // A sync chan send/recv takes ~50ns as of time of writing, - // so 3us gives ~50x overshoot. - if GOOS != "windows" { - usleep(3) - } else { - // On windows system timer granularity is 1-15ms, - // which is way too much for this optimization. - // So just yield. - osyield() + if _p_.status == _Prunning { + // Sleep to ensure that _p_ isn't about to run the g + // we are about to steal. + // The important use case here is when the g running + // on _p_ ready()s another g and then almost + // immediately blocks. Instead of stealing runnext + // in this window, back off to give _p_ a chance to + // schedule runnext. This will avoid thrashing gs + // between different Ps. + // A sync chan send/recv takes ~50ns as of time of + // writing, so 3us gives ~50x overshoot. + if GOOS != "windows" { + usleep(3) + } else { + // On windows system timer granularity is + // 1-15ms, which is way too much for this + // optimization. So just yield. + osyield() + } } if !_p_.runnext.cas(next, 0) { continue diff --git a/libgo/go/runtime/proc_runtime_test.go b/libgo/go/runtime/proc_runtime_test.go index d56f9b1..a7bde2c 100644 --- a/libgo/go/runtime/proc_runtime_test.go +++ b/libgo/go/runtime/proc_runtime_test.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build ignore - // Proc unit tests. In runtime package so can use runtime guts. 
package runtime diff --git a/libgo/go/runtime/proc_test.go b/libgo/go/runtime/proc_test.go index 313a961..672e1fa 100644 --- a/libgo/go/runtime/proc_test.go +++ b/libgo/go/runtime/proc_test.go @@ -658,6 +658,116 @@ func BenchmarkClosureCall(b *testing.B) { _ = sum } +func benchmarkWakeupParallel(b *testing.B, spin func(time.Duration)) { + if runtime.GOMAXPROCS(0) == 1 { + b.Skip("skipping: GOMAXPROCS=1") + } + + wakeDelay := 5 * time.Microsecond + for _, delay := range []time.Duration{ + 0, + 1 * time.Microsecond, + 2 * time.Microsecond, + 5 * time.Microsecond, + 10 * time.Microsecond, + 20 * time.Microsecond, + 50 * time.Microsecond, + 100 * time.Microsecond, + } { + b.Run(delay.String(), func(b *testing.B) { + if b.N == 0 { + return + } + // Start two goroutines, which alternate between being + // sender and receiver in the following protocol: + // + // - The receiver spins for `delay` and then does a + // blocking receive on a channel. + // + // - The sender spins for `delay+wakeDelay` and then + // sends to the same channel. (The addition of + // `wakeDelay` improves the probability that the + // receiver will be blocking when the send occurs when + // the goroutines execute in parallel.) + // + // In each iteration of the benchmark, each goroutine + // acts once as sender and once as receiver, so each + // goroutine spins for delay twice. + // + // BenchmarkWakeupParallel is used to estimate how + // efficiently the scheduler parallelizes goroutines in + // the presence of blocking: + // + // - If both goroutines are executed on the same core, + // an increase in delay by N will increase the time per + // iteration by 4*N, because all 4 delays are + // serialized. + // + // - Otherwise, an increase in delay by N will increase + // the time per iteration by 2*N, and the time per + // iteration is 2 * (runtime overhead + chan + // send/receive pair + delay + wakeDelay). This allows + // the runtime overhead, including the time it takes + // for the unblocked goroutine to be scheduled, to be + // estimated. + ping, pong := make(chan struct{}), make(chan struct{}) + start := make(chan struct{}) + done := make(chan struct{}) + go func() { + <-start + for i := 0; i < b.N; i++ { + // sender + spin(delay + wakeDelay) + ping <- struct{}{} + // receiver + spin(delay) + <-pong + } + done <- struct{}{} + }() + go func() { + for i := 0; i < b.N; i++ { + // receiver + spin(delay) + <-ping + // sender + spin(delay + wakeDelay) + pong <- struct{}{} + } + done <- struct{}{} + }() + b.ResetTimer() + start <- struct{}{} + <-done + <-done + }) + } +} + +func BenchmarkWakeupParallelSpinning(b *testing.B) { + benchmarkWakeupParallel(b, func(d time.Duration) { + end := time.Now().Add(d) + for time.Now().Before(end) { + // do nothing + } + }) +} + +// sysNanosleep is defined by OS-specific files (such as runtime_linux_test.go) +// to sleep for the given duration. If nil, dependent tests are skipped. +// The implementation should invoke a blocking system call and not +// call time.Sleep, which would deschedule the goroutine. 
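sysNanosleep is deliberately left nil here and supplied by OS-specific test files; the point is to block the thread in the kernel rather than descheduling the goroutine through time.Sleep. A hedged sketch of what a Linux version could look like using the syscall package (an assumption for illustration, not the actual runtime_linux_test.go code):

// +build linux

package main

import (
	"syscall"
	"time"
)

// sysNanosleep parks the calling thread in the kernel for d via the
// nanosleep system call, so the goroutine is not descheduled through
// the Go timer machinery the way time.Sleep would be.
var sysNanosleep = func(d time.Duration) {
	ts := syscall.NsecToTimespec(d.Nanoseconds())
	// A production version would retry on EINTR; ignored in this sketch.
	syscall.Nanosleep(&ts, nil)
}

func main() {
	sysNanosleep(50 * time.Microsecond)
}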
+var sysNanosleep func(d time.Duration) + +func BenchmarkWakeupParallelSyscall(b *testing.B) { + if sysNanosleep == nil { + b.Skipf("skipping on %v; sysNanosleep not defined", runtime.GOOS) + } + benchmarkWakeupParallel(b, func(d time.Duration) { + sysNanosleep(d) + }) +} + type Matrix [][]float64 func BenchmarkMatmult(b *testing.B) { @@ -722,8 +832,47 @@ func matmult(done chan<- struct{}, A, B, C Matrix, i0, i1, j0, j1, k0, k1, thres } } -/* func TestStealOrder(t *testing.T) { runtime.RunStealOrderTest() } -*/ + +func TestLockOSThreadNesting(t *testing.T) { + go func() { + e, i := runtime.LockOSCounts() + if e != 0 || i != 0 { + t.Errorf("want locked counts 0, 0; got %d, %d", e, i) + return + } + runtime.LockOSThread() + runtime.LockOSThread() + runtime.UnlockOSThread() + e, i = runtime.LockOSCounts() + if e != 1 || i != 0 { + t.Errorf("want locked counts 1, 0; got %d, %d", e, i) + return + } + runtime.UnlockOSThread() + e, i = runtime.LockOSCounts() + if e != 0 || i != 0 { + t.Errorf("want locked counts 0, 0; got %d, %d", e, i) + return + } + }() +} + +func TestLockOSThreadExit(t *testing.T) { + testLockOSThreadExit(t, "testprog") +} + +func testLockOSThreadExit(t *testing.T, prog string) { + output := runTestProg(t, prog, "LockOSThreadMain", "GOMAXPROCS=1") + want := "OK\n" + if output != want { + t.Errorf("want %s, got %s\n", want, output) + } + + output = runTestProg(t, prog, "LockOSThreadAlt") + if output != want { + t.Errorf("want %s, got %s\n", want, output) + } +} diff --git a/libgo/go/runtime/runtime-lldb_test.go b/libgo/go/runtime/runtime-lldb_test.go index 98bc906..9a28705 100644 --- a/libgo/go/runtime/runtime-lldb_test.go +++ b/libgo/go/runtime/runtime-lldb_test.go @@ -5,11 +5,7 @@ package runtime_test import ( - "debug/elf" - "debug/macho" - "encoding/binary" "internal/testenv" - "io" "io/ioutil" "os" "os/exec" @@ -158,7 +154,7 @@ func TestLldbPython(t *testing.T) { t.Fatalf("failed to create file: %v", err) } - cmd := exec.Command(testenv.GoToolPath(t), "build", "-gcflags", "-N -l", "-o", "a.exe") + cmd := exec.Command(testenv.GoToolPath(t), "build", "-gcflags=all=-N -l", "-o", "a.exe") cmd.Dir = dir out, err := cmd.CombinedOutput() if err != nil { @@ -182,81 +178,3 @@ func TestLldbPython(t *testing.T) { t.Fatalf("Unexpected lldb output:\n%s", got) } } - -// Check that aranges are valid even when lldb isn't installed. 
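TestLockOSThreadNesting and TestLockOSThreadExit above exercise the new counted semantics from inside the runtime test suite; from ordinary user code the contract reads as follows (a stand-alone illustration, not part of this patch):

package main

import (
	"fmt"
	"runtime"
)

func main() {
	done := make(chan struct{})
	go func() {
		defer close(done)
		// LockOSThread calls now nest: the goroutine stays wired to its
		// thread until UnlockOSThread has been called the same number
		// of times.
		runtime.LockOSThread()
		runtime.LockOSThread()

		runtime.UnlockOSThread()
		fmt.Println("still locked: one LockOSThread call is outstanding")

		runtime.UnlockOSThread()
		fmt.Println("unlocked: the counts are balanced")
		// Had this goroutine returned while still locked, the runtime
		// would terminate the underlying thread rather than reuse it.
	}()
	<-done
}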
-func TestDwarfAranges(t *testing.T) { - testenv.MustHaveGoBuild(t) - dir, err := ioutil.TempDir("", "go-build") - if err != nil { - t.Fatalf("failed to create temp directory: %v", err) - } - defer os.RemoveAll(dir) - - src := filepath.Join(dir, "main.go") - err = ioutil.WriteFile(src, []byte(lldbHelloSource), 0644) - if err != nil { - t.Fatalf("failed to create file: %v", err) - } - - cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe") - cmd.Dir = dir - out, err := cmd.CombinedOutput() - if err != nil { - t.Fatalf("building source %v\n%s", err, out) - } - - filename := filepath.Join(dir, "a.exe") - if f, err := elf.Open(filename); err == nil { - sect := f.Section(".debug_aranges") - if sect == nil { - t.Fatal("Missing aranges section") - } - verifyAranges(t, f.ByteOrder, sect.Open()) - } else if f, err := macho.Open(filename); err == nil { - sect := f.Section("__debug_aranges") - if sect == nil { - t.Fatal("Missing aranges section") - } - verifyAranges(t, f.ByteOrder, sect.Open()) - } else { - t.Skip("Not an elf or macho binary.") - } -} - -func verifyAranges(t *testing.T, byteorder binary.ByteOrder, data io.ReadSeeker) { - var header struct { - UnitLength uint32 // does not include the UnitLength field - Version uint16 - Offset uint32 - AddressSize uint8 - SegmentSize uint8 - } - for { - offset, err := data.Seek(0, io.SeekCurrent) - if err != nil { - t.Fatalf("Seek error: %v", err) - } - if err = binary.Read(data, byteorder, &header); err == io.EOF { - return - } else if err != nil { - t.Fatalf("Error reading arange header: %v", err) - } - tupleSize := int64(header.SegmentSize) + 2*int64(header.AddressSize) - lastTupleOffset := offset + int64(header.UnitLength) + 4 - tupleSize - if lastTupleOffset%tupleSize != 0 { - t.Fatalf("Invalid arange length %d, (addr %d, seg %d)", header.UnitLength, header.AddressSize, header.SegmentSize) - } - if _, err = data.Seek(lastTupleOffset, io.SeekStart); err != nil { - t.Fatalf("Seek error: %v", err) - } - buf := make([]byte, tupleSize) - if n, err := data.Read(buf); err != nil || int64(n) < tupleSize { - t.Fatalf("Read error: %v", err) - } - for _, val := range buf { - if val != 0 { - t.Fatalf("Invalid terminator") - } - } - } -} diff --git a/libgo/go/runtime/runtime.go b/libgo/go/runtime/runtime.go index 58710de..d19d6afe 100644 --- a/libgo/go/runtime/runtime.go +++ b/libgo/go/runtime/runtime.go @@ -61,6 +61,12 @@ func syscall_Getpagesize() int { return int(physPageSize) } //go:linkname os_runtime_args os.runtime_args func os_runtime_args() []string { return append([]string{}, argslice...) } +//go:linkname syscall_Exit syscall.Exit +//go:nosplit +func syscall_Exit(code int) { + exit(int32(code)) +} + // Temporary, for the gccgo runtime code written in C. 
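syscall_Exit is exported to the syscall package with go:linkname, the same mechanism used throughout this file: a function defined in runtime is given a second symbol name that another package's bodyless declaration resolves to. A minimal illustration of the directive; the package path and names below are hypothetical:

package provider

import (
	_ "unsafe" // required for go:linkname
)

// exitCode is unexported here, but the directive below also publishes
// it under the symbol example.com/consumer.ExitCode, so a declaration
// without a body in that (hypothetical) package binds to this function.
//go:linkname exitCode example.com/consumer.ExitCode
func exitCode() int {
	return 3
}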
//go:linkname get_envs runtime_get_envs func get_envs() []string { return envs } diff --git a/libgo/go/runtime/runtime1.go b/libgo/go/runtime/runtime1.go index 627adf7..b617f85 100644 --- a/libgo/go/runtime/runtime1.go +++ b/libgo/go/runtime/runtime1.go @@ -111,10 +111,6 @@ var test_z64, test_x64 uint64 func testAtomic64() { test_z64 = 42 test_x64 = 0 - prefetcht0(uintptr(unsafe.Pointer(&test_z64))) - prefetcht1(uintptr(unsafe.Pointer(&test_z64))) - prefetcht2(uintptr(unsafe.Pointer(&test_z64))) - prefetchnta(uintptr(unsafe.Pointer(&test_z64))) if atomic.Cas64(&test_z64, test_x64, 1) { throw("cas64 failed") } @@ -413,13 +409,6 @@ func parsedebugvars() { setTraceback(gogetenv("GOTRACEBACK")) traceback_env = traceback_cache - - // For cgocheck > 1, we turn on the write barrier at all times - // and check all pointer writes. - if debug.cgocheck > 1 { - writeBarrier.cgo = true - writeBarrier.enabled = true - } } //go:linkname setTraceback runtime_debug.SetTraceback diff --git a/libgo/go/runtime/runtime2.go b/libgo/go/runtime/runtime2.go index 045e76f..543086d 100644 --- a/libgo/go/runtime/runtime2.go +++ b/libgo/go/runtime/runtime2.go @@ -173,9 +173,13 @@ func efaceOf(ep *interface{}) *eface { // a word that is completely ignored by the GC than to have one for which // only a few updates are ignored. // -// Gs, Ms, and Ps are always reachable via true pointers in the -// allgs, allm, and allp lists or (during allocation before they reach those lists) +// Gs and Ps are always reachable via true pointers in the +// allgs and allp lists or (during allocation before they reach those lists) // from stack variables. +// +// Ms are always reachable via true pointers either from allm or +// freem. Unlike Gs and Ps we do free Ms, so it's important that +// nothing ever hold an muintptr across a safe point. // A guintptr holds a goroutine pointer, but typed as a uintptr // to bypass write barriers. It is used in the Gobuf goroutine state @@ -225,6 +229,15 @@ func (pp puintptr) ptr() *p { return (*p)(unsafe.Pointer(pp)) } //go:nosplit func (pp *puintptr) set(p *p) { *pp = puintptr(unsafe.Pointer(p)) } +// muintptr is a *m that is not tracked by the garbage collector. +// +// Because we do free Ms, there are some additional constrains on +// muintptrs: +// +// 1. Never hold an muintptr locally across a safe point. +// +// 2. Any muintptr in the heap must be owned by the M itself so it can +// ensure it is not in use when the last true *m is released. type muintptr uintptr //go:nosplit @@ -256,11 +269,14 @@ type sudog struct { // channel this sudog is blocking on. shrinkstack depends on // this for sudogs involved in channel ops. - g *g - selectdone *uint32 // CAS to 1 to win select race (may point to stack) - next *sudog - prev *sudog - elem unsafe.Pointer // data element (may point to stack) + g *g + + // isSelect indicates g is participating in a select, so + // g.selectDone must be CAS'd to win the wake-up race. + isSelect bool + next *sudog + prev *sudog + elem unsafe.Pointer // data element (may point to stack) // The following fields are never accessed concurrently. // For channels, waitlink is only accessed by g. 
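With selectdone gone, a sudog only carries an isSelect flag, and wakers race to CAS the goroutine's selectDone word from 0 to 1 to decide who gets to complete the select. The shape of that race in ordinary code (winRace and the surrounding setup are invented for the example):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// winRace reports whether this caller won the right to complete the
// wake-up, the way select wakers CAS g.selectDone from 0 to 1.
func winRace(done *uint32) bool {
	return atomic.CompareAndSwapUint32(done, 0, 1)
}

func main() {
	var done uint32
	var wg sync.WaitGroup
	winners := make(chan int, 4)
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			if winRace(&done) {
				winners <- id // exactly one goroutine gets here
			}
		}(i)
	}
	wg.Wait()
	close(winners)
	for id := range winners {
		fmt.Println("winner:", id)
	}
}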
@@ -351,7 +367,7 @@ type g struct { sysexitticks int64 // cputicks when syscall has returned (for tracing) traceseq uint64 // trace event sequencer tracelastp puintptr // last P emitted an event for this goroutine - lockedm *m + lockedm muintptr sig uint32 writebuf []byte sigcode0 uintptr @@ -362,8 +378,9 @@ type g struct { // Not for gccgo: racectx uintptr waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr); in lock order // Not for gccgo: cgoCtxt []uintptr // cgo traceback context - labels unsafe.Pointer // profiler labels - timer *timer // cached timer for time.Sleep + labels unsafe.Pointer // profiler labels + timer *timer // cached timer for time.Sleep + selectDone uint32 // are we participating in a select and did someone win the race? // Per-G GC state @@ -381,13 +398,26 @@ type g struct { exception unsafe.Pointer // current exception being thrown isforeign bool // whether current exception is not from Go - // Fields that hold stack and context information if status is Gsyscall + // When using split-stacks, these fields holds the results of + // __splitstack_find while executing a syscall. These are used + // by the garbage collector to scan the goroutine's stack. + // + // When not using split-stacks, g0 stacks are allocated by the + // libc and other goroutine stacks are allocated by malg. + // gcstack: unused (sometimes cleared) + // gcstacksize: g0: 0; others: size of stack + // gcnextsegment: unused + // gcnextsp: current SP while executing a syscall + // gcinitialsp: g0: top of stack; others: start of stack memory gcstack uintptr gcstacksize uintptr gcnextsegment uintptr gcnextsp uintptr gcinitialsp unsafe.Pointer - gcregs g_ucontext_t + + // gcregs holds the register values while executing a syscall. + // This is set by getcontext and scanned by the garbage collector. + gcregs g_ucontext_t entry func(unsafe.Pointer) // goroutine function to run entryfn uintptr // function address passed to __go_go @@ -411,14 +441,15 @@ type m struct { // Fields not known to debuggers. procid uint64 // for debuggers, but offset not hard-coded gsignal *g // signal-handling g + // Not for gccgo: goSigStack gsignalStack // Go-allocated signal handling stack sigmask sigset // storage for saved signal mask - // Not for gccgo: tls [6]uintptr // thread-local storage (for x86 extern register) + // Not for gccgo: tls [6]uintptr // thread-local storage (for x86 extern register) mstartfn func() curg *g // current running goroutine caughtsig guintptr // goroutine running during fatal signal p puintptr // attached p for executing go code (nil if not executing go code) nextp puintptr - id int32 + id int64 mallocing int32 throwing int32 preemptoff string // if != "", keep curg running on this m @@ -432,8 +463,11 @@ type m struct { inwb bool // m is executing a write barrier newSigstack bool // minit on C thread called sigaltstack printlock int8 - incgo bool // m is executing a cgo call - fastrand uint32 + incgo bool // m is executing a cgo call + freeWait uint32 // if == 0, safe to free g0 and delete m (atomic) + fastrand [2]uint32 + needextram bool + traceback uint8 ncgocall uint64 // number of cgo calls in total ncgo int32 // number of cgo calls currently in progress // Not for gccgo: cgoCallersUse uint32 // if non-zero, cgoCallers in use temporarily @@ -442,15 +476,14 @@ type m struct { alllink *m // on allm schedlink muintptr mcache *mcache - lockedg *g + lockedg guintptr createstack [32]location // stack that created this thread. 
- // Not for gccgo: freglo [16]uint32 // d[i] lsb and f[i] - // Not for gccgo: freghi [16]uint32 // d[i] msb and f[i+16] - // Not for gccgo: fflag uint32 // floating point compare flags - locked uint32 // tracking for lockosthread - nextwaitm uintptr // next m waiting for lock - needextram bool - traceback uint8 + // Not for gccgo: freglo [16]uint32 // d[i] lsb and f[i] + // Not for gccgo: freghi [16]uint32 // d[i] msb and f[i+16] + // Not for gccgo: fflag uint32 // floating point compare flags + lockedExt uint32 // tracking for external LockOSThread + lockedInt uint32 // tracking for internal lockOSThread + nextwaitm muintptr // next m waiting for lock waitunlockf unsafe.Pointer // todo go func(*g, unsafe.pointer) bool waitlock unsafe.Pointer waittraceev byte @@ -458,6 +491,7 @@ type m struct { startingtrace bool syscalltick uint32 // Not for gccgo: thread uintptr // thread handle + freelink *m // on sched.freem // these are here because they are too large to be on the stack // of low-level NOSPLIT functions. @@ -475,6 +509,7 @@ type m struct { gsignalstacksize uintptr dropextram bool // drop after call is done + exiting bool // thread is exiting gcing int32 } @@ -490,7 +525,7 @@ type p struct { sysmontick sysmontick // last tick observed by sysmon m muintptr // back-link to associated m (nil if idle) mcache *mcache - // Not for gccgo: racectx uintptr + racectx uintptr // gccgo has only one size of defer. deferpool []*_defer @@ -535,26 +570,30 @@ type p struct { palloc persistentAlloc // per-P to avoid mutex // Per-P GC state - gcAssistTime int64 // Nanoseconds in assistAlloc - gcBgMarkWorker guintptr - gcMarkWorkerMode gcMarkWorkerMode + gcAssistTime int64 // Nanoseconds in assistAlloc + gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker + gcBgMarkWorker guintptr + gcMarkWorkerMode gcMarkWorkerMode + + // gcMarkWorkerStartTime is the nanotime() at which this mark + // worker started. + gcMarkWorkerStartTime int64 // gcw is this P's GC work buffer cache. The work buffer is // filled by write barriers, drained by mutator assists, and // disposed on certain GC state transitions. gcw gcWork + // wbBuf is this P's GC write barrier buffer. + // + // TODO: Consider caching this in the running G. + wbBuf wbBuf + runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point pad [sys.CacheLineSize]byte } -const ( - // The max value of GOMAXPROCS. - // There are no fundamental restrictions on the value. - _MaxGomaxprocs = 1 << 10 -) - type schedt struct { // accessed atomically. keep at top to ensure alignment on 32-bit systems. goidgen uint64 @@ -562,11 +601,16 @@ type schedt struct { lock mutex + // When increasing nmidle, nmidlelocked, nmsys, or nmfreed, be + // sure to call checkdead(). + midle muintptr // idle m's waiting for work nmidle int32 // number of idle m's waiting for work nmidlelocked int32 // number of locked m's waiting for work - mcount int32 // number of m's that have been created + mnext int64 // number of m's that have been created and next M ID maxmcount int32 // maximum number of m's allowed (or die) + nmsys int32 // number of system m's not counted for deadlock + nmfreed int64 // cumulative number of freed m's ngsys uint32 // number of system goroutines; updated atomically @@ -592,6 +636,10 @@ type schedt struct { deferlock mutex deferpool *_defer + // freem is the list of m's waiting to be freed when their + // m.exited is set. Linked through m.freelink. 
+ freem *m + gcwaiting uint32 // gc is waiting to run stopwait int32 stopnote note @@ -610,18 +658,7 @@ type schedt struct { totaltime int64 // ∫gomaxprocs dt up to procresizetime } -// The m.locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread. -// The low bit (LockExternal) is a boolean reporting whether any LockOSThread call is active. -// External locks are not recursive; a second lock is silently ignored. -// The upper bits of m.locked record the nesting depth of calls to lockOSThread -// (counting up by LockInternal), popped by unlockOSThread (counting down by LockInternal). -// Internal locks can be recursive. For instance, a lock for cgo can occur while the main -// goroutine is holding the lock during the initialization phase. -const ( - _LockExternal = 1 - _LockInternal = 2 -) - +// Values for the flags field of a sigTabT. const ( _SigNotify = 1 << iota // let signal.Notify have signal, even if from kernel _SigKill // if signal.Notify doesn't take it, exit quietly @@ -630,7 +667,8 @@ const ( _SigDefault // if the signal isn't explicitly requested, don't monitor it _SigGoExit // cause all runtime procs to exit (only used on Plan 9). _SigSetStack // add SA_ONSTACK to libc handler - _SigUnblock // unblocked in minit + _SigUnblock // always unblock; see blockableSig + _SigIgn // _SIG_DFL action is to ignore the signal ) // Lock-free stack node. @@ -671,8 +709,8 @@ func extendRandom(r []byte, n int) { } } -// deferred subroutine calls -// This is the gccgo version. +// A _defer holds an entry on the list of deferred calls. +// If you add a field here, add code to clear it in freedefer. type _defer struct { // The next entry in the stack. link *_defer @@ -743,7 +781,8 @@ const _TracebackMaxFrames = 100 var ( allglen uintptr allm *m - allp [_MaxGomaxprocs + 1]*p + allp []*p // len(allp) == gomaxprocs; may change at safe points, otherwise immutable + allpLock mutex // Protects P-less reads of allp and all writes gomaxprocs int32 ncpu int32 forcegc forcegcstate diff --git a/libgo/go/runtime/runtime_mmap_test.go b/libgo/go/runtime/runtime_mmap_test.go index 0141e81..c004041 100644 --- a/libgo/go/runtime/runtime_mmap_test.go +++ b/libgo/go/runtime/runtime_mmap_test.go @@ -14,17 +14,10 @@ import ( // what the code in mem_bsd.go, mem_darwin.go, and mem_linux.go expects. // See the uses of ENOMEM in sysMap in those files. func TestMmapErrorSign(t *testing.T) { - p := runtime.Mmap(nil, ^uintptr(0)&^(runtime.GetPhysPageSize()-1), 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0) + p, err := runtime.Mmap(nil, ^uintptr(0)&^(runtime.GetPhysPageSize()-1), 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0) - // The runtime.mmap function is nosplit, but t.Errorf is not. - // Reset the pointer so that we don't get an "invalid stack - // pointer" error from t.Errorf if we call it. - v := uintptr(p) - p = nil - - err := runtime.Errno() - if v != ^uintptr(0) || err != runtime.ENOMEM { - t.Errorf("mmap = %v, %v, want %v", v, err, runtime.ENOMEM) + if p != nil || err != runtime.ENOMEM { + t.Errorf("mmap = %v, %v, want nil, %v", p, err, runtime.ENOMEM) } } @@ -34,20 +27,20 @@ func TestPhysPageSize(t *testing.T) { ps := runtime.GetPhysPageSize() // Get a region of memory to play with. This should be page-aligned. 
- b := uintptr(runtime.Mmap(nil, 2*ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0)) - if b == ^uintptr(0) { - t.Fatalf("Mmap: %v %v", b, runtime.Errno()) + b, err := runtime.Mmap(nil, 2*ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0) + if err != 0 { + t.Fatalf("Mmap: %v", err) } // Mmap should fail at a half page into the buffer. - err := uintptr(runtime.Mmap(unsafe.Pointer(uintptr(b)+ps/2), ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE|runtime.MAP_FIXED, -1, 0)) - if err != ^uintptr(0) { + _, err = runtime.Mmap(unsafe.Pointer(uintptr(b)+ps/2), ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE|runtime.MAP_FIXED, -1, 0) + if err == 0 { t.Errorf("Mmap should have failed with half-page alignment %d, but succeeded: %v", ps/2, err) } // Mmap should succeed at a full page into the buffer. - err = uintptr(runtime.Mmap(unsafe.Pointer(uintptr(b)+ps), ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE|runtime.MAP_FIXED, -1, 0)) - if err == ^uintptr(0) { - t.Errorf("Mmap at full-page alignment %d failed: %v %v", ps, err, runtime.Errno()) + _, err = runtime.Mmap(unsafe.Pointer(uintptr(b)+ps), ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE|runtime.MAP_FIXED, -1, 0) + if err != 0 { + t.Errorf("Mmap at full-page alignment %d failed: %v", ps, err) } } diff --git a/libgo/go/runtime/runtime_test.go b/libgo/go/runtime/runtime_test.go index b8f6ac2..0231043 100644 --- a/libgo/go/runtime/runtime_test.go +++ b/libgo/go/runtime/runtime_test.go @@ -5,6 +5,7 @@ package runtime_test import ( + "flag" "io" . "runtime" "runtime/debug" @@ -13,6 +14,8 @@ import ( "unsafe" ) +var flagQuick = flag.Bool("quick", false, "skip slow tests, for second run in all.bash") + func init() { // We're testing the runtime, so make tracebacks show things // in the runtime. This only raises the level, so it won't @@ -196,9 +199,9 @@ func eqstring_generic(s1, s2 string) bool { } func TestEqString(t *testing.T) { - // This isn't really an exhaustive test of eqstring, it's + // This isn't really an exhaustive test of == on strings, it's // just a convenient way of documenting (via eqstring_generic) - // what eqstring does. + // what == does. s := []string{ "", "a", @@ -213,7 +216,7 @@ func TestEqString(t *testing.T) { x := s1 == s2 y := eqstring_generic(s1, s2) if x != y { - t.Errorf(`eqstring("%s","%s") = %t, want %t`, s1, s2, x, y) + t.Errorf(`("%s" == "%s") = %t, want %t`, s1, s2, x, y) } } } diff --git a/libgo/go/runtime/rwmutex_test.go b/libgo/go/runtime/rwmutex_test.go index a69eca1..872b3b0 100644 --- a/libgo/go/runtime/rwmutex_test.go +++ b/libgo/go/runtime/rwmutex_test.go @@ -12,6 +12,7 @@ package runtime_test import ( "fmt" . "runtime" + "runtime/debug" "sync/atomic" "testing" ) @@ -47,6 +48,10 @@ func doTestParallelReaders(numReaders int) { func TestParallelRWMutexReaders(t *testing.T) { defer GOMAXPROCS(GOMAXPROCS(-1)) + // If runtime triggers a forced GC during this test then it will deadlock, + // since the goroutines can't be stopped/preempted. + // Disable GC for this test (see issue #10958). 
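The rwmutex test turns the collector off for its duration with the save-and-restore idiom defer debug.SetGCPercent(debug.SetGCPercent(-1)): the inner call runs immediately, disables GC and returns the previous percentage, and the deferred outer call restores it on the way out. The same idiom in isolation:

package main

import (
	"fmt"
	"runtime/debug"
)

func withGCDisabled(f func()) {
	// SetGCPercent returns the previous setting, so deferring a second
	// call restores it when f is done.
	defer debug.SetGCPercent(debug.SetGCPercent(-1))
	f()
}

func main() {
	withGCDisabled(func() {
		fmt.Println("running with GC disabled")
	})
}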
+ defer debug.SetGCPercent(debug.SetGCPercent(-1)) doTestParallelReaders(1) doTestParallelReaders(3) doTestParallelReaders(4) diff --git a/libgo/go/runtime/select.go b/libgo/go/runtime/select.go index 9f8ac49..096af52 100644 --- a/libgo/go/runtime/select.go +++ b/libgo/go/runtime/select.go @@ -88,7 +88,7 @@ func newselect(sel *hselect, selsize int64, size int32) { } func selectsend(sel *hselect, c *hchan, elem unsafe.Pointer) { - pc := getcallerpc(unsafe.Pointer(&sel)) + pc := getcallerpc() i := sel.ncase if i >= sel.tcase { throw("selectsend: too many cases") @@ -109,7 +109,7 @@ func selectsend(sel *hselect, c *hchan, elem unsafe.Pointer) { } func selectrecv(sel *hselect, c *hchan, elem unsafe.Pointer, received *bool) { - pc := getcallerpc(unsafe.Pointer(&sel)) + pc := getcallerpc() i := sel.ncase if i >= sel.tcase { throw("selectrecv: too many cases") @@ -131,7 +131,7 @@ func selectrecv(sel *hselect, c *hchan, elem unsafe.Pointer, received *bool) { } func selectdefault(sel *hselect) { - pc := getcallerpc(unsafe.Pointer(&sel)) + pc := getcallerpc() i := sel.ncase if i >= sel.tcase { throw("selectdefault: too many cases") @@ -301,7 +301,6 @@ func selectgo(sel *hselect) int { var ( gp *g - done uint32 sg *sudog c *hchan k *scase @@ -368,7 +367,6 @@ loop: // pass 2 - enqueue on all chans gp = getg() - done = 0 if gp.waiting != nil { throw("gp.waiting != nil") } @@ -382,8 +380,7 @@ loop: c = cas.c sg := acquireSudog() sg.g = gp - // Note: selectdone is adjusted for stack copies in stack1.go:adjustsudogs - sg.selectdone = (*uint32)(noescape(unsafe.Pointer(&done))) + sg.isSelect = true // No stack splits between assigning elem and enqueuing // sg on gp.waiting where copystack can find it. sg.elem = cas.elem @@ -409,62 +406,9 @@ loop: gp.param = nil gopark(selparkcommit, nil, "select", traceEvGoBlockSelect, 1) - // While we were asleep, some goroutine came along and completed - // one of the cases in the select and woke us up (called ready). - // As part of that process, the goroutine did a cas on done above - // (aka *sg.selectdone for all queued sg) to win the right to - // complete the select. Now done = 1. - // - // If we copy (grow) our own stack, we will update the - // selectdone pointers inside the gp.waiting sudog list to point - // at the new stack. Another goroutine attempting to - // complete one of our (still linked in) select cases might - // see the new selectdone pointer (pointing at the new stack) - // before the new stack has real data; if the new stack has done = 0 - // (before the old values are copied over), the goroutine might - // do a cas via sg.selectdone and incorrectly believe that it has - // won the right to complete the select, executing a second - // communication and attempting to wake us (call ready) again. - // - // Then things break. - // - // The best break is that the goroutine doing ready sees the - // _Gcopystack status and throws, as in #17007. - // A worse break would be for us to continue on, start running real code, - // block in a semaphore acquisition (sema.go), and have the other - // goroutine wake us up without having really acquired the semaphore. - // That would result in the goroutine spuriously running and then - // queue up another spurious wakeup when the semaphore really is ready. - // In general the situation can cascade until something notices the - // problem and causes a crash. 
- // - // A stack shrink does not have this problem, because it locks - // all the channels that are involved first, blocking out the - // possibility of a cas on selectdone. - // - // A stack growth before gopark above does not have this - // problem, because we hold those channel locks (released by - // selparkcommit). - // - // A stack growth after sellock below does not have this - // problem, because again we hold those channel locks. - // - // The only problem is a stack growth during sellock. - // To keep that from happening, run sellock on the system stack. - // - // It might be that we could avoid this if copystack copied the - // stack before calling adjustsudogs. In that case, - // syncadjustsudogs would need to recopy the tiny part that - // it copies today, resulting in a little bit of extra copying. - // - // An even better fix, not for the week before a release candidate, - // would be to put space in every sudog and make selectdone - // point at (say) the space in the first sudog. - - systemstack(func() { - sellock(scases, lockorder) - }) + sellock(scases, lockorder) + gp.selectDone = 0 sg = (*sudog)(gp.param) gp.param = nil @@ -477,7 +421,7 @@ loop: sglist = gp.waiting // Clear all elem before unlinking from gp.waiting. for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink { - sg1.selectdone = nil + sg1.isSelect = false sg1.elem = nil sg1.c = nil } @@ -528,10 +472,8 @@ loop: print("wait-return: sel=", sel, " c=", c, " cas=", cas, " kind=", cas.kind, "\n") } - if cas.kind == caseRecv { - if cas.receivedp != nil { - *cas.receivedp = true - } + if cas.kind == caseRecv && cas.receivedp != nil { + *cas.receivedp = true } if raceenabled { diff --git a/libgo/go/runtime/sema.go b/libgo/go/runtime/sema.go index d04e6f5..6e2beec 100644 --- a/libgo/go/runtime/sema.go +++ b/libgo/go/runtime/sema.go @@ -275,7 +275,10 @@ func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { // on the ticket: s.ticket <= both s.prev.ticket and s.next.ticket. // https://en.wikipedia.org/wiki/Treap // http://faculty.washington.edu/aragon/pubs/rst89.pdf - s.ticket = fastrand() + // + // s.ticket compared with zero in couple of places, therefore set lowest bit. + // It will not affect treap's quality noticeably. 
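Forcing the low bit keeps the randomly chosen treap ticket nonzero, since s.ticket is compared against zero elsewhere in the semaphore code, at the cost of one bit of randomness. The trick on its own, with math/rand standing in for the runtime's fastrand:

package main

import (
	"fmt"
	"math/rand"
)

func main() {
	// Any value OR'ed with 1 is odd and therefore never zero, so the
	// priority still looks random but cannot collide with the zero
	// sentinel; losing one bit does not noticeably hurt the treap.
	for i := 0; i < 3; i++ {
		ticket := rand.Uint32() | 1
		fmt.Println(ticket, ticket != 0)
	}
}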
+ s.ticket = fastrand() | 1 s.parent = last *pt = s diff --git a/libgo/go/runtime/signal_gccgo.go b/libgo/go/runtime/signal_gccgo.go index 056be36..6fe7ba1 100644 --- a/libgo/go/runtime/signal_gccgo.go +++ b/libgo/go/runtime/signal_gccgo.go @@ -46,11 +46,6 @@ func kill(pid _pid_t, sig uint32) int32 //extern setitimer func setitimer(which int32, new *_itimerval, old *_itimerval) int32 -type sigTabT struct { - flags int32 - name string -} - type sigctxt struct { info *_siginfo_t ctxt unsafe.Pointer diff --git a/libgo/go/runtime/signal_sighandler.go b/libgo/go/runtime/signal_sighandler.go index 378c68e..c042162 100644 --- a/libgo/go/runtime/signal_sighandler.go +++ b/libgo/go/runtime/signal_sighandler.go @@ -92,9 +92,9 @@ func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) { } print("PC=", hex(sigpc), " m=", _g_.m.id, " sigcode=", c.sigcode(), "\n") - if _g_.m.lockedg != nil && _g_.m.ncgo > 0 && gp == _g_.m.g0 { + if _g_.m.lockedg != 0 && _g_.m.ncgo > 0 && gp == _g_.m.g0 { print("signal arrived during cgo execution\n") - gp = _g_.m.lockedg + gp = _g_.m.lockedg.ptr() } print("\n") @@ -111,7 +111,7 @@ func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) { if docrash { crashing++ - if crashing < sched.mcount-int32(extraMCount) { + if crashing < mcount()-int32(extraMCount) { // There are other m's that need to dump their stacks. // Relay SIGQUIT to the next m by sending it to the current process. // All m's that have already received SIGQUIT have signal masks blocking diff --git a/libgo/go/runtime/signal_unix.go b/libgo/go/runtime/signal_unix.go index 3237e18..8517148 100644 --- a/libgo/go/runtime/signal_unix.go +++ b/libgo/go/runtime/signal_unix.go @@ -8,7 +8,6 @@ package runtime import ( "runtime/internal/atomic" - "runtime/internal/sys" "unsafe" ) @@ -16,6 +15,16 @@ import ( //go:linkname initsig runtime.initsig //go:linkname sigtrampgo runtime.sigtrampgo +// sigTabT is the type of an entry in the global sigtable array. +// sigtable is inherently system dependent, and appears in OS-specific files, +// but sigTabT is the same for all Unixy systems. +// The sigtable array is indexed by a system signal number to get the flags +// and printable name of each signal. +type sigTabT struct { + flags int32 + name string +} + //go:linkname os_sigpipe os.sigpipe func os_sigpipe() { systemstack(sigpipe) @@ -275,6 +284,12 @@ func sigpipe() { // sigtrampgo is called from the signal handler function, sigtramp, // written in assembly code. // This is called by the signal handler, and the world may be stopped. +// +// It must be nosplit because getg() is still the G that was running +// (if any) when the signal was delivered, but it's (usually) called +// on the gsignal stack. Until this switches the G to gsignal, the +// stack bounds check won't work. +// //go:nosplit //go:nowritebarrierrec func sigtrampgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) { @@ -355,8 +370,9 @@ func sigpanic() { //go:nosplit //go:nowritebarrierrec func dieFromSignal(sig uint32) { - setsig(sig, _SIG_DFL) unblocksig(sig) + // Mark the signal as unhandled to ensure it is forwarded. + atomic.Store(&handlingSig[sig], 0) raise(sig) // That should have killed us. On some systems, though, raise @@ -368,6 +384,22 @@ func dieFromSignal(sig uint32) { osyield() osyield() + // If that didn't work, try _SIG_DFL. 
+ setsig(sig, _SIG_DFL) + raise(sig) + + osyield() + osyield() + osyield() + + // On Darwin we may still fail to die, because raise sends the + // signal to the whole process rather than just the current thread, + // and osyield just sleeps briefly rather than letting all other + // threads run. See issue 20315. Sleep longer. + if GOOS == "darwin" { + usleep(100) + } + // If we are still somehow running, just exit with the wrong status. exit(2) } @@ -434,7 +466,7 @@ func crash() { // this means the OS X core file will be >128 GB and even on a zippy // workstation can take OS X well over an hour to write (uninterruptible). // Save users from making that mistake. - if sys.PtrSize == 8 { + if GOARCH == "amd64" { return } } @@ -463,7 +495,7 @@ func ensureSigM() { var sigBlocked sigset sigfillset(&sigBlocked) for i := range sigtable { - if sigtable[i].flags&_SigUnblock != 0 { + if !blockableSig(uint32(i)) { sigdelset(&sigBlocked, i) } } @@ -475,7 +507,7 @@ func ensureSigM() { sigdelset(&sigBlocked, int(sig)) } case sig := <-disableSigChan: - if sig > 0 { + if sig > 0 && blockableSig(sig) { sigaddset(&sigBlocked, int(sig)) } } @@ -536,17 +568,23 @@ func sigfwdgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) bool { return false } fwdFn := atomic.Loaduintptr(&fwdSig[sig]) + flags := sigtable[sig].flags - if !signalsOK { - // The only way we can get here is if we are in a - // library or archive, we installed a signal handler - // at program startup, but the Go runtime has not yet - // been initialized. + // If we aren't handling the signal, forward it. + if atomic.Load(&handlingSig[sig]) == 0 || !signalsOK { + // If the signal is ignored, doing nothing is the same as forwarding. + if fwdFn == _SIG_IGN || (fwdFn == _SIG_DFL && flags&_SigIgn != 0) { + return true + } + // We are not handling the signal and there is no other handler to forward to. + // Crash with the default behavior. if fwdFn == _SIG_DFL { + setsig(sig, _SIG_DFL) dieFromSignal(sig) - } else { - sigfwd(fwdFn, sig, info, ctx) + return false } + + sigfwd(fwdFn, sig, info, ctx) return true } @@ -555,18 +593,6 @@ func sigfwdgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) bool { return false } - // If we aren't handling the signal, forward it. - // Really if we aren't handling the signal, we shouldn't get here, - // but on Darwin setsigstack can lead us here because it sets - // the sa_tramp field. The sa_tramp field is not returned by - // sigaction, so the fix for that is non-obvious. - if atomic.Load(&handlingSig[sig]) == 0 { - sigfwd(fwdFn, sig, info, ctx) - return true - } - - flags := sigtable[sig].flags - c := sigctxt{info, ctx} // Only forward synchronous signals and SIGPIPE. // Unfortunately, user generated SIGPIPEs will also be forwarded, because si_code @@ -678,7 +704,7 @@ func minitSignalStack() { func minitSignalMask() { nmask := getg().m.sigmask for i := range sigtable { - if sigtable[i].flags&_SigUnblock != 0 { + if !blockableSig(uint32(i)) { sigdelset(&nmask, i) } } @@ -694,3 +720,22 @@ func unminitSignals() { signalstack(nil, 0) } } + +// blockableSig returns whether sig may be blocked by the signal mask. +// We never want to block the signals marked _SigUnblock; +// these are the synchronous signals that turn into a Go panic. +// In a Go program--not a c-archive/c-shared--we never want to block +// the signals marked _SigKill or _SigThrow, as otherwise it's possible +// for all running threads to block them and delay their delivery until +// we start a new thread. 
When linked into a C program we let the C code +// decide on the disposition of those signals. +func blockableSig(sig uint32) bool { + flags := sigtable[sig].flags + if flags&_SigUnblock != 0 { + return false + } + if isarchive || islibrary { + return true + } + return flags&(_SigKill|_SigThrow) == 0 +} diff --git a/libgo/go/runtime/sigqueue.go b/libgo/go/runtime/sigqueue.go index cd036ce..b108c39 100644 --- a/libgo/go/runtime/sigqueue.go +++ b/libgo/go/runtime/sigqueue.go @@ -45,13 +45,14 @@ import ( // as there is no connection between handling a signal and receiving one, // but atomic instructions should minimize it. var sig struct { - note note - mask [(_NSIG + 31) / 32]uint32 - wanted [(_NSIG + 31) / 32]uint32 - ignored [(_NSIG + 31) / 32]uint32 - recv [(_NSIG + 31) / 32]uint32 - state uint32 - inuse bool + note note + mask [(_NSIG + 31) / 32]uint32 + wanted [(_NSIG + 31) / 32]uint32 + ignored [(_NSIG + 31) / 32]uint32 + recv [(_NSIG + 31) / 32]uint32 + state uint32 + delivering uint32 + inuse bool } const ( @@ -60,15 +61,20 @@ const ( sigSending ) -// Called from sighandler to send a signal back out of the signal handling thread. -// Reports whether the signal was sent. If not, the caller typically crashes the program. +// sigsend delivers a signal from sighandler to the internal signal delivery queue. +// It reports whether the signal was sent. If not, the caller typically crashes the program. +// It runs from the signal handler, so it's limited in what it can do. func sigsend(s uint32) bool { bit := uint32(1) << uint(s&31) if !sig.inuse || s >= uint32(32*len(sig.wanted)) { return false } + atomic.Xadd(&sig.delivering, 1) + // We are running in the signal handler; defer is not available. + if w := atomic.Load(&sig.wanted[s/32]); w&bit == 0 { + atomic.Xadd(&sig.delivering, -1) return false } @@ -76,6 +82,7 @@ func sigsend(s uint32) bool { for { mask := sig.mask[s/32] if mask&bit != 0 { + atomic.Xadd(&sig.delivering, -1) return true // signal already in queue } if atomic.Cas(&sig.mask[s/32], mask, mask|bit) { @@ -104,6 +111,7 @@ Send: } } + atomic.Xadd(&sig.delivering, -1) return true } @@ -155,6 +163,15 @@ func signal_recv() uint32 { // by the os/signal package. //go:linkname signalWaitUntilIdle os_signal.signalWaitUntilIdle func signalWaitUntilIdle() { + // Although the signals we care about have been removed from + // sig.wanted, it is possible that another thread has received + // a signal, has read from sig.wanted, is now updating sig.mask, + // and has not yet woken up the processor thread. We need to wait + // until all current signal deliveries have completed. + for atomic.Load(&sig.delivering) != 0 { + Gosched() + } + // Although WaitUntilIdle seems like the right name for this // function, the state we are looking for is sigReceiving, not // sigIdle. The sigIdle state is really more like sigProcessing. 
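The sig.delivering counter introduced above closes a race between sigsend and signalWaitUntilIdle: a sender increments the counter before it reads sig.wanted and decrements it on every exit path, so the waiter only returns once all in-flight deliveries have drained. The following standalone sketch shows the same pattern; it is an illustration only, using sync/atomic and runtime.Gosched in place of the runtime-internal primitives, and the names delivering, wanted, send, and waitUntilIdle are invented for the example.

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

var (
	delivering uint32 // senders in flight between increment and decrement
	wanted     uint32 = 1
)

// send mirrors the shape of sigsend: bump delivering before consulting
// wanted, and drop it again on every return path.
func send() bool {
	atomic.AddUint32(&delivering, 1)
	if atomic.LoadUint32(&wanted) == 0 {
		atomic.AddUint32(&delivering, ^uint32(0)) // decrement
		return false
	}
	// ... the real code enqueues the signal here ...
	atomic.AddUint32(&delivering, ^uint32(0))
	return true
}

// waitUntilIdle mirrors signalWaitUntilIdle: yield until no sender is
// between the increment and the final decrement above.
func waitUntilIdle() {
	for atomic.LoadUint32(&delivering) != 0 {
		runtime.Gosched()
	}
}

func main() {
	send()
	atomic.StoreUint32(&wanted, 0)
	waitUntilIdle()
	fmt.Println("idle")
}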
diff --git a/libgo/go/runtime/sizeclasses.go b/libgo/go/runtime/sizeclasses.go index 5366564..9e17b00 100644 --- a/libgo/go/runtime/sizeclasses.go +++ b/libgo/go/runtime/sizeclasses.go @@ -3,73 +3,73 @@ package runtime -// class bytes/obj bytes/span objects waste bytes -// 1 8 8192 1024 0 -// 2 16 8192 512 0 -// 3 32 8192 256 0 -// 4 48 8192 170 32 -// 5 64 8192 128 0 -// 6 80 8192 102 32 -// 7 96 8192 85 32 -// 8 112 8192 73 16 -// 9 128 8192 64 0 -// 10 144 8192 56 128 -// 11 160 8192 51 32 -// 12 176 8192 46 96 -// 13 192 8192 42 128 -// 14 208 8192 39 80 -// 15 224 8192 36 128 -// 16 240 8192 34 32 -// 17 256 8192 32 0 -// 18 288 8192 28 128 -// 19 320 8192 25 192 -// 20 352 8192 23 96 -// 21 384 8192 21 128 -// 22 416 8192 19 288 -// 23 448 8192 18 128 -// 24 480 8192 17 32 -// 25 512 8192 16 0 -// 26 576 8192 14 128 -// 27 640 8192 12 512 -// 28 704 8192 11 448 -// 29 768 8192 10 512 -// 30 896 8192 9 128 -// 31 1024 8192 8 0 -// 32 1152 8192 7 128 -// 33 1280 8192 6 512 -// 34 1408 16384 11 896 -// 35 1536 8192 5 512 -// 36 1792 16384 9 256 -// 37 2048 8192 4 0 -// 38 2304 16384 7 256 -// 39 2688 8192 3 128 -// 40 3072 24576 8 0 -// 41 3200 16384 5 384 -// 42 3456 24576 7 384 -// 43 4096 8192 2 0 -// 44 4864 24576 5 256 -// 45 5376 16384 3 256 -// 46 6144 24576 4 0 -// 47 6528 32768 5 128 -// 48 6784 40960 6 256 -// 49 6912 49152 7 768 -// 50 8192 8192 1 0 -// 51 9472 57344 6 512 -// 52 9728 49152 5 512 -// 53 10240 40960 4 0 -// 54 10880 32768 3 128 -// 55 12288 24576 2 0 -// 56 13568 40960 3 256 -// 57 14336 57344 4 0 -// 58 16384 16384 1 0 -// 59 18432 73728 4 0 -// 60 19072 57344 3 128 -// 61 20480 40960 2 0 -// 62 21760 65536 3 256 -// 63 24576 24576 1 0 -// 64 27264 81920 3 128 -// 65 28672 57344 2 0 -// 66 32768 32768 1 0 +// class bytes/obj bytes/span objects tail waste max waste +// 1 8 8192 1024 0 87.50% +// 2 16 8192 512 0 43.75% +// 3 32 8192 256 0 46.88% +// 4 48 8192 170 32 31.52% +// 5 64 8192 128 0 23.44% +// 6 80 8192 102 32 19.07% +// 7 96 8192 85 32 15.95% +// 8 112 8192 73 16 13.56% +// 9 128 8192 64 0 11.72% +// 10 144 8192 56 128 11.82% +// 11 160 8192 51 32 9.73% +// 12 176 8192 46 96 9.59% +// 13 192 8192 42 128 9.25% +// 14 208 8192 39 80 8.12% +// 15 224 8192 36 128 8.15% +// 16 240 8192 34 32 6.62% +// 17 256 8192 32 0 5.86% +// 18 288 8192 28 128 12.16% +// 19 320 8192 25 192 11.80% +// 20 352 8192 23 96 9.88% +// 21 384 8192 21 128 9.51% +// 22 416 8192 19 288 10.71% +// 23 448 8192 18 128 8.37% +// 24 480 8192 17 32 6.82% +// 25 512 8192 16 0 6.05% +// 26 576 8192 14 128 12.33% +// 27 640 8192 12 512 15.48% +// 28 704 8192 11 448 13.93% +// 29 768 8192 10 512 13.94% +// 30 896 8192 9 128 15.52% +// 31 1024 8192 8 0 12.40% +// 32 1152 8192 7 128 12.41% +// 33 1280 8192 6 512 15.55% +// 34 1408 16384 11 896 14.00% +// 35 1536 8192 5 512 14.00% +// 36 1792 16384 9 256 15.57% +// 37 2048 8192 4 0 12.45% +// 38 2304 16384 7 256 12.46% +// 39 2688 8192 3 128 15.59% +// 40 3072 24576 8 0 12.47% +// 41 3200 16384 5 384 6.22% +// 42 3456 24576 7 384 8.83% +// 43 4096 8192 2 0 15.60% +// 44 4864 24576 5 256 16.65% +// 45 5376 16384 3 256 10.92% +// 46 6144 24576 4 0 12.48% +// 47 6528 32768 5 128 6.23% +// 48 6784 40960 6 256 4.36% +// 49 6912 49152 7 768 3.37% +// 50 8192 8192 1 0 15.61% +// 51 9472 57344 6 512 14.28% +// 52 9728 49152 5 512 3.64% +// 53 10240 40960 4 0 4.99% +// 54 10880 32768 3 128 6.24% +// 55 12288 24576 2 0 11.45% +// 56 13568 40960 3 256 9.99% +// 57 14336 57344 4 0 5.35% +// 58 16384 16384 1 0 12.49% +// 59 18432 73728 4 0 11.11% +// 60 
19072 57344 3 128 3.57% +// 61 20480 40960 2 0 6.87% +// 62 21760 65536 3 256 6.25% +// 63 24576 24576 1 0 11.45% +// 64 27264 81920 3 128 10.00% +// 65 28672 57344 2 0 4.91% +// 66 32768 32768 1 0 12.50% const ( _MaxSmallSize = 32768 diff --git a/libgo/go/runtime/slice.go b/libgo/go/runtime/slice.go index f61f85e..ec5aa64 100644 --- a/libgo/go/runtime/slice.go +++ b/libgo/go/runtime/slice.go @@ -23,6 +23,13 @@ type slice struct { cap int } +// An notInHeapSlice is a slice backed by go:notinheap memory. +type notInHeapSlice struct { + array *notInHeap + len int + cap int +} + // maxElems is a lookup table containing the maximum capacity for a slice. // The index is the size of the slice element. var maxElems = [...]uintptr{ @@ -85,7 +92,7 @@ func makeslice64(et *_type, len64, cap64 int64) slice { // The new slice's length is set to the requested capacity. func growslice(et *_type, old slice, cap int) slice { if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&et)) + callerpc := getcallerpc() racereadrangepc(old.array, uintptr(old.len*int(et.size)), callerpc, funcPC(growslice)) } if msanenabled { @@ -109,12 +116,20 @@ func growslice(et *_type, old slice, cap int) slice { if old.len < 1024 { newcap = doublecap } else { - for newcap < cap { + // Check 0 < newcap to detect overflow + // and prevent an infinite loop. + for 0 < newcap && newcap < cap { newcap += newcap / 4 } + // Set newcap to the requested cap when + // the newcap calculation overflowed. + if newcap <= 0 { + newcap = cap + } } } + var overflow bool var lenmem, newlenmem, capmem uintptr const ptrSize = unsafe.Sizeof((*byte)(nil)) switch et.size { @@ -122,20 +137,37 @@ func growslice(et *_type, old slice, cap int) slice { lenmem = uintptr(old.len) newlenmem = uintptr(cap) capmem = roundupsize(uintptr(newcap)) + overflow = uintptr(newcap) > _MaxMem newcap = int(capmem) case ptrSize: lenmem = uintptr(old.len) * ptrSize newlenmem = uintptr(cap) * ptrSize capmem = roundupsize(uintptr(newcap) * ptrSize) + overflow = uintptr(newcap) > _MaxMem/ptrSize newcap = int(capmem / ptrSize) default: lenmem = uintptr(old.len) * et.size newlenmem = uintptr(cap) * et.size capmem = roundupsize(uintptr(newcap) * et.size) + overflow = uintptr(newcap) > maxSliceCap(et.size) newcap = int(capmem / et.size) } - if cap < old.cap || uintptr(newcap) > maxSliceCap(et.size) { + // The check of overflow (uintptr(newcap) > maxSliceCap(et.size)) + // in addition to capmem > _MaxMem is needed to prevent an overflow + // which can be used to trigger a segfault on 32bit architectures + // with this example program: + // + // type T [1<<27 + 1]int64 + // + // var d T + // var s []T + // + // func main() { + // s = append(s, d, d, d, d) + // print(len(s), "\n") + // } + if cap < old.cap || overflow || capmem > _MaxMem { panic(errorString("growslice: cap out of range")) } @@ -176,7 +208,7 @@ func slicecopy(to, fm slice, width uintptr) int { } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&to)) + callerpc := getcallerpc() pc := funcPC(slicecopy) racewriterangepc(to.array, uintptr(n*int(width)), callerpc, pc) racereadrangepc(fm.array, uintptr(n*int(width)), callerpc, pc) @@ -207,7 +239,7 @@ func slicestringcopy(to []byte, fm string) int { } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&to)) + callerpc := getcallerpc() pc := funcPC(slicestringcopy) racewriterangepc(unsafe.Pointer(&to[0]), uintptr(n), callerpc, pc) } diff --git a/libgo/go/runtime/string.go b/libgo/go/runtime/string.go index 7436ddf..e8df9a6 100644 --- 
a/libgo/go/runtime/string.go +++ b/libgo/go/runtime/string.go @@ -99,7 +99,7 @@ func slicebytetostring(buf *tmpBuf, b []byte) (str string) { if raceenabled { racereadrangepc(unsafe.Pointer(&b[0]), uintptr(l), - getcallerpc(unsafe.Pointer(&buf)), + getcallerpc(), funcPC(slicebytetostring)) } if msanenabled { @@ -145,7 +145,7 @@ func slicebytetostringtmp(b []byte) string { if raceenabled && len(b) > 0 { racereadrangepc(unsafe.Pointer(&b[0]), uintptr(len(b)), - getcallerpc(unsafe.Pointer(&b)), + getcallerpc(), funcPC(slicebytetostringtmp)) } if msanenabled && len(b) > 0 { @@ -194,7 +194,7 @@ func slicerunetostring(buf *tmpBuf, a []rune) string { if raceenabled && len(a) > 0 { racereadrangepc(unsafe.Pointer(&a[0]), uintptr(len(a))*unsafe.Sizeof(a[0]), - getcallerpc(unsafe.Pointer(&buf)), + getcallerpc(), funcPC(slicerunetostring)) } if msanenabled && len(a) > 0 { diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go index 84fa1c7..c454356 100644 --- a/libgo/go/runtime/stubs.go +++ b/libgo/go/runtime/stubs.go @@ -107,16 +107,21 @@ func reflect_memmove(to, from unsafe.Pointer, n uintptr) { func memcmp(a, b unsafe.Pointer, size uintptr) int32 // exported value for testing -var hashLoad = loadFactor +var hashLoad = float32(loadFactorNum) / float32(loadFactorDen) //go:nosplit func fastrand() uint32 { mp := getg().m - fr := mp.fastrand - mx := uint32(int32(fr)>>31) & 0xa8888eef - fr = fr<<1 ^ mx - mp.fastrand = fr - return fr + // Implement xorshift64+: 2 32-bit xorshift sequences added together. + // Shift triplet [17,7,16] was calculated as indicated in Marsaglia's + // Xorshift paper: https://www.jstatsoft.org/article/view/v008i14/xorshift.pdf + // This generator passes the SmallCrush suite, part of TestU01 framework: + // http://simul.iro.umontreal.ca/testu01/tu01.html + s1, s0 := mp.fastrand[0], mp.fastrand[1] + s1 ^= s1 << 17 + s1 = s1 ^ s0 ^ s1>>7 ^ s0>>16 + mp.fastrand[0], mp.fastrand[1] = s0, s1 + return s0 + s1 } //go:nosplit @@ -192,14 +197,16 @@ func publicationBarrier() // getcallerpc returns the program counter (PC) of its caller's caller. // getcallersp returns the stack pointer (SP) of its caller's caller. -// For both, the argp must be a pointer to the caller's first function argument. +// argp must be a pointer to the caller's first function argument. // The implementation may or may not use argp, depending on -// the architecture. +// the architecture. The implementation may be a compiler +// intrinsic; there is not necessarily code implementing this +// on every platform. // // For example: // // func f(arg1, arg2, arg3 int) { -// pc := getcallerpc(unsafe.Pointer(&arg1)) +// pc := getcallerpc() // sp := getcallersp(unsafe.Pointer(&arg1)) // } // @@ -219,7 +226,7 @@ func publicationBarrier() // immediately and can only be passed to nosplit functions. //go:noescape -func getcallerpc(argp unsafe.Pointer) uintptr +func getcallerpc() uintptr //go:noescape func getcallersp(argp unsafe.Pointer) uintptr @@ -430,7 +437,7 @@ func setpagesize(s uintptr) { } } -// Temporary for gccgo until we port mgc.go. +// Called by C code during library initialization. 
//go:linkname runtime_m0 runtime.runtime_m0 func runtime_m0() *m { return &m0 diff --git a/libgo/go/runtime/stubs2.go b/libgo/go/runtime/stubs2.go index 490405d..e760772 100644 --- a/libgo/go/runtime/stubs2.go +++ b/libgo/go/runtime/stubs2.go @@ -23,3 +23,10 @@ func write(fd uintptr, p unsafe.Pointer, n int32) int32 //go:noescape func open(name *byte, mode, perm int32) int32 + +// exitThread terminates the current thread, writing *wait = 0 when +// the stack is safe to reclaim. +func exitThread(wait *uint32) { + // This is never used by gccgo. + throw("exitThread") +} diff --git a/libgo/go/runtime/testdata/testprog/gc.go b/libgo/go/runtime/testdata/testprog/gc.go index 744b610..5424517 100644 --- a/libgo/go/runtime/testdata/testprog/gc.go +++ b/libgo/go/runtime/testdata/testprog/gc.go @@ -25,6 +25,7 @@ func GCSys() { runtime.GC() runtime.ReadMemStats(memstats) sys := memstats.Sys + fmt.Printf("original sys: %#x\n", sys) runtime.MemProfileRate = 0 // disable profiler @@ -36,6 +37,8 @@ func GCSys() { // Should only be using a few MB. // We allocated 100 MB or (if not short) 1 GB. runtime.ReadMemStats(memstats) + fmt.Printf("final sys: %#x\n", memstats.Sys) + fmt.Printf("%#v\n", *memstats) if sys > memstats.Sys { sys = 0 } else { diff --git a/libgo/go/runtime/testdata/testprog/gettid.go b/libgo/go/runtime/testdata/testprog/gettid.go new file mode 100644 index 0000000..1b3e29a --- /dev/null +++ b/libgo/go/runtime/testdata/testprog/gettid.go @@ -0,0 +1,29 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux + +package main + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "syscall" +) + +func gettid() int { + return syscall.Gettid() +} + +func tidExists(tid int) (exists, supported bool) { + stat, err := ioutil.ReadFile(fmt.Sprintf("/proc/self/task/%d/stat", tid)) + if os.IsNotExist(err) { + return false, true + } + // Check if it's a zombie thread. + state := bytes.Fields(stat)[2] + return !(len(state) == 1 && state[0] == 'Z'), true +} diff --git a/libgo/go/runtime/testdata/testprog/gettid_none.go b/libgo/go/runtime/testdata/testprog/gettid_none.go new file mode 100644 index 0000000..036db87 --- /dev/null +++ b/libgo/go/runtime/testdata/testprog/gettid_none.go @@ -0,0 +1,15 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !linux + +package main + +func gettid() int { + return 0 +} + +func tidExists(tid int) (exists, supported bool) { + return false, false +} diff --git a/libgo/go/runtime/testdata/testprog/lockosthread.go b/libgo/go/runtime/testdata/testprog/lockosthread.go new file mode 100644 index 0000000..88c0d12 --- /dev/null +++ b/libgo/go/runtime/testdata/testprog/lockosthread.go @@ -0,0 +1,94 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "os" + "runtime" + "time" +) + +var mainTID int + +func init() { + registerInit("LockOSThreadMain", func() { + // init is guaranteed to run on the main thread. + mainTID = gettid() + }) + register("LockOSThreadMain", LockOSThreadMain) + + registerInit("LockOSThreadAlt", func() { + // Lock the OS thread now so main runs on the main thread. 
+ runtime.LockOSThread() + }) + register("LockOSThreadAlt", LockOSThreadAlt) +} + +func LockOSThreadMain() { + // gettid only works on Linux, so on other platforms this just + // checks that the runtime doesn't do anything terrible. + + // This requires GOMAXPROCS=1 from the beginning to reliably + // start a goroutine on the main thread. + if runtime.GOMAXPROCS(-1) != 1 { + println("requires GOMAXPROCS=1") + os.Exit(1) + } + + ready := make(chan bool, 1) + go func() { + // Because GOMAXPROCS=1, this *should* be on the main + // thread. Stay there. + runtime.LockOSThread() + if mainTID != 0 && gettid() != mainTID { + println("failed to start goroutine on main thread") + os.Exit(1) + } + // Exit with the thread locked, which should exit the + // main thread. + ready <- true + }() + <-ready + time.Sleep(1 * time.Millisecond) + // Check that this goroutine is still running on a different + // thread. + if mainTID != 0 && gettid() == mainTID { + println("goroutine migrated to locked thread") + os.Exit(1) + } + println("OK") +} + +func LockOSThreadAlt() { + // This is running locked to the main OS thread. + + var subTID int + ready := make(chan bool, 1) + go func() { + // This goroutine must be running on a new thread. + runtime.LockOSThread() + subTID = gettid() + ready <- true + // Exit with the thread locked. + }() + <-ready + runtime.UnlockOSThread() + for i := 0; i < 100; i++ { + time.Sleep(1 * time.Millisecond) + // Check that this goroutine is running on a different thread. + if subTID != 0 && gettid() == subTID { + println("locked thread reused") + os.Exit(1) + } + exists, supported := tidExists(subTID) + if !supported || !exists { + goto ok + } + } + println("sub thread", subTID, "still running") + return +ok: + println("OK") +} diff --git a/libgo/go/runtime/testdata/testprog/syscall_windows.go b/libgo/go/runtime/testdata/testprog/syscall_windows.go index 6e6782e..b4b6644 100644 --- a/libgo/go/runtime/testdata/testprog/syscall_windows.go +++ b/libgo/go/runtime/testdata/testprog/syscall_windows.go @@ -4,11 +4,18 @@ package main -import "syscall" +import ( + "internal/syscall/windows" + "runtime" + "sync" + "syscall" + "unsafe" +) func init() { register("RaiseException", RaiseException) register("ZeroDivisionException", ZeroDivisionException) + register("StackMemory", StackMemory) } func RaiseException() { @@ -25,3 +32,39 @@ func ZeroDivisionException() { z := x / y println(z) } + +func getPagefileUsage() (uintptr, error) { + p, err := syscall.GetCurrentProcess() + if err != nil { + return 0, err + } + var m windows.PROCESS_MEMORY_COUNTERS + err = windows.GetProcessMemoryInfo(p, &m, uint32(unsafe.Sizeof(m))) + if err != nil { + return 0, err + } + return m.PagefileUsage, nil +} + +func StackMemory() { + mem1, err := getPagefileUsage() + if err != nil { + panic(err) + } + const threadCount = 100 + var wg sync.WaitGroup + for i := 0; i < threadCount; i++ { + wg.Add(1) + go func() { + runtime.LockOSThread() + wg.Done() + select {} + }() + } + wg.Wait() + mem2, err := getPagefileUsage() + if err != nil { + panic(err) + } + print((mem2 - mem1) / threadCount) +} diff --git a/libgo/go/runtime/testdata/testprogcgo/callback.go b/libgo/go/runtime/testdata/testprogcgo/callback.go index a49fc19..2f7568c 100644 --- a/libgo/go/runtime/testdata/testprogcgo/callback.go +++ b/libgo/go/runtime/testdata/testprogcgo/callback.go @@ -34,6 +34,7 @@ import "C" import ( "fmt" + "os" "runtime" ) @@ -68,7 +69,10 @@ func grow1(x, sum *int) int { } func CgoCallbackGC() { - const P = 100 + P := 100 + if 
os.Getenv("RUNTIME_TESTING_SHORT") != "" { + P = 10 + } done := make(chan bool) // allocate a bunch of stack frames and spray them with pointers for i := 0; i < P; i++ { diff --git a/libgo/go/runtime/testdata/testprogcgo/catchpanic.go b/libgo/go/runtime/testdata/testprogcgo/catchpanic.go new file mode 100644 index 0000000..55a606d --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/catchpanic.go @@ -0,0 +1,46 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +package main + +/* +#include +#include +#include + +static void abrthandler(int signum) { + if (signum == SIGABRT) { + exit(0); // success + } +} + +void registerAbortHandler() { + struct sigaction act; + memset(&act, 0, sizeof act); + act.sa_handler = abrthandler; + sigaction(SIGABRT, &act, NULL); +} + +static void __attribute__ ((constructor)) sigsetup(void) { + if (getenv("CGOCATCHPANIC_EARLY_HANDLER") == NULL) + return; + registerAbortHandler(); +} +*/ +import "C" +import "os" + +func init() { + register("CgoCatchPanic", CgoCatchPanic) +} + +// Test that the SIGABRT raised by panic can be caught by an early signal handler. +func CgoCatchPanic() { + if _, ok := os.LookupEnv("CGOCATCHPANIC_EARLY_HANDLER"); !ok { + C.registerAbortHandler() + } + panic("catch me") +} diff --git a/libgo/go/runtime/testdata/testprogcgo/cgo.go b/libgo/go/runtime/testdata/testprogcgo/cgo.go index 209524a..a587db3 100644 --- a/libgo/go/runtime/testdata/testprogcgo/cgo.go +++ b/libgo/go/runtime/testdata/testprogcgo/cgo.go @@ -52,7 +52,11 @@ func CgoSignalDeadlock() { time.Sleep(time.Millisecond) start := time.Now() var times []time.Duration - for i := 0; i < 64; i++ { + n := 64 + if os.Getenv("RUNTIME_TEST_SHORT") != "" { + n = 16 + } + for i := 0; i < n; i++ { go func() { runtime.LockOSThread() select {} diff --git a/libgo/go/runtime/testdata/testprogcgo/lockosthread.c b/libgo/go/runtime/testdata/testprogcgo/lockosthread.c new file mode 100644 index 0000000..b10cc4f --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/lockosthread.c @@ -0,0 +1,13 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +#include + +uint32_t threadExited; + +void setExited(void *x) { + __sync_fetch_and_add(&threadExited, 1); +} diff --git a/libgo/go/runtime/testdata/testprogcgo/lockosthread.go b/libgo/go/runtime/testdata/testprogcgo/lockosthread.go new file mode 100644 index 0000000..36423d9 --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/lockosthread.go @@ -0,0 +1,111 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +package main + +import ( + "os" + "runtime" + "sync/atomic" + "time" + "unsafe" +) + +/* +#include +#include + +extern uint32_t threadExited; + +void setExited(void *x); +*/ +import "C" + +var mainThread C.pthread_t + +func init() { + registerInit("LockOSThreadMain", func() { + // init is guaranteed to run on the main thread. + mainThread = C.pthread_self() + }) + register("LockOSThreadMain", LockOSThreadMain) + + registerInit("LockOSThreadAlt", func() { + // Lock the OS thread now so main runs on the main thread. 
+ runtime.LockOSThread() + }) + register("LockOSThreadAlt", LockOSThreadAlt) +} + +func LockOSThreadMain() { + // This requires GOMAXPROCS=1 from the beginning to reliably + // start a goroutine on the main thread. + if runtime.GOMAXPROCS(-1) != 1 { + println("requires GOMAXPROCS=1") + os.Exit(1) + } + + ready := make(chan bool, 1) + go func() { + // Because GOMAXPROCS=1, this *should* be on the main + // thread. Stay there. + runtime.LockOSThread() + self := C.pthread_self() + if C.pthread_equal(mainThread, self) == 0 { + println("failed to start goroutine on main thread") + os.Exit(1) + } + // Exit with the thread locked, which should exit the + // main thread. + ready <- true + }() + <-ready + time.Sleep(1 * time.Millisecond) + // Check that this goroutine is still running on a different + // thread. + self := C.pthread_self() + if C.pthread_equal(mainThread, self) != 0 { + println("goroutine migrated to locked thread") + os.Exit(1) + } + println("OK") +} + +func LockOSThreadAlt() { + // This is running locked to the main OS thread. + + var subThread C.pthread_t + ready := make(chan bool, 1) + C.threadExited = 0 + go func() { + // This goroutine must be running on a new thread. + runtime.LockOSThread() + subThread = C.pthread_self() + // Register a pthread destructor so we can tell this + // thread has exited. + var key C.pthread_key_t + C.pthread_key_create(&key, (*[0]byte)(unsafe.Pointer(C.setExited))) + C.pthread_setspecific(key, unsafe.Pointer(new(int))) + ready <- true + // Exit with the thread locked. + }() + <-ready + for i := 0; i < 100; i++ { + time.Sleep(1 * time.Millisecond) + // Check that this goroutine is running on a different thread. + self := C.pthread_self() + if C.pthread_equal(subThread, self) != 0 { + println("locked thread reused") + os.Exit(1) + } + if atomic.LoadUint32((*uint32)(&C.threadExited)) != 0 { + println("OK") + return + } + } + println("sub thread still running") + os.Exit(1) +} diff --git a/libgo/go/runtime/testdata/testprogcgo/sigstack.go b/libgo/go/runtime/testdata/testprogcgo/sigstack.go new file mode 100644 index 0000000..e30a559 --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/sigstack.go @@ -0,0 +1,95 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +// Test handling of Go-allocated signal stacks when calling from +// C-created threads with and without signal stacks. (See issue +// #22930.) + +package main + +/* +#include +#include +#include +#include +#include + +#ifndef MAP_STACK +#define MAP_STACK 0 +#endif + +extern void SigStackCallback(); + +static void* WithSigStack(void* arg __attribute__((unused))) { + // Set up an alternate system stack. + void* base = mmap(0, SIGSTKSZ, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON|MAP_STACK, -1, 0); + if (base == MAP_FAILED) { + perror("mmap failed"); + abort(); + } + stack_t st = {}, ost = {}; + st.ss_sp = (char*)base; + st.ss_flags = 0; + st.ss_size = SIGSTKSZ; + if (sigaltstack(&st, &ost) < 0) { + perror("sigaltstack failed"); + abort(); + } + + // Call Go. + SigStackCallback(); + + // Disable signal stack and protect it so we can detect reuse. + if (ost.ss_flags & SS_DISABLE) { + // Darwin libsystem has a bug where it checks ss_size + // even if SS_DISABLE is set. (The kernel gets it right.) 
+ ost.ss_size = SIGSTKSZ; + } + if (sigaltstack(&ost, NULL) < 0) { + perror("sigaltstack restore failed"); + abort(); + } + mprotect(base, SIGSTKSZ, PROT_NONE); + return NULL; +} + +static void* WithoutSigStack(void* arg __attribute__((unused))) { + SigStackCallback(); + return NULL; +} + +static void DoThread(int sigstack) { + pthread_t tid; + if (sigstack) { + pthread_create(&tid, NULL, WithSigStack, NULL); + } else { + pthread_create(&tid, NULL, WithoutSigStack, NULL); + } + pthread_join(tid, NULL); +} +*/ +import "C" + +func init() { + register("SigStack", SigStack) +} + +func SigStack() { + C.DoThread(0) + C.DoThread(1) + C.DoThread(0) + C.DoThread(1) + println("OK") +} + +var BadPtr *int + +//export SigStackCallback +func SigStackCallback() { + // Cause the Go signal handler to run. + defer func() { recover() }() + *BadPtr = 42 +} diff --git a/libgo/go/runtime/testdata/testprogcgo/stack_windows.go b/libgo/go/runtime/testdata/testprogcgo/stack_windows.go new file mode 100644 index 0000000..846297a --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/stack_windows.go @@ -0,0 +1,54 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "C" +import ( + "internal/syscall/windows" + "runtime" + "sync" + "syscall" + "unsafe" +) + +func init() { + register("StackMemory", StackMemory) +} + +func getPagefileUsage() (uintptr, error) { + p, err := syscall.GetCurrentProcess() + if err != nil { + return 0, err + } + var m windows.PROCESS_MEMORY_COUNTERS + err = windows.GetProcessMemoryInfo(p, &m, uint32(unsafe.Sizeof(m))) + if err != nil { + return 0, err + } + return m.PagefileUsage, nil +} + +func StackMemory() { + mem1, err := getPagefileUsage() + if err != nil { + panic(err) + } + const threadCount = 100 + var wg sync.WaitGroup + for i := 0; i < threadCount; i++ { + wg.Add(1) + go func() { + runtime.LockOSThread() + wg.Done() + select {} + }() + } + wg.Wait() + mem2, err := getPagefileUsage() + if err != nil { + panic(err) + } + print((mem2 - mem1) / threadCount) +} diff --git a/libgo/go/runtime/time.go b/libgo/go/runtime/time.go index f204830..93181fd 100644 --- a/libgo/go/runtime/time.go +++ b/libgo/go/runtime/time.go @@ -6,14 +6,18 @@ package runtime -import "unsafe" +import ( + "runtime/internal/sys" + "unsafe" +) // Package time knows the layout of this structure. // If this struct changes, adjust ../time/sleep.go:/runtimeTimer. // For GOOS=nacl, package syscall knows the layout of this structure. // If this struct changes, adjust ../syscall/net_nacl.go:/runtimeTimer. type timer struct { - i int // heap index + tb *timersBucket // the bucket the timer lives in + i int // heap index // Timer wakes up at when, and then at when+period, ... (period > 0 only) // each time calling f(arg, now) in the timer goroutine, so f must be @@ -25,7 +29,37 @@ type timer struct { seq uintptr } -var timers struct { +// timersLen is the length of timers array. +// +// Ideally, this would be set to GOMAXPROCS, but that would require +// dynamic reallocation +// +// The current value is a compromise between memory usage and performance +// that should cover the majority of GOMAXPROCS values used in the wild. +const timersLen = 64 + +// timers contains "per-P" timer heaps. +// +// Timers are queued into timersBucket associated with the current P, +// so each P may work with its own timers independently of other P instances. 
+// +// Each timersBucket may be associated with multiple P +// if GOMAXPROCS > timersLen. +var timers [timersLen]struct { + timersBucket + + // The padding should eliminate false sharing + // between timersBucket values. + pad [sys.CacheLineSize - unsafe.Sizeof(timersBucket{})%sys.CacheLineSize]byte +} + +func (t *timer) assignBucket() *timersBucket { + id := uint8(getg().m.p.ptr().id) % timersLen + t.tb = &timers[id].timersBucket + return t.tb +} + +type timersBucket struct { lock mutex gp *g created bool @@ -51,18 +85,20 @@ func timeSleep(ns int64) { return } - t := getg().timer + gp := getg() + t := gp.timer if t == nil { t = new(timer) - getg().timer = t + gp.timer = t } *t = timer{} t.when = nanotime() + ns t.f = goroutineReady - t.arg = getg() - lock(&timers.lock) - addtimerLocked(t) - goparkunlock(&timers.lock, "sleep", traceEvGoSleep, 2) + t.arg = gp + tb := t.assignBucket() + lock(&tb.lock) + tb.addtimerLocked(t) + goparkunlock(&tb.lock, "sleep", traceEvGoSleep, 2) } // startTimer adds t to the timer heap. @@ -89,90 +125,98 @@ func goroutineReady(arg interface{}, seq uintptr) { } func addtimer(t *timer) { - lock(&timers.lock) - addtimerLocked(t) - unlock(&timers.lock) + tb := t.assignBucket() + lock(&tb.lock) + tb.addtimerLocked(t) + unlock(&tb.lock) } // Add a timer to the heap and start or kick timerproc if the new timer is // earlier than any of the others. // Timers are locked. -func addtimerLocked(t *timer) { +func (tb *timersBucket) addtimerLocked(t *timer) { // when must never be negative; otherwise timerproc will overflow // during its delta calculation and never expire other runtime timers. if t.when < 0 { t.when = 1<<63 - 1 } - t.i = len(timers.t) - timers.t = append(timers.t, t) - siftupTimer(t.i) + t.i = len(tb.t) + tb.t = append(tb.t, t) + siftupTimer(tb.t, t.i) if t.i == 0 { // siftup moved to top: new earliest deadline. - if timers.sleeping { - timers.sleeping = false - notewakeup(&timers.waitnote) + if tb.sleeping { + tb.sleeping = false + notewakeup(&tb.waitnote) } - if timers.rescheduling { - timers.rescheduling = false - goready(timers.gp, 0) + if tb.rescheduling { + tb.rescheduling = false + goready(tb.gp, 0) } } - if !timers.created { - timers.created = true + if !tb.created { + tb.created = true expectSystemGoroutine() - go timerproc() + go timerproc(tb) } } // Delete timer t from the heap. // Do not need to update the timerproc: if it wakes up early, no big deal. func deltimer(t *timer) bool { - // Dereference t so that any panic happens before the lock is held. - // Discard result, because t might be moving in the heap. - _ = t.i + if t.tb == nil { + // t.tb can be nil if the user created a timer + // directly, without invoking startTimer e.g + // time.Ticker{C: c} + // In this case, return early without any deletion. + // See Issue 21874. + return false + } - lock(&timers.lock) + tb := t.tb + + lock(&tb.lock) // t may not be registered anymore and may have // a bogus i (typically 0, if generated by Go). // Verify it before proceeding. 
i := t.i - last := len(timers.t) - 1 - if i < 0 || i > last || timers.t[i] != t { - unlock(&timers.lock) + last := len(tb.t) - 1 + if i < 0 || i > last || tb.t[i] != t { + unlock(&tb.lock) return false } if i != last { - timers.t[i] = timers.t[last] - timers.t[i].i = i + tb.t[i] = tb.t[last] + tb.t[i].i = i } - timers.t[last] = nil - timers.t = timers.t[:last] + tb.t[last] = nil + tb.t = tb.t[:last] if i != last { - siftupTimer(i) - siftdownTimer(i) + siftupTimer(tb.t, i) + siftdownTimer(tb.t, i) } - unlock(&timers.lock) + unlock(&tb.lock) return true } // Timerproc runs the time-driven events. -// It sleeps until the next event in the timers heap. +// It sleeps until the next event in the tb heap. // If addtimer inserts a new earlier event, it wakes timerproc early. -func timerproc() { +func timerproc(tb *timersBucket) { setSystemGoroutine() - timers.gp = getg() + tb.gp = getg() for { - lock(&timers.lock) - timers.sleeping = false + lock(&tb.lock) + tb.sleeping = false now := nanotime() delta := int64(-1) for { - if len(timers.t) == 0 { + if len(tb.t) == 0 { delta = -1 break } - t := timers.t[0] + t := tb.t[0] delta = t.when - now if delta > 0 { break @@ -180,43 +224,43 @@ func timerproc() { if t.period > 0 { // leave in heap but adjust next time to fire t.when += t.period * (1 + -delta/t.period) - siftdownTimer(0) + siftdownTimer(tb.t, 0) } else { // remove from heap - last := len(timers.t) - 1 + last := len(tb.t) - 1 if last > 0 { - timers.t[0] = timers.t[last] - timers.t[0].i = 0 + tb.t[0] = tb.t[last] + tb.t[0].i = 0 } - timers.t[last] = nil - timers.t = timers.t[:last] + tb.t[last] = nil + tb.t = tb.t[:last] if last > 0 { - siftdownTimer(0) + siftdownTimer(tb.t, 0) } t.i = -1 // mark as removed } f := t.f arg := t.arg seq := t.seq - unlock(&timers.lock) + unlock(&tb.lock) if raceenabled { raceacquire(unsafe.Pointer(t)) } f(arg, seq) - lock(&timers.lock) + lock(&tb.lock) } if delta < 0 || faketime > 0 { // No timers left - put goroutine to sleep. - timers.rescheduling = true - goparkunlock(&timers.lock, "timer goroutine (idle)", traceEvGoBlock, 1) + tb.rescheduling = true + goparkunlock(&tb.lock, "timer goroutine (idle)", traceEvGoBlock, 1) continue } // At least one timer pending. Sleep until then. - timers.sleeping = true - timers.sleepUntil = now + delta - noteclear(&timers.waitnote) - unlock(&timers.lock) - notetsleepg(&timers.waitnote, delta) + tb.sleeping = true + tb.sleepUntil = now + delta + noteclear(&tb.waitnote) + unlock(&tb.lock) + notetsleepg(&tb.waitnote, delta) } } @@ -225,28 +269,67 @@ func timejump() *g { return nil } - lock(&timers.lock) - if !timers.created || len(timers.t) == 0 { - unlock(&timers.lock) + for i := range timers { + lock(&timers[i].lock) + } + gp := timejumpLocked() + for i := range timers { + unlock(&timers[i].lock) + } + + return gp +} + +func timejumpLocked() *g { + // Determine a timer bucket with minimum when. + var minT *timer + for i := range timers { + tb := &timers[i] + if !tb.created || len(tb.t) == 0 { + continue + } + t := tb.t[0] + if minT == nil || t.when < minT.when { + minT = t + } + } + if minT == nil || minT.when <= faketime { + return nil + } + + faketime = minT.when + tb := minT.tb + if !tb.rescheduling { return nil } + tb.rescheduling = false + return tb.gp +} + +func timeSleepUntil() int64 { + next := int64(1<<63 - 1) - var gp *g - if faketime < timers.t[0].when { - faketime = timers.t[0].when - if timers.rescheduling { - timers.rescheduling = false - gp = timers.gp + // Determine minimum sleepUntil across all the timer buckets. 
+ // + // The function can not return a precise answer, + // as another timer may pop in as soon as timers have been unlocked. + // So lock the timers one by one instead of all at once. + for i := range timers { + tb := &timers[i] + + lock(&tb.lock) + if tb.sleeping && tb.sleepUntil < next { + next = tb.sleepUntil } + unlock(&tb.lock) } - unlock(&timers.lock) - return gp + + return next } // Heap maintenance algorithms. -func siftupTimer(i int) { - t := timers.t +func siftupTimer(t []*timer, i int) { when := t[i].when tmp := t[i] for i > 0 { @@ -256,14 +339,15 @@ func siftupTimer(i int) { } t[i] = t[p] t[i].i = i - t[p] = tmp - t[p].i = p i = p } + if tmp != t[i] { + t[i] = tmp + t[i].i = i + } } -func siftdownTimer(i int) { - t := timers.t +func siftdownTimer(t []*timer, i int) { n := len(t) when := t[i].when tmp := t[i] @@ -294,10 +378,12 @@ func siftdownTimer(i int) { } t[i] = t[c] t[i].i = i - t[c] = tmp - t[c].i = c i = c } + if tmp != t[i] { + t[i] = tmp + t[i].i = i + } } // Entry points for net, time to call nanotime. @@ -312,4 +398,10 @@ func time_runtimeNano() int64 { return nanotime() } -var startNano int64 = nanotime() +// Monotonic times are reported as offsets from startNano. +// We initialize startNano to nanotime() - 1 so that on systems where +// monotonic time resolution is fairly low (e.g. Windows 2008 +// which appears to have a default resolution of 15ms), +// we avoid ever reporting a nanotime of 0. +// (Callers may want to use 0 as "time not set".) +var startNano int64 = nanotime() - 1 diff --git a/libgo/go/runtime/trace.go b/libgo/go/runtime/trace.go index af9313b..8427e76 100644 --- a/libgo/go/runtime/trace.go +++ b/libgo/go/runtime/trace.go @@ -28,8 +28,8 @@ const ( traceEvProcStop = 6 // stop of P [timestamp] traceEvGCStart = 7 // GC start [timestamp, seq, stack id] traceEvGCDone = 8 // GC done [timestamp] - traceEvGCScanStart = 9 // GC mark termination start [timestamp] - traceEvGCScanDone = 10 // GC mark termination done [timestamp] + traceEvGCSTWStart = 9 // GC STW start [timestamp, kind] + traceEvGCSTWDone = 10 // GC STW done [timestamp] traceEvGCSweepStart = 11 // GC sweep start [timestamp, stack id] traceEvGCSweepDone = 12 // GC sweep done [timestamp, swept, reclaimed] traceEvGoCreate = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id] @@ -235,21 +235,21 @@ func StartTrace() error { trace.timeStart = nanotime() trace.headerWritten = false trace.footerWritten = false - trace.strings = make(map[string]uint64) + + // string to id mapping + // 0 : reserved for an empty string + // remaining: other strings registered by traceString trace.stringSeq = 0 + trace.strings = make(map[string]uint64) + trace.seqGC = 0 _g_.m.startingtrace = false trace.enabled = true // Register runtime goroutine labels. _, pid, bufp := traceAcquireBuffer() - buf := (*bufp).ptr() - if buf == nil { - buf = traceFlush(0).ptr() - (*bufp).set(buf) - } for i, label := range gcMarkWorkerModeStrings[:] { - trace.markWorkerLabels[i], buf = traceString(buf, label) + trace.markWorkerLabels[i], bufp = traceString(bufp, pid, label) } traceReleaseBuffer(pid) @@ -277,10 +277,9 @@ func StopTrace() { traceGoSched() - for _, p := range &allp { - if p == nil { - break - } + // Loop over all allocated Ps because dead Ps may still have + // trace buffers. + for _, p := range allp[:cap(allp)] { buf := p.tracebuf if buf != 0 { traceFullQueue(buf) @@ -320,10 +319,7 @@ func StopTrace() { // The lock protects us from races with StartTrace/StopTrace because they do stop-the-world. 
lock(&trace.lock) - for _, p := range &allp { - if p == nil { - break - } + for _, p := range allp[:cap(allp)] { if p.tracebuf != 0 { throw("trace: non-empty trace buffer in proc") } @@ -382,7 +378,7 @@ func ReadTrace() []byte { trace.headerWritten = true trace.lockOwner = nil unlock(&trace.lock) - return []byte("go 1.9 trace\x00\x00\x00\x00") + return []byte("go 1.10 trace\x00\x00\x00") } // Wait for new data. if trace.fullHead == 0 && !trace.shutdown { @@ -408,9 +404,12 @@ func ReadTrace() []byte { var data []byte data = append(data, traceEvFrequency|0<= 0 { @@ -602,7 +595,7 @@ func traceReleaseBuffer(pid int32) { } // traceFlush puts buf onto stack of full buffers and returns an empty buffer. -func traceFlush(buf traceBufPtr) traceBufPtr { +func traceFlush(buf traceBufPtr, pid int32) traceBufPtr { owner := trace.lockOwner dolock := owner == nil || owner != getg().m.curg if dolock { @@ -623,34 +616,51 @@ func traceFlush(buf traceBufPtr) traceBufPtr { bufp := buf.ptr() bufp.link.set(nil) bufp.pos = 0 - bufp.lastTicks = 0 + + // initialize the buffer for a new batch + ticks := uint64(cputicks()) / traceTickDiv + bufp.lastTicks = ticks + bufp.byte(traceEvBatch | 1< maxLen { fn = fn[len(fn)-maxLen:] } - frame.funcID, buf = traceString(buf, fn) + frame.funcID, bufp = traceString(bufp, pid, fn) frame.line = uint64(f.lineno) file := f.filename if len(file) > maxLen { file = file[len(file)-maxLen:] } - frame.fileID, buf = traceString(buf, file) - return frame, buf + frame.fileID, bufp = traceString(bufp, pid, file) + return frame, (*bufp) } // traceAlloc is a non-thread-safe region allocator. @@ -917,12 +931,12 @@ func traceGCDone() { traceEvent(traceEvGCDone, -1) } -func traceGCScanStart() { - traceEvent(traceEvGCScanStart, -1) +func traceGCSTWStart(kind int) { + traceEvent(traceEvGCSTWStart, -1, uint64(kind)) } -func traceGCScanDone() { - traceEvent(traceEvGCScanDone, -1) +func traceGCSTWDone() { + traceEvent(traceEvGCSTWDone, -1) } // traceGCSweepStart prepares to trace a sweep loop. This does not diff --git a/libgo/go/runtime/trace/example_test.go b/libgo/go/runtime/trace/example_test.go new file mode 100644 index 0000000..8e0ee5a --- /dev/null +++ b/libgo/go/runtime/trace/example_test.go @@ -0,0 +1,41 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +package trace_test + +import ( + "fmt" + "log" + "os" + "runtime/trace" +) + +// Example demonstrates the use of the trace package to trace +// the execution of a Go program. The trace output will be +// written to the file trace.out +func Example() { + f, err := os.Create("trace.out") + if err != nil { + log.Fatalf("failed to create trace output file: %v", err) + } + defer func() { + if err := f.Close(); err != nil { + log.Fatalf("failed to close trace file: %v", err) + } + }() + + if err := trace.Start(f); err != nil { + log.Fatalf("failed to start trace: %v", err) + } + defer trace.Stop() + + // your program here + RunMyProgram() +} + +func RunMyProgram() { + fmt.Printf("this function will be traced") +} diff --git a/libgo/go/runtime/trace/trace.go b/libgo/go/runtime/trace/trace.go index 7cbb8a6..439f998 100644 --- a/libgo/go/runtime/trace/trace.go +++ b/libgo/go/runtime/trace/trace.go @@ -2,13 +2,36 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Go execution tracer. 
-// The tracer captures a wide range of execution events like goroutine -// creation/blocking/unblocking, syscall enter/exit/block, GC-related events, -// changes of heap size, processor start/stop, etc and writes them to an io.Writer -// in a compact form. A precise nanosecond-precision timestamp and a stack -// trace is captured for most events. A trace can be analyzed later with -// 'go tool trace' command. +// Package trace contains facilities for programs to generate trace +// for Go execution tracer. +// +// The execution trace captures a wide range of execution events such as +// goroutine creation/blocking/unblocking, syscall enter/exit/block, +// GC-related events, changes of heap size, processor start/stop, etc. +// A precise nanosecond-precision timestamp and a stack trace is +// captured for most events. The generated trace can be interpreted +// using `go tool trace`. +// +// Tracing a Go program +// +// Support for tracing tests and benchmarks built with the standard +// testing package is built into `go test`. For example, the following +// command runs the test in the current directory and writes the trace +// file (trace.out). +// +// go test -trace=test.out +// +// This runtime/trace package provides APIs to add equivalent tracing +// support to a standalone program. See the Example that demonstrates +// how to use this API to enable tracing. +// +// There is also a standard HTTP interface to profiling data. Adding the +// following line will install handlers under the /debug/pprof/trace URL +// to download live profiles: +// +// import _ "net/http/pprof" +// +// See the net/http/pprof package for more details. package trace import ( diff --git a/libgo/go/runtime/trace/trace_test.go b/libgo/go/runtime/trace/trace_test.go index c5f64fc..5fa5b82 100644 --- a/libgo/go/runtime/trace/trace_test.go +++ b/libgo/go/runtime/trace/trace_test.go @@ -7,6 +7,7 @@ package trace_test import ( "bytes" "flag" + "internal/race" "internal/trace" "io" "io/ioutil" @@ -14,6 +15,7 @@ import ( "os" "runtime" . "runtime/trace" + "strconv" "sync" "testing" "time" @@ -23,6 +25,61 @@ var ( saveTraces = flag.Bool("savetraces", false, "save traces collected by tests") ) +// TestEventBatch tests Flush calls that happen during Start +// don't produce corrupted traces. +func TestEventBatch(t *testing.T) { + if race.Enabled { + t.Skip("skipping in race mode") + } + if testing.Short() { + t.Skip("skipping in short mode") + } + // During Start, bunch of records are written to reflect the current + // snapshot of the program, including state of each goroutines. + // And some string constants are written to the trace to aid trace + // parsing. This test checks Flush of the buffer occurred during + // this process doesn't cause corrupted traces. + // When a Flush is called during Start is complicated + // so we test with a range of number of goroutines hoping that one + // of them triggers Flush. + // This range was chosen to fill up a ~64KB buffer with traceEvGoCreate + // and traceEvGoWaiting events (12~13bytes per goroutine). 
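// (Rough arithmetic behind that range: at 12-13 bytes per goroutine event,
// a ~64KB buffer holds about 65536/13 ≈ 5041 entries, so sweeping goroutine
// counts just around 5000, as below, is likely to trigger a flush during Start.)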
+ for g := 4950; g < 5050; g++ { + n := g + t.Run("G="+strconv.Itoa(n), func(t *testing.T) { + var wg sync.WaitGroup + wg.Add(n) + + in := make(chan bool, 1000) + for i := 0; i < n; i++ { + go func() { + <-in + wg.Done() + }() + } + buf := new(bytes.Buffer) + if err := Start(buf); err != nil { + t.Fatalf("failed to start tracing: %v", err) + } + + for i := 0; i < n; i++ { + in <- true + } + wg.Wait() + Stop() + + _, err := trace.Parse(buf, "") + if err == trace.ErrTimeOrder { + t.Skipf("skipping trace: %v", err) + } + + if err != nil { + t.Fatalf("failed to parse trace: %v", err) + } + }) + } +} + func TestTraceStartStop(t *testing.T) { buf := new(bytes.Buffer) if err := Start(buf); err != nil { diff --git a/libgo/go/runtime/traceback_gccgo.go b/libgo/go/runtime/traceback_gccgo.go index 37c5698..79f78d8 100644 --- a/libgo/go/runtime/traceback_gccgo.go +++ b/libgo/go/runtime/traceback_gccgo.go @@ -52,7 +52,7 @@ func c_callers(skip int32, locbuf *location, max int32, keepThunks bool) int32 // callers returns a stack trace of the current goroutine. // The gc version of callers takes []uintptr, but we take []location. func callers(skip int, locbuf []location) int { - n := c_callers(int32(skip), &locbuf[0], int32(len(locbuf)), false) + n := c_callers(int32(skip)+1, &locbuf[0], int32(len(locbuf)), false) return int(n) } @@ -156,7 +156,7 @@ func goroutineheader(gp *g) { if waitfor >= 1 { print(", ", waitfor, " minutes") } - if gp.lockedm != nil { + if gp.lockedm != 0 { print(", locked to thread") } print("]:\n") -- cgit v1.1 From e4876be5f5c5524ea742527100e36c5095181b28 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Wed, 10 Jan 2018 05:15:52 +0000 Subject: runtime: noescape some functions/variables This is in preparation of turning on escape analysis for the runtime. - In gccgo, systemstack is implemented with mcall, which is not go:noescape. Wrap the closure in noescape so the escape analysis does not think it escapes. - Mark some C functions go:noescape. They do not leak arguments. - Use noescape function to make a few local variables' addresses not escape. The escape analysis cannot figure out because they are assigned to pointer indirections. Reviewed-on: https://go-review.googlesource.com/86244 From-SVN: r256418 --- libgo/go/runtime/panic.go | 10 +++++----- libgo/go/runtime/proc.go | 4 ++++ libgo/go/runtime/signal_gccgo.go | 8 ++++++++ libgo/go/runtime/stubs.go | 9 +++++++-- libgo/go/runtime/traceback_gccgo.go | 7 ++++--- 5 files changed, 28 insertions(+), 10 deletions(-) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/panic.go b/libgo/go/runtime/panic.go index 5cc325f..b2deb6e 100644 --- a/libgo/go/runtime/panic.go +++ b/libgo/go/runtime/panic.go @@ -201,7 +201,7 @@ func deferreturn(frame *bool) { // The gc compiler does this using assembler // code in jmpdefer. 
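// Background for the change below: pfn holds the raw code pointer of the
// deferred function, and a Go func value is itself a pointer to a word that
// holds a code address, so storing the address of pfn into fn's
// representation produces a callable func with no closure. Wrapping &pfn in
// noescape hides that address from the escape analysis, so pfn is not
// forced to the heap once escape analysis is enabled for the runtime.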
var fn func(unsafe.Pointer) - *(*uintptr)(unsafe.Pointer(&fn)) = uintptr(unsafe.Pointer(&pfn)) + *(*uintptr)(unsafe.Pointer(&fn)) = uintptr(noescape(unsafe.Pointer(&pfn))) fn(d.arg) } @@ -264,7 +264,7 @@ func checkdefer(frame *bool) { var p _panic p.isforeign = true p.link = gp._panic - gp._panic = &p + gp._panic = (*_panic)(noescape(unsafe.Pointer(&p))) for { d := gp._defer if d == nil || d.frame != frame || d.pfn == 0 { @@ -275,7 +275,7 @@ func checkdefer(frame *bool) { gp._defer = d.link var fn func(unsafe.Pointer) - *(*uintptr)(unsafe.Pointer(&fn)) = uintptr(unsafe.Pointer(&pfn)) + *(*uintptr)(unsafe.Pointer(&fn)) = uintptr(noescape(unsafe.Pointer(&pfn))) fn(d.arg) freedefer(d) @@ -368,7 +368,7 @@ func Goexit() { d.pfn = 0 var fn func(unsafe.Pointer) - *(*uintptr)(unsafe.Pointer(&fn)) = uintptr(unsafe.Pointer(&pfn)) + *(*uintptr)(unsafe.Pointer(&fn)) = uintptr(noescape(unsafe.Pointer(&pfn))) fn(d.arg) if gp._defer != d { @@ -491,7 +491,7 @@ func gopanic(e interface{}) { d._panic = p var fn func(unsafe.Pointer) - *(*uintptr)(unsafe.Pointer(&fn)) = uintptr(unsafe.Pointer(&pfn)) + *(*uintptr)(unsafe.Pointer(&fn)) = uintptr(noescape(unsafe.Pointer(&pfn))) fn(d.arg) if gp._defer != d { diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go index 1ea4152..515efaa 100644 --- a/libgo/go/runtime/proc.go +++ b/libgo/go/runtime/proc.go @@ -46,7 +46,11 @@ import ( // C functions for thread and context management. func newosproc(*m) + +//go:noescape func malg(bool, bool, *unsafe.Pointer, *uintptr) *g + +//go:noescape func resetNewG(*g, *unsafe.Pointer, *uintptr) func gogo(*g) func setGContext() diff --git a/libgo/go/runtime/signal_gccgo.go b/libgo/go/runtime/signal_gccgo.go index 6fe7ba1..92143ea 100644 --- a/libgo/go/runtime/signal_gccgo.go +++ b/libgo/go/runtime/signal_gccgo.go @@ -13,24 +13,31 @@ import ( // Functions for gccgo to support signal handling. In the gc runtime // these are written in OS-specific files and in assembler. 
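// The //go:noescape lines added below assert that these C functions do not
// retain their pointer arguments, so the escape analysis may keep the
// corresponding values on the caller's stack. For illustration only (these
// variable names do not appear in the patch), a caller such as
//
//	var sa _sigaction
//	sigaction(sig, nil, &sa)
//
// no longer needs sa to be heap allocated merely because its address is
// passed to C.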
+//go:noescape //extern sigaction func sigaction(signum uint32, act *_sigaction, oact *_sigaction) int32 +//go:noescape //extern sigprocmask func sigprocmask(how int32, set *sigset, oldset *sigset) int32 +//go:noescape //extern sigfillset func sigfillset(set *sigset) int32 +//go:noescape //extern sigemptyset func sigemptyset(set *sigset) int32 +//go:noescape //extern sigaddset func c_sigaddset(set *sigset, signum uint32) int32 +//go:noescape //extern sigdelset func c_sigdelset(set *sigset, signum uint32) int32 +//go:noescape //extern sigaltstack func sigaltstack(ss *_stack_t, oss *_stack_t) int32 @@ -43,6 +50,7 @@ func getpid() _pid_t //extern kill func kill(pid _pid_t, sig uint32) int32 +//go:noescape //extern setitimer func setitimer(which int32, new *_itimerval, old *_itimerval) int32 diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go index c454356..fa3b1ce 100644 --- a/libgo/go/runtime/stubs.go +++ b/libgo/go/runtime/stubs.go @@ -60,10 +60,11 @@ func systemstack(fn func()) { if gp == mp.g0 || gp == mp.gsignal { fn() } else if gp == mp.curg { - mcall(func(origg *g) { + fn1 := func(origg *g) { fn() gogo(origg) - }) + } + mcall(*(*func(*g))(noescape(unsafe.Pointer(&fn1)))) } else { badsystemstack() } @@ -160,6 +161,7 @@ func breakpoint() func asminit() {} //go:linkname reflectcall reflect.call +//go:noescape func reflectcall(fntype *functype, fn *funcval, isInterface, isMethod bool, params, results *unsafe.Pointer) func procyield(cycles uint32) @@ -355,7 +357,10 @@ func atomicstorep(ptr unsafe.Pointer, new unsafe.Pointer) { func getSigtramp() uintptr // The sa_handler field is generally hidden in a union, so use C accessors. +//go:noescape func getSigactionHandler(*_sigaction) uintptr + +//go:noescape func setSigactionHandler(*_sigaction, uintptr) // Retrieve fields from the siginfo_t and ucontext_t pointers passed diff --git a/libgo/go/runtime/traceback_gccgo.go b/libgo/go/runtime/traceback_gccgo.go index 79f78d8..8551ec1 100644 --- a/libgo/go/runtime/traceback_gccgo.go +++ b/libgo/go/runtime/traceback_gccgo.go @@ -9,7 +9,7 @@ package runtime import ( "runtime/internal/sys" - _ "unsafe" // for go:linkname + "unsafe" ) func printcreatedby(gp *g) { @@ -46,6 +46,7 @@ type location struct { lineno int } +//go:noescape //extern runtime_callers func c_callers(skip int32, locbuf *location, max int32, keepThunks bool) int32 @@ -185,7 +186,7 @@ func tracebackothers(me *g) { if gp != nil && gp != me { print("\n") goroutineheader(gp) - gp.traceback = &tb + gp.traceback = (*tracebackg)(noescape(unsafe.Pointer(&tb))) getTraceback(me, gp) printtrace(tb.locbuf[:tb.c], nil) printcreatedby(gp) @@ -219,7 +220,7 @@ func tracebackothers(me *g) { print("\tgoroutine in C code; stack unavailable\n") printcreatedby(gp) } else { - gp.traceback = &tb + gp.traceback = (*tracebackg)(noescape(unsafe.Pointer(&tb))) getTraceback(me, gp) printtrace(tb.locbuf[:tb.c], nil) printcreatedby(gp) -- cgit v1.1 From 692aefcd5618a00e622a1c96957d943723040b4c Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Wed, 10 Jan 2018 05:26:29 +0000 Subject: runtime: work around escaping closure in export_test.go When compiling runtime, it is not allowed for local variables and closures to be heap allocated. In one test, there is a go statement with a closure. In the gc compiler, it distinguishes capturing variable by value vs. by address, and rewrites it to passing the captured values as arguments. 
Currently we don't have this, and the escape analysis decides to heap allocate the closure and also the captured variables, which is not allowed. Work around it by passing the variables explicitly. This is in preparation of turning on escape analysis for the runtime. Reviewed-on: https://go-review.googlesource.com/86245 From-SVN: r256419 --- libgo/go/runtime/export_test.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/export_test.go b/libgo/go/runtime/export_test.go index e385f14..5e798e3 100644 --- a/libgo/go/runtime/export_test.go +++ b/libgo/go/runtime/export_test.go @@ -123,15 +123,15 @@ func RunSchedLocalQueueEmptyTest(iters int) { // can lead to underutilization (both runnable Gs and idle Ps coexist // for arbitrary long time). done := make(chan bool, 1) - p := new(p) + _p_ := new(p) gs := make([]g, 2) ready := new(uint32) for i := 0; i < iters; i++ { *ready = 0 next0 := (i & 1) == 0 next1 := (i & 2) == 0 - runqput(p, &gs[0], next0) - go func() { + runqput(_p_, &gs[0], next0) + go func(done chan bool, p *p, ready *uint32, next0, next1 bool) { for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; { } if runqempty(p) { @@ -139,13 +139,13 @@ func RunSchedLocalQueueEmptyTest(iters int) { throw("queue is empty") } done <- true - }() + }(done, _p_, ready, next0, next1) for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; { } - runqput(p, &gs[1], next1) - runqget(p) + runqput(_p_, &gs[1], next1) + runqget(_p_) <-done - runqget(p) + runqget(_p_) } } -- cgit v1.1 From fd0c1dd1678d573052799d106f00ddd5505e5bfa Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Wed, 10 Jan 2018 19:19:02 +0000 Subject: libgo: add platform support for SuperH Reviewed-on: https://go-review.googlesource.com/84555 From-SVN: r256446 --- libgo/go/runtime/hash32.go | 2 +- libgo/go/runtime/lfstack_32bit.go | 2 +- libgo/go/runtime/unaligned2.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/hash32.go b/libgo/go/runtime/hash32.go index 401fe28..22daec5 100644 --- a/libgo/go/runtime/hash32.go +++ b/libgo/go/runtime/hash32.go @@ -6,7 +6,7 @@ // xxhash: https://code.google.com/p/xxhash/ // cityhash: https://code.google.com/p/cityhash/ -// +build 386 arm armbe m68k mips mipsle ppc s390 sparc +// +build 386 arm armbe m68k mips mipsle ppc s390 sh shbe sparc package runtime diff --git a/libgo/go/runtime/lfstack_32bit.go b/libgo/go/runtime/lfstack_32bit.go index ab0edab..bbc421a 100644 --- a/libgo/go/runtime/lfstack_32bit.go +++ b/libgo/go/runtime/lfstack_32bit.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build 386 arm nacl armbe m68k mips mipsle mips64p32 mips64p32le ppc s390 sparc +// +build 386 arm nacl armbe m68k mips mipsle mips64p32 mips64p32le ppc s390 sh shbe sparc package runtime diff --git a/libgo/go/runtime/unaligned2.go b/libgo/go/runtime/unaligned2.go index a33c87a..891459e 100644 --- a/libgo/go/runtime/unaligned2.go +++ b/libgo/go/runtime/unaligned2.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-// +build arm mips mipsle mips64 mips64le armbe m68k sparc alpha ia64 mips64p32 mips64p32le sparc64 +// +build arm mips mipsle mips64 mips64le armbe m68k sparc alpha ia64 mips64p32 mips64p32le sh shbe sparc64 package runtime -- cgit v1.1 From c6d6367f848cfd8381aba41e035c5e7e873667c5 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Wed, 17 Jan 2018 14:20:29 +0000 Subject: libgo: update to Go1.10beta2 release Reviewed-on: https://go-review.googlesource.com/87897 From-SVN: r256794 --- libgo/go/runtime/crash_cgo_test.go | 2 +- libgo/go/runtime/internal/sys/stubs.go | 2 ++ libgo/go/runtime/mbitmap.go | 3 +++ libgo/go/runtime/mstats.go | 2 +- libgo/go/runtime/mwbbuf.go | 16 ++++++++++++++-- libgo/go/runtime/proc.go | 8 +++++++- libgo/go/runtime/testdata/testprogcgo/sigstack.go | 6 +----- libgo/go/runtime/trace/trace_test.go | 8 ++++---- 8 files changed, 33 insertions(+), 14 deletions(-) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/crash_cgo_test.go b/libgo/go/runtime/crash_cgo_test.go index 7e14e57..3532147 100644 --- a/libgo/go/runtime/crash_cgo_test.go +++ b/libgo/go/runtime/crash_cgo_test.go @@ -484,7 +484,7 @@ func TestWindowsStackMemoryCgo(t *testing.T) { func TestSigStackSwapping(t *testing.T) { switch runtime.GOOS { case "plan9", "windows": - t.Skip("no sigaltstack on %s", runtime.GOOS) + t.Skipf("no sigaltstack on %s", runtime.GOOS) } t.Parallel() got := runTestProg(t, "testprogcgo", "SigStack") diff --git a/libgo/go/runtime/internal/sys/stubs.go b/libgo/go/runtime/internal/sys/stubs.go index 0a94502..5328023 100644 --- a/libgo/go/runtime/internal/sys/stubs.go +++ b/libgo/go/runtime/internal/sys/stubs.go @@ -9,3 +9,5 @@ package sys const PtrSize = 4 << (^uintptr(0) >> 63) // unsafe.Sizeof(uintptr(0)) but an ideal const const RegSize = 4 << (^Uintreg(0) >> 63) // unsafe.Sizeof(uintreg(0)) but an ideal const const SpAlign = 1*(1-GoarchArm64) + 16*GoarchArm64 // SP alignment: 1 normally, 16 for ARM64 + +var DefaultGoroot string // set at link time diff --git a/libgo/go/runtime/mbitmap.go b/libgo/go/runtime/mbitmap.go index a775b57..191239a 100644 --- a/libgo/go/runtime/mbitmap.go +++ b/libgo/go/runtime/mbitmap.go @@ -489,6 +489,9 @@ func (h heapBits) forward(n uintptr) heapBits { // The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer. // The result includes in its higher bits the bits for subsequent words // described by the same bitmap byte. +// +// nosplit because it is used during write barriers and must not be preempted. +//go:nosplit func (h heapBits) bits() uint32 { // The (shift & 31) eliminates a test and conditional branch // from the generated code. diff --git a/libgo/go/runtime/mstats.go b/libgo/go/runtime/mstats.go index 22f5195..095a0de 100644 --- a/libgo/go/runtime/mstats.go +++ b/libgo/go/runtime/mstats.go @@ -262,7 +262,7 @@ type MemStats struct { // can only be used for other objects of roughly the same // size. // - // HeapInuse minus HeapAlloc esimates the amount of memory + // HeapInuse minus HeapAlloc estimates the amount of memory // that has been dedicated to particular size classes, but is // not currently being used. This is an upper bound on // fragmentation, but in general this memory can be reused diff --git a/libgo/go/runtime/mwbbuf.go b/libgo/go/runtime/mwbbuf.go index a060df8b..7e88463 100644 --- a/libgo/go/runtime/mwbbuf.go +++ b/libgo/go/runtime/mwbbuf.go @@ -95,6 +95,15 @@ func (b *wbBuf) reset() { } } +// discard resets b's next pointer, but not its end pointer. 
+// +// This must be nosplit because it's called by wbBufFlush. +// +//go:nosplit +func (b *wbBuf) discard() { + b.next = uintptr(unsafe.Pointer(&b.buf[0])) +} + // putFast adds old and new to the write barrier buffer and returns // false if a flush is necessary. Callers should use this as: // @@ -143,10 +152,14 @@ func (b *wbBuf) putFast(old, new uintptr) bool { //go:nowritebarrierrec //go:nosplit func wbBufFlush(dst *uintptr, src uintptr) { + // Note: Every possible return from this function must reset + // the buffer's next pointer to prevent buffer overflow. + if getg().m.dying > 0 { // We're going down. Not much point in write barriers // and this way we can allow write barriers in the // panic path. + getg().m.p.ptr().wbBuf.discard() return } @@ -156,8 +169,7 @@ func wbBufFlush(dst *uintptr, src uintptr) { cgoCheckWriteBarrier(dst, src) if !writeBarrier.needed { // We were only called for cgocheck. - b := &getg().m.p.ptr().wbBuf - b.next = uintptr(unsafe.Pointer(&b.buf[0])) + getg().m.p.ptr().wbBuf.discard() return } } diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go index 515efaa..2972daa0 100644 --- a/libgo/go/runtime/proc.go +++ b/libgo/go/runtime/proc.go @@ -2220,6 +2220,12 @@ stop: return gp, false } + // Before we drop our P, make a snapshot of the allp slice, + // which can change underfoot once we no longer block + // safe-points. We don't need to snapshot the contents because + // everything up to cap(allp) is immutable. + allpSnapshot := allp + // return P and block lock(&sched.lock) if sched.gcwaiting != 0 || _p_.runSafePointFn != 0 { @@ -2259,7 +2265,7 @@ stop: } // check all runqueues once again - for _, _p_ := range allp { + for _, _p_ := range allpSnapshot { if !runqempty(_p_) { lock(&sched.lock) _p_ = pidleget() diff --git a/libgo/go/runtime/testdata/testprogcgo/sigstack.go b/libgo/go/runtime/testdata/testprogcgo/sigstack.go index e30a559..492dfef 100644 --- a/libgo/go/runtime/testdata/testprogcgo/sigstack.go +++ b/libgo/go/runtime/testdata/testprogcgo/sigstack.go @@ -17,15 +17,11 @@ package main #include #include -#ifndef MAP_STACK -#define MAP_STACK 0 -#endif - extern void SigStackCallback(); static void* WithSigStack(void* arg __attribute__((unused))) { // Set up an alternate system stack. - void* base = mmap(0, SIGSTKSZ, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON|MAP_STACK, -1, 0); + void* base = mmap(0, SIGSTKSZ, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); if (base == MAP_FAILED) { perror("mmap failed"); abort(); diff --git a/libgo/go/runtime/trace/trace_test.go b/libgo/go/runtime/trace/trace_test.go index 5fa5b82..997d486 100644 --- a/libgo/go/runtime/trace/trace_test.go +++ b/libgo/go/runtime/trace/trace_test.go @@ -127,20 +127,20 @@ func TestTrace(t *testing.T) { } func parseTrace(t *testing.T, r io.Reader) ([]*trace.Event, map[uint64]*trace.GDesc) { - events, err := trace.Parse(r, "") + res, err := trace.Parse(r, "") if err == trace.ErrTimeOrder { t.Skipf("skipping trace: %v", err) } if err != nil { t.Fatalf("failed to parse trace: %v", err) } - gs := trace.GoroutineStats(events) + gs := trace.GoroutineStats(res.Events) for goid := range gs { // We don't do any particular checks on the result at the moment. // But still check that RelatedGoroutines does not crash, hang, etc. 
- _ = trace.RelatedGoroutines(events, goid) + _ = trace.RelatedGoroutines(res.Events, goid) } - return events, gs + return res.Events, gs } func testBrokenTimestamps(t *testing.T, data []byte) { -- cgit v1.1 From a9411cce01dce8213c4133d2078c0b69e4d40844 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Thu, 18 Jan 2018 04:24:48 +0000 Subject: re PR go/83787 (Many 32-bit Solaris/SPARC Go tests FAIL after Go1.10beta1 update) PR go/83787 compiler: pass int to makechan, call makechan64 when appropriate The update to 1.10beta1 changed makechan to take int instead of int64, and added a makechan64 call for large values. Since the size is the last argument to makechan, the old compiler which always passed a 64-bit int worked fine on 64-bit systems and little-endian 32-bit systems, but broke on big-endian 32-bit systems. This CL fixes the compiler to use the appropriate types. This fixes GCC PR 83787. Reviewed-on: https://go-review.googlesource.com/88077 From-SVN: r256835 --- libgo/go/runtime/chan.go | 1 + 1 file changed, 1 insertion(+) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/chan.go b/libgo/go/runtime/chan.go index 8db728d..bf708ae 100644 --- a/libgo/go/runtime/chan.go +++ b/libgo/go/runtime/chan.go @@ -26,6 +26,7 @@ import ( // themselves, so that the compiler will export them. // //go:linkname makechan runtime.makechan +//go:linkname makechan64 runtime.makechan64 //go:linkname chansend1 runtime.chansend1 //go:linkname chanrecv1 runtime.chanrecv1 //go:linkname chanrecv2 runtime.chanrecv2 -- cgit v1.1 From 549e4febc34ae8ef2ef13f621e7fe69cb73e1aee Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Fri, 19 Jan 2018 04:09:55 +0000 Subject: runtime: add go:noescape declaration for Solaris Patch by Rainer Orth. Reviewed-on: https://go-review.googlesource.com/88376 From-SVN: r256872 --- libgo/go/runtime/netpoll_solaris.go | 1 + 1 file changed, 1 insertion(+) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/netpoll_solaris.go b/libgo/go/runtime/netpoll_solaris.go index e1e7385..a960e93 100644 --- a/libgo/go/runtime/netpoll_solaris.go +++ b/libgo/go/runtime/netpoll_solaris.go @@ -84,6 +84,7 @@ func port_associate(port, source int32, object uintptr, events uint32, user uint //extern port_dissociate func port_dissociate(port, source int32, object uintptr) int32 +//go:noescape //extern port_getn func port_getn(port int32, evs *portevent, max uint32, nget *uint32, timeout *timespec) int32 -- cgit v1.1 From 674dddfe2db0280637f2f71211f9fd2018f555e8 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Fri, 19 Jan 2018 04:48:51 +0000 Subject: runtime: no escape for some functions on AIX Reviewed-on: https://go-review.googlesource.com/88236 From-SVN: r256874 --- libgo/go/runtime/netpoll_aix.go | 3 +++ libgo/go/runtime/stubs2.go | 1 + 2 files changed, 4 insertions(+) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/netpoll_aix.go b/libgo/go/runtime/netpoll_aix.go index b4962cc..da59f93 100644 --- a/libgo/go/runtime/netpoll_aix.go +++ b/libgo/go/runtime/netpoll_aix.go @@ -34,12 +34,15 @@ const _PS_DELETE = 0x2 //extern pollset_create func pollset_create(maxfd int32) pollset_t +//go:noescape //extern pollset_ctl func pollset_ctl(ps pollset_t, pollctl_array *poll_ctl, array_length int32) int32 +//go:noescape //extern pollset_poll func pollset_poll(ps pollset_t, polldata_array *pollfd, array_length int32, timeout int32) int32 +//go:noescape //extern pipe func libc_pipe(fd *int32) int32 diff --git a/libgo/go/runtime/stubs2.go b/libgo/go/runtime/stubs2.go index 
e760772..e305b16 100644 --- a/libgo/go/runtime/stubs2.go +++ b/libgo/go/runtime/stubs2.go @@ -10,6 +10,7 @@ package runtime import "unsafe" +//go:noescape func read(fd int32, p unsafe.Pointer, n int32) int32 func closefd(fd int32) int32 -- cgit v1.1 From 4880b994d63ffb014a5a547239e9cd8402bb014a Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Wed, 24 Jan 2018 23:50:09 +0000 Subject: compiler: rationalize external symbol names Encode all external symbol names using only ASCII alphanumeric characters, underscore, and dot. Use a scheme that can be reliably demangled to a somewhat readable version as described in the long comment in names.cc. A minor cleanup discovered during this was that we were treating function types as different if one had a NULL parameters_ field and another has a non-NULL parameters_ field that has no parameters. This worked because we mangled them slightly differently. We now mangle them the same, so we treat them as equal, as we should anyhow. Reviewed-on: https://go-review.googlesource.com/89555 * go.go-torture/execute/names-1.go: New test. From-SVN: r257033 --- libgo/go/runtime/crash_test.go | 4 ++-- libgo/go/runtime/panic.go | 2 +- libgo/go/runtime/pprof/pprof_test.go | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/crash_test.go b/libgo/go/runtime/crash_test.go index 8ec0348..06200c7 100644 --- a/libgo/go/runtime/crash_test.go +++ b/libgo/go/runtime/crash_test.go @@ -425,7 +425,7 @@ func TestPanicTraceback(t *testing.T) { // Check functions in the traceback. fns := []string{"main.pt1.func1", "panic", "main.pt2.func1", "panic", "main.pt2", "main.pt1"} if runtime.Compiler == "gccgo" { - fns = []string{"main.$nested", "panic", "main.$nested", "panic", "main.pt2", "main.pt1"} + fns = []string{"main.pt1..func1", "panic", "main.pt2..func1", "panic", "main.pt2", "main.pt1"} } for _, fn := range fns { var re *regexp.Regexp @@ -570,7 +570,7 @@ func TestPanicInlined(t *testing.T) { buf = buf[:n] want := []byte("(*point).negate(") if runtime.Compiler == "gccgo" { - want = []byte("negate.pN18_runtime_test.point") + want = []byte("point.negate") } if !bytes.Contains(buf, want) { t.Logf("%s", buf) diff --git a/libgo/go/runtime/panic.go b/libgo/go/runtime/panic.go index b2deb6e..24d78ec 100644 --- a/libgo/go/runtime/panic.go +++ b/libgo/go/runtime/panic.go @@ -656,7 +656,7 @@ func canrecover(retaddr uintptr) bool { } // Ignore other functions in the reflect package. - if hasprefix(name, "reflect.") { + if hasprefix(name, "reflect.") || hasprefix(name, ".1reflect.") { continue } diff --git a/libgo/go/runtime/pprof/pprof_test.go b/libgo/go/runtime/pprof/pprof_test.go index 08a4f96..ae6ec6d 100644 --- a/libgo/go/runtime/pprof/pprof_test.go +++ b/libgo/go/runtime/pprof/pprof_test.go @@ -730,7 +730,7 @@ func TestMutexProfile(t *testing.T) { stks := stacks(p) for _, want := range [][]string{ // {"sync.(*Mutex).Unlock", "pprof.blockMutex.func1"}, - {"sync.Unlock.pN10_sync.Mutex", "pprof.$nested17"}, + {".1sync.Mutex.Unlock", "pprof.blockMutex..func1"}, } { if !containsStack(stks, want) { t.Errorf("No matching stack entry for %+v", want) -- cgit v1.1 From a88d1f8bb2015c752fc783e168e25e5dab99a8b9 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Thu, 25 Jan 2018 17:44:19 +0000 Subject: runtime: fix lfstackUnpack on ia64 The top three region number bits must be masked out before right-shifting the address bits into place, otherwise they will be copied down into the lower always-zero address bits. 
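To make the bit manipulation concrete, here is a minimal standalone sketch of the old and new unpack expressions (not part of the patch): cntBits is a hypothetical stand-in for the real ia64CntBits constant in lfstack_64bit.go, and the sample value is arbitrary, so this only illustrates how the two expressions diverge on a system with a 64-bit uintptr.

    package main

    import "fmt"

    const cntBits = 19 // hypothetical stand-in for ia64CntBits

    // unpackOld mirrors the old expression: it shifts before masking, so the
    // top three region-number bits slide down into the address bits.
    func unpackOld(val uint64) uintptr {
            return uintptr((val>>cntBits<<3)&(1<<(64-3)-1) | val&^(1<<(64-3)-1))
    }

    // unpackNew mirrors the fixed expression: it masks the region bits off
    // first, then shifts the remaining address bits into place.
    func unpackNew(val uint64) uintptr {
            return uintptr(((val&(1<<(64-3)-1))>>cntBits<<3) | val&^(1<<(64-3)-1))
    }

    func main() {
            val := uint64(7)<<61 | uint64(0x123456)<<cntBits // region 7, some address bits, zero count
            fmt.Printf("old: %#x\nnew: %#x\n", unpackOld(val), unpackNew(val))
    }

With the old expression the region number reappears in the middle of the reconstructed address; the new expression returns only the packed address plus the region bits in their original position, as the diff below shows.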
Reviewed-on: https://go-review.googlesource.com/84535 From-SVN: r257061 --- libgo/go/runtime/lfstack_64bit.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/lfstack_64bit.go b/libgo/go/runtime/lfstack_64bit.go index 44cbf74..95d0eba 100644 --- a/libgo/go/runtime/lfstack_64bit.go +++ b/libgo/go/runtime/lfstack_64bit.go @@ -78,7 +78,7 @@ func lfstackUnpack(val uint64) *lfnode { return (*lfnode)(unsafe.Pointer(uintptr(int64(val) >> sparcLinuxCntBits << 3))) } if GOARCH == "ia64" { - return (*lfnode)(unsafe.Pointer(uintptr((val>>ia64CntBits<<3)&(1<<(64-3)-1) | val&^(1<<(64-3)-1)))) + return (*lfnode)(unsafe.Pointer(uintptr(((val & (1<<(64-3) - 1)) >> ia64CntBits << 3) | val&^(1<<(64-3)-1)))) } if GOARCH == "ppc64" && GOOS == "aix" { if val&(1<<63) != 0 { -- cgit v1.1 From a14e122ae2ecf27a8cace7e0d1dda8606d5e4eca Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Thu, 25 Jan 2018 23:10:35 +0000 Subject: compiler: deref receiver types in mangled names This was the original intent, as reflected in the long comment at the start of names.cc, but I forgot to implement it. Also, remove a leading ".0" from the final name. That could occur for a method whose receiver type starts with 'u', as in that case we prepend a space to the mangled name, to avoid confusion with the Unicode mangling, and the space turns into ".0". Also, if the Unicode encoding would cause the final to start with "..u" or "..U", add a leading underscore. Patch gotest to not get fooled by some names. The result of these changes is that all symbols start with a letter or an underscore. Reviewed-on: https://go-review.googlesource.com/90015 From-SVN: r257068 --- libgo/go/runtime/pprof/pprof_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/pprof/pprof_test.go b/libgo/go/runtime/pprof/pprof_test.go index ae6ec6d..07a7946 100644 --- a/libgo/go/runtime/pprof/pprof_test.go +++ b/libgo/go/runtime/pprof/pprof_test.go @@ -730,7 +730,7 @@ func TestMutexProfile(t *testing.T) { stks := stacks(p) for _, want := range [][]string{ // {"sync.(*Mutex).Unlock", "pprof.blockMutex.func1"}, - {".1sync.Mutex.Unlock", "pprof.blockMutex..func1"}, + {"sync.Mutex.Unlock", "pprof.blockMutex..func1"}, } { if !containsStack(stks, want) { t.Errorf("No matching stack entry for %+v", want) -- cgit v1.1 From d779dffc4b6ec69cb51b426f779eca4bc37fd063 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Sat, 27 Jan 2018 23:44:29 +0000 Subject: libgo: update to Go1.10rc1 Reviewed-on: https://go-review.googlesource.com/90295 From-SVN: r257126 --- libgo/go/runtime/crash_test.go | 17 +++++++++++++++++ libgo/go/runtime/error.go | 3 +-- libgo/go/runtime/panic.go | 22 ++++++++++++++-------- libgo/go/runtime/pprof/pprof.go | 25 ++++++++++++++++++++----- libgo/go/runtime/rwmutex.go | 2 +- libgo/go/runtime/signal_sighandler.go | 5 +++++ libgo/go/runtime/signal_unix.go | 6 ++++++ libgo/go/runtime/time.go | 1 + 8 files changed, 65 insertions(+), 16 deletions(-) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/crash_test.go b/libgo/go/runtime/crash_test.go index 06200c7..7a099be 100644 --- a/libgo/go/runtime/crash_test.go +++ b/libgo/go/runtime/crash_test.go @@ -632,3 +632,20 @@ retry: } t.Errorf("test ran %d times without producing expected output", tries) } + +func TestBadTraceback(t *testing.T) { + if runtime.Compiler == "gccgo" { + t.Skip("gccgo does not do a hex dump") + } + output := runTestProg(t, "testprog", 
"BadTraceback") + for _, want := range []string{ + "runtime: unexpected return pc", + "called from 0xbad", + "00000bad", // Smashed LR in hex dump + " 0. @@ -841,14 +844,14 @@ var paniclk mutex //go:yeswritebarrierrec func startpanic() { _g_ := getg() - // Uncomment when mheap_ is in Go. - // if mheap_.cachealloc.size == 0 { // very early - // print("runtime: panic before malloc heap initialized\n") - // _g_.m.mallocing = 1 // tell rest of panic not to try to malloc - // } else - if _g_.m.mcache == nil { // can happen if called from signal handler or throw - _g_.m.mcache = allocmcache() + if mheap_.cachealloc.size == 0 { // very early + print("runtime: panic before malloc heap initialized\n") } + // Disallow malloc during an unrecoverable panic. A panic + // could happen in a signal handler, or in a throw, or inside + // malloc itself. We want to catch if an allocation ever does + // happen (even if we're not in one of these situations). + _g_.m.mallocing++ switch _g_.m.dying { case 0: @@ -934,6 +937,9 @@ func dopanic(unused int) { exit(2) } +// canpanic returns false if a signal should throw instead of +// panicking. +// //go:nosplit func canpanic(gp *g) bool { // Note that g is m->gsignal, different from gp. diff --git a/libgo/go/runtime/pprof/pprof.go b/libgo/go/runtime/pprof/pprof.go index 8a562e2..be4e869 100644 --- a/libgo/go/runtime/pprof/pprof.go +++ b/libgo/go/runtime/pprof/pprof.go @@ -350,7 +350,8 @@ type countProfile interface { // as the pprof-proto format output. Translations from cycle count to time duration // are done because The proto expects count and time (nanoseconds) instead of count // and the number of cycles for block, contention profiles. -func printCountCycleProfile(w io.Writer, countName, cycleName string, records []runtime.BlockProfileRecord) error { +// Possible 'scaler' functions are scaleBlockProfile and scaleMutexProfile. +func printCountCycleProfile(w io.Writer, countName, cycleName string, scaler func(int64, float64) (int64, float64), records []runtime.BlockProfileRecord) error { // Output profile in protobuf form. b := newProfileBuilder(w) b.pbValueType(tagProfile_PeriodType, countName, "count") @@ -363,8 +364,9 @@ func printCountCycleProfile(w io.Writer, countName, cycleName string, records [] values := []int64{0, 0} var locs []uint64 for _, r := range records { - values[0] = int64(r.Count) - values[1] = int64(float64(r.Cycles) / cpuGHz) // to nanoseconds + count, nanosec := scaler(r.Count, float64(r.Cycles)/cpuGHz) + values[0] = count + values[1] = int64(nanosec) locs = locs[:0] for _, addr := range r.Stack() { // For count profiles, all stack addresses are @@ -820,7 +822,7 @@ func writeBlock(w io.Writer, debug int) error { sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles }) if debug <= 0 { - return printCountCycleProfile(w, "contentions", "delay", p) + return printCountCycleProfile(w, "contentions", "delay", scaleBlockProfile, p) } b := bufio.NewWriter(w) @@ -847,6 +849,14 @@ func writeBlock(w io.Writer, debug int) error { return b.Flush() } +func scaleBlockProfile(cnt int64, ns float64) (int64, float64) { + // Do nothing. + // The current way of block profile sampling makes it + // hard to compute the unsampled number. The legacy block + // profile parse doesn't attempt to scale or unsample. + return cnt, ns +} + // writeMutex writes the current mutex profile to w. func writeMutex(w io.Writer, debug int) error { // TODO(pjw): too much common code with writeBlock. FIX! 
@@ -864,7 +874,7 @@ func writeMutex(w io.Writer, debug int) error { sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles }) if debug <= 0 { - return printCountCycleProfile(w, "contentions", "delay", p) + return printCountCycleProfile(w, "contentions", "delay", scaleMutexProfile, p) } b := bufio.NewWriter(w) @@ -892,4 +902,9 @@ func writeMutex(w io.Writer, debug int) error { return b.Flush() } +func scaleMutexProfile(cnt int64, ns float64) (int64, float64) { + period := runtime.SetMutexProfileFraction(-1) + return cnt * int64(period), ns * float64(period) +} + func runtime_cyclesPerSecond() int64 diff --git a/libgo/go/runtime/rwmutex.go b/libgo/go/runtime/rwmutex.go index 7eeb559..a6da4c9 100644 --- a/libgo/go/runtime/rwmutex.go +++ b/libgo/go/runtime/rwmutex.go @@ -10,7 +10,7 @@ import ( // This is a copy of sync/rwmutex.go rewritten to work in the runtime. -// An rwmutex is a reader/writer mutual exclusion lock. +// A rwmutex is a reader/writer mutual exclusion lock. // The lock can be held by an arbitrary number of readers or a single writer. // This is a variant of sync.RWMutex, for the runtime package. // Like mutex, rwmutex blocks the calling M. diff --git a/libgo/go/runtime/signal_sighandler.go b/libgo/go/runtime/signal_sighandler.go index c042162..698629d 100644 --- a/libgo/go/runtime/signal_sighandler.go +++ b/libgo/go/runtime/signal_sighandler.go @@ -40,6 +40,11 @@ func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) { if sig < uint32(len(sigtable)) { flags = sigtable[sig].flags } + if flags&_SigPanic != 0 && gp.throwsplit { + // We can't safely sigpanic because it may grow the + // stack. Abort in the signal handler instead. + flags = (flags &^ _SigPanic) | _SigThrow + } if c.sigcode() != _SI_USER && flags&_SigPanic != 0 { // Emulate gc by passing arguments out of band, // although we don't really have to. diff --git a/libgo/go/runtime/signal_unix.go b/libgo/go/runtime/signal_unix.go index 8517148..02d5348 100644 --- a/libgo/go/runtime/signal_unix.go +++ b/libgo/go/runtime/signal_unix.go @@ -320,6 +320,12 @@ func sigtrampgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) { // the signal handler. The effect is that the program will act as // though the function that got the signal simply called sigpanic // instead. +// +// This must NOT be nosplit because the linker doesn't know where +// sigpanic calls can be injected. +// +// The signal handler must not inject a call to sigpanic if +// getg().throwsplit, since sigpanic may need to grow the stack. 
func sigpanic() { g := getg() if !canpanic(g) { diff --git a/libgo/go/runtime/time.go b/libgo/go/runtime/time.go index 93181fd..b707590 100644 --- a/libgo/go/runtime/time.go +++ b/libgo/go/runtime/time.go @@ -59,6 +59,7 @@ func (t *timer) assignBucket() *timersBucket { return t.tb } +//go:notinheap type timersBucket struct { lock mutex gp *g -- cgit v1.1 From f6acbd0805c24bb68ca2c88be26e770995cb4fa2 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Sat, 27 Jan 2018 23:45:46 +0000 Subject: libgo: update to Go1.10rc1 Reviewed-on: https://go-review.googlesource.com/90295 From-SVN: r257127 --- libgo/go/runtime/testdata/testprog/badtraceback.go | 47 ++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 libgo/go/runtime/testdata/testprog/badtraceback.go (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/testdata/testprog/badtraceback.go b/libgo/go/runtime/testdata/testprog/badtraceback.go new file mode 100644 index 0000000..d558adc --- /dev/null +++ b/libgo/go/runtime/testdata/testprog/badtraceback.go @@ -0,0 +1,47 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "runtime" + "runtime/debug" + "unsafe" +) + +func init() { + register("BadTraceback", BadTraceback) +} + +func BadTraceback() { + // Disable GC to prevent traceback at unexpected time. + debug.SetGCPercent(-1) + + // Run badLR1 on its own stack to minimize the stack size and + // exercise the stack bounds logic in the hex dump. + go badLR1() + select {} +} + +//go:noinline +func badLR1() { + // We need two frames on LR machines because we'll smash this + // frame's saved LR. + badLR2(0) +} + +//go:noinline +func badLR2(arg int) { + // Smash the return PC or saved LR. + lrOff := unsafe.Sizeof(uintptr(0)) + if runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" { + lrOff = 32 // FIXED_FRAME or sys.MinFrameSize + } + lrPtr := (*uintptr)(unsafe.Pointer(uintptr(unsafe.Pointer(&arg)) - lrOff)) + *lrPtr = 0xbad + + // Print a backtrace. This should include diagnostics for the + // bad return PC and a hex dump. + panic("backtrace") +} -- cgit v1.1 From 38f08ec0bd9bd39c578c2e9283a9aa00dc6bda15 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Fri, 2 Feb 2018 00:16:43 +0000 Subject: runtime: scan register backing store on ia64 On ia64, a separate stack is used for saving/restoring register frames, occupying the other end of the stack mapping. This must also be scanned for pointers into the heap. Reviewed-on: https://go-review.googlesource.com/85276 From-SVN: r257323 --- libgo/go/runtime/runtime2.go | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/runtime2.go b/libgo/go/runtime/runtime2.go index 543086d..0299d5a 100644 --- a/libgo/go/runtime/runtime2.go +++ b/libgo/go/runtime/runtime2.go @@ -409,11 +409,15 @@ type g struct { // gcnextsegment: unused // gcnextsp: current SP while executing a syscall // gcinitialsp: g0: top of stack; others: start of stack memory + // gcnextsp2: current secondary stack pointer (if present) + // gcinitialsp2: start of secondary stack (if present) gcstack uintptr gcstacksize uintptr gcnextsegment uintptr gcnextsp uintptr gcinitialsp unsafe.Pointer + gcnextsp2 uintptr + gcinitialsp2 unsafe.Pointer // gcregs holds the register values while executing a syscall. // This is set by getcontext and scanned by the garbage collector. 
-- cgit v1.1 From c88893a0da8fd01f20b4e254f706f054a72fc435 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Tue, 6 Feb 2018 15:18:50 +0000 Subject: runtime: correct runtime structfield type to match reflect The offset field in structfield has changed to offsetAnon, and now requires a shift to get the actual offset value. Fixes golang/go#23391 Reviewed-on: https://go-review.googlesource.com/92275 From-SVN: r257413 --- libgo/go/runtime/cgocall.go | 2 +- libgo/go/runtime/type.go | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/cgocall.go b/libgo/go/runtime/cgocall.go index 9d16120..d5794bc 100644 --- a/libgo/go/runtime/cgocall.go +++ b/libgo/go/runtime/cgocall.go @@ -189,7 +189,7 @@ func cgoCheckArg(t *_type, p unsafe.Pointer, indir, top bool, msg string) { return } for _, f := range st.fields { - cgoCheckArg(f.typ, add(p, f.offset), true, top, msg) + cgoCheckArg(f.typ, add(p, f.offset()), true, top, msg) } case kindPtr, kindUnsafePointer: if indir { diff --git a/libgo/go/runtime/type.go b/libgo/go/runtime/type.go index 6788f24..0ec0da4 100644 --- a/libgo/go/runtime/type.go +++ b/libgo/go/runtime/type.go @@ -113,11 +113,19 @@ type ptrtype struct { } type structfield struct { - name *string // nil for embedded fields - pkgPath *string // nil for exported Names; otherwise import path - typ *_type // type of field - tag *string // nil if no tag - offset uintptr // byte offset of field within struct + name *string // nil for embedded fields + pkgPath *string // nil for exported Names; otherwise import path + typ *_type // type of field + tag *string // nil if no tag + offsetAnon uintptr // byte offset of field<<1 | isAnonymous +} + +func (f *structfield) offset() uintptr { + return f.offsetAnon >> 1 +} + +func (f *structfield) anon() bool { + return f.offsetAnon&1 != 0 } type structtype struct { -- cgit v1.1 From f1a2d8b1b51ee77499ccf8ed3b4b49ca5307ddb4 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Wed, 7 Feb 2018 22:04:55 +0000 Subject: runtime: don't call funcPC from a function The escape analysis support is not yet good enough to avoid escaping the argument to funcPC. This causes unnecessary and often harmful memory allocation. E.g., (*cpuProfile).addExtra can be called from a signal handler, and it must not allocate memory. Move the calls to funcPC to use variables instead. This was done in the original migration to using funcPC, but was not done for newer code. In one case, in signal handling code, use getSigtramp. 
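The shape of the workaround, as a self-contained sketch: the PC is computed once during package initialization, so code that may run inside a signal handler only reads a variable. Here reflect.ValueOf(...).Pointer() is used purely as a stand-in for the runtime's internal funcPC, which cannot be reused outside the runtime.

    package main

    import (
            "fmt"
            "reflect"
    )

    func externalCode() {}

    // Evaluated once at package init, like the _ExternalCodePC and
    // _LostExternalCodePC variables in the diff below.
    var externalCodePC = reflect.ValueOf(externalCode).Pointer()

    // record stands in for the signal-time code: it only loads the
    // precomputed variable and never passes a func value to a PC helper.
    func record(pc uintptr) [2]uintptr {
            return [2]uintptr{pc, externalCodePC}
    }

    func main() {
            fmt.Printf("%#x\n", record(0x42)[1])
    }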
Reviewed-on: https://go-review.googlesource.com/92735 From-SVN: r257463 --- libgo/go/runtime/cpuprof.go | 8 ++++---- libgo/go/runtime/proc.go | 4 +++- libgo/go/runtime/signal_unix.go | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/cpuprof.go b/libgo/go/runtime/cpuprof.go index 91cdf2b..b1a7c3b 100644 --- a/libgo/go/runtime/cpuprof.go +++ b/libgo/go/runtime/cpuprof.go @@ -156,8 +156,8 @@ func (p *cpuProfile) addExtra() { if p.lostExtra > 0 { hdr := [1]uint64{p.lostExtra} lostStk := [2]uintptr{ - funcPC(_LostExternalCode) + sys.PCQuantum, - funcPC(_ExternalCode) + sys.PCQuantum, + _LostExternalCodePC + sys.PCQuantum, + _ExternalCodePC + sys.PCQuantum, } cpuprof.log.write(nil, 0, hdr[:], lostStk[:]) p.lostExtra = 0 @@ -167,8 +167,8 @@ func (p *cpuProfile) addExtra() { func (p *cpuProfile) addLostAtomic64(count uint64) { hdr := [1]uint64{count} lostStk := [2]uintptr{ - funcPC(_LostSIGPROFDuringAtomic64) + sys.PCQuantum, - funcPC(_System) + sys.PCQuantum, + _LostSIGPROFDuringAtomic64PC + sys.PCQuantum, + _SystemPC + sys.PCQuantum, } cpuprof.log.write(nil, 0, hdr[:], lostStk[:]) } diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go index 2972daa0..edf4140 100644 --- a/libgo/go/runtime/proc.go +++ b/libgo/go/runtime/proc.go @@ -3369,7 +3369,9 @@ var lostAtomic64Count uint64 var _SystemPC = funcPC(_System) var _ExternalCodePC = funcPC(_ExternalCode) +var _LostExternalCodePC = funcPC(_LostExternalCode) var _GCPC = funcPC(_GC) +var _LostSIGPROFDuringAtomic64PC = funcPC(_LostSIGPROFDuringAtomic64) // Called if we receive a SIGPROF signal. // Called by the signal handler, may run during STW. @@ -3469,7 +3471,7 @@ func sigprofNonGoPC(pc uintptr) { if prof.hz != 0 { stk := []uintptr{ pc, - funcPC(_ExternalCode) + sys.PCQuantum, + _ExternalCodePC + sys.PCQuantum, } cpuprof.addNonGo(stk) } diff --git a/libgo/go/runtime/signal_unix.go b/libgo/go/runtime/signal_unix.go index 02d5348..a8f77fa 100644 --- a/libgo/go/runtime/signal_unix.go +++ b/libgo/go/runtime/signal_unix.go @@ -245,7 +245,7 @@ func setProcessCPUProfiler(hz int32) { // Enable the Go signal handler if not enabled. if atomic.Cas(&handlingSig[_SIGPROF], 0, 1) { atomic.Storeuintptr(&fwdSig[_SIGPROF], getsig(_SIGPROF)) - setsig(_SIGPROF, funcPC(sighandler)) + setsig(_SIGPROF, getSigtramp()) } } else { // If the Go signal handler should be disabled by default, -- cgit v1.1 From 9adab5dd169afd191efd1dcbf50dae0f726a0a42 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Thu, 8 Feb 2018 15:30:22 +0000 Subject: libgo: update to Go1.10rc2 Reviewed-on: https://go-review.googlesource.com/92736 From-SVN: r257493 --- libgo/go/runtime/crash_cgo_test.go | 21 +++++++++++++++++ libgo/go/runtime/testdata/testprogcgo/sigpanic.go | 28 +++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 libgo/go/runtime/testdata/testprogcgo/sigpanic.go (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/crash_cgo_test.go b/libgo/go/runtime/crash_cgo_test.go index 3532147..6688b3c 100644 --- a/libgo/go/runtime/crash_cgo_test.go +++ b/libgo/go/runtime/crash_cgo_test.go @@ -493,3 +493,24 @@ func TestSigStackSwapping(t *testing.T) { t.Errorf("expected %q got %v", want, got) } } + +func TestCgoTracebackSigpanic(t *testing.T) { + // Test unwinding over a sigpanic in C code without a C + // symbolizer. See issue #23576. 
+ if runtime.GOOS == "windows" { + // On Windows if we get an exception in C code, we let + // the Windows exception handler unwind it, rather + // than injecting a sigpanic. + t.Skip("no sigpanic in C on windows") + } + t.Parallel() + got := runTestProg(t, "testprogcgo", "TracebackSigpanic") + want := "runtime.sigpanic" + if !strings.Contains(got, want) { + t.Fatalf("want failure containing %q. output:\n%s\n", want, got) + } + nowant := "unexpected return pc" + if strings.Contains(got, nowant) { + t.Fatalf("failure incorrectly contains %q. output:\n%s\n", nowant, got) + } +} diff --git a/libgo/go/runtime/testdata/testprogcgo/sigpanic.go b/libgo/go/runtime/testdata/testprogcgo/sigpanic.go new file mode 100644 index 0000000..cb46030 --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/sigpanic.go @@ -0,0 +1,28 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +// This program will crash. +// We want to test unwinding from sigpanic into C code (without a C symbolizer). + +/* +#cgo CFLAGS: -O0 + +char *pnil; + +static int f1(void) { + *pnil = 0; + return 0; +} +*/ +import "C" + +func init() { + register("TracebackSigpanic", TracebackSigpanic) +} + +func TracebackSigpanic() { + C.f1() +} -- cgit v1.1 From 74e6f14adb7057b29d361cc35c76f16663d1e649 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Thu, 8 Feb 2018 15:37:43 +0000 Subject: runtime: get missing function name from symbol table If we trace back through code that has no debug info, as when calling through C code compiled with -g0, we won't have a function name. Try to fetch the function name using the symbol table. Adding the test case revealed that gotest failed to use the gccgo tag when matching files, so add that. Reviewed-on: https://go-review.googlesource.com/92756 From-SVN: r257495 --- libgo/go/runtime/crash_gccgo_test.go | 59 ++++++++++++++++++++++ .../testdata/testprogcgo/traceback_gccgo.go | 40 +++++++++++++++ libgo/go/runtime/testdata/testprogcxx/main.go | 35 +++++++++++++ libgo/go/runtime/testdata/testprogcxx/traceback.cc | 19 +++++++ libgo/go/runtime/testdata/testprogcxx/traceback.go | 24 +++++++++ 5 files changed, 177 insertions(+) create mode 100644 libgo/go/runtime/crash_gccgo_test.go create mode 100644 libgo/go/runtime/testdata/testprogcgo/traceback_gccgo.go create mode 100644 libgo/go/runtime/testdata/testprogcxx/main.go create mode 100644 libgo/go/runtime/testdata/testprogcxx/traceback.cc create mode 100644 libgo/go/runtime/testdata/testprogcxx/traceback.go (limited to 'libgo/go/runtime') diff --git a/libgo/go/runtime/crash_gccgo_test.go b/libgo/go/runtime/crash_gccgo_test.go new file mode 100644 index 0000000..c216e54 --- /dev/null +++ b/libgo/go/runtime/crash_gccgo_test.go @@ -0,0 +1,59 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build cgo,gccgo + +package runtime_test + +import ( + "bytes" + "fmt" + "internal/testenv" + "os" + "os/exec" + "strings" + "testing" +) + +func TestGccgoCrashTraceback(t *testing.T) { + t.Parallel() + got := runTestProg(t, "testprogcgo", "CrashTracebackGccgo") + ok := true + for i := 1; i <= 3; i++ { + if !strings.Contains(got, fmt.Sprintf("CFunction%d", i)) { + t.Errorf("missing C function CFunction%d", i) + ok = false + } + } + if !ok { + t.Log(got) + } +} + +func TestGccgoCrashTracebackNodebug(t *testing.T) { + testenv.MustHaveGoBuild(t) + if os.Getenv("CC") == "" { + t.Skip("no compiler in environment") + } + + cc := strings.Fields(os.Getenv("CC")) + cc = append(cc, "-x", "c++", "-") + out, _ := exec.Command(cc[0], cc[1:]...).CombinedOutput() + if bytes.Contains(out, []byte("error trying to exec 'cc1plus'")) { + t.Skip("no C++ compiler") + } + os.Setenv("CXX", os.Getenv("CC")) + + got := runTestProg(t, "testprogcxx", "CrashTracebackNodebug") + ok := true + for i := 1; i <= 3; i++ { + if !strings.Contains(got, fmt.Sprintf("cxxFunction%d", i)) { + t.Errorf("missing C++ function cxxFunction%d", i) + ok = false + } + } + if !ok { + t.Log(got) + } +} diff --git a/libgo/go/runtime/testdata/testprogcgo/traceback_gccgo.go b/libgo/go/runtime/testdata/testprogcgo/traceback_gccgo.go new file mode 100644 index 0000000..83357fd --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/traceback_gccgo.go @@ -0,0 +1,40 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build gccgo + +package main + +// This program will crash. +// We want the stack trace to include the C functions. + +/* +#cgo CFLAGS: -g -O0 + +#include + +char *p; + +static int CFunction3(void) { + *p = 0; + return 0; +} + +static int CFunction2(void) { + return CFunction3(); +} + +static int CFunction1(void) { + return CFunction2(); +} +*/ +import "C" + +func init() { + register("CrashTracebackGccgo", CrashTracebackGccgo) +} + +func CrashTracebackGccgo() { + C.CFunction1() +} diff --git a/libgo/go/runtime/testdata/testprogcxx/main.go b/libgo/go/runtime/testdata/testprogcxx/main.go new file mode 100644 index 0000000..0ecab10 --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcxx/main.go @@ -0,0 +1,35 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "os" + +var cmds = map[string]func(){} + +func register(name string, f func()) { + if cmds[name] != nil { + panic("duplicate registration: " + name) + } + cmds[name] = f +} + +func registerInit(name string, f func()) { + if len(os.Args) >= 2 && os.Args[1] == name { + f() + } +} + +func main() { + if len(os.Args) < 2 { + println("usage: " + os.Args[0] + " name-of-test") + return + } + f := cmds[os.Args[1]] + if f == nil { + println("unknown function: " + os.Args[1]) + return + } + f() +} diff --git a/libgo/go/runtime/testdata/testprogcxx/traceback.cc b/libgo/go/runtime/testdata/testprogcxx/traceback.cc new file mode 100644 index 0000000..d4bd26c --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcxx/traceback.cc @@ -0,0 +1,19 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +char *p; + +static int cxxFunction3() { + *p = 0; + return 0; +} + +static int cxxFunction2() { + return cxxFunction3(); +} + +extern "C" +int cxxFunction1() { + return cxxFunction2(); +} diff --git a/libgo/go/runtime/testdata/testprogcxx/traceback.go b/libgo/go/runtime/testdata/testprogcxx/traceback.go new file mode 100644 index 0000000..0e40ca8 --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcxx/traceback.go @@ -0,0 +1,24 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +// This program will crash. +// We want the stack trace to include the C++ functions, +// even though we compile with -g0. + +/* +#cgo CXXFLAGS: -g0 -O0 + +extern int cxxFunction1(void); +*/ +import "C" + +func init() { + register("CrashTracebackNodebug", CrashTracebackNodebug) +} + +func CrashTracebackNodebug() { + C.cxxFunction1() +} -- cgit v1.1
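For readers who want to see the symbol-table fallback idea from the last change outside the runtime, here is a rough user-space analogue. The runtime itself resolves names through libbacktrace rather than debug/elf, and this sketch ignores the load bias of PIE binaries and shared objects, so treat it only as an illustration of "when there is no debug info, take the name from the ELF symbol table".

    package main

    import (
            "debug/elf"
            "fmt"
            "os"
            "strconv"
    )

    // symbolFor returns the name of the function symbol covering pc, if any.
    func symbolFor(path string, pc uint64) (string, error) {
            f, err := elf.Open(path)
            if err != nil {
                    return "", err
            }
            defer f.Close()
            syms, err := f.Symbols()
            if err != nil {
                    return "", err // e.g. a stripped binary has no symbol table
            }
            for _, s := range syms {
                    if elf.ST_TYPE(s.Info) == elf.STT_FUNC && pc >= s.Value && pc < s.Value+s.Size {
                            return s.Name, nil
                    }
            }
            return "", fmt.Errorf("no symbol covers %#x", pc)
    }

    func main() {
            // Hypothetical usage: symlookup ./a.out 0x401000
            pc, err := strconv.ParseUint(os.Args[2], 0, 64)
            if err != nil {
                    fmt.Fprintln(os.Stderr, err)
                    os.Exit(1)
            }
            name, err := symbolFor(os.Args[1], pc)
            if err != nil {
                    fmt.Fprintln(os.Stderr, err)
                    os.Exit(1)
            }
            fmt.Println(name)
    }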